In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw autism screening data and preview the first five rows.
autism_df = pd.read_csv(filepath_or_buffer='autism_screening.csv')
autism_df.head(n=5)

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,1,1,0,0,1,1,0,0,...,f,White-European,no,no,United States,no,6.0,18 and more,Self,NO
1,1,1,0,1,0,0,0,1,0,1,...,m,Latino,no,yes,Brazil,no,5.0,18 and more,Self,NO
2,1,1,0,1,1,0,1,1,1,1,...,m,Latino,yes,yes,Spain,no,8.0,18 and more,Parent,YES
3,1,1,0,1,0,0,1,1,0,1,...,f,White-European,no,yes,United States,no,6.0,18 and more,Self,NO
4,1,0,0,0,0,0,0,1,0,0,...,f,?,no,no,Egypt,no,2.0,18 and more,?,NO


In [None]:
# Show the column names exactly as they were loaded from the CSV.
autism_df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
       'ethnicity', 'jundice', 'austim', 'contry_of_res', 'used_app_before',
       'result', 'age_desc', 'relation', 'Class/ASD'],
      dtype='object')

In [None]:
# Fix the misspelled column names.
# 'austim' describes a history of autism in the family, so it is renamed to
# the more descriptive 'autism_in_family' rather than just spell-corrected.
column_name_fixes = {
    'jundice': 'jaundice',
    'austim': 'autism_in_family',
    'contry_of_res': 'country_of_res',
}
autism_df.rename(columns=column_name_fixes, inplace=True)
autism_df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
       'ethnicity', 'jaundice', 'autism_in_family', 'country_of_res',
       'used_app_before', 'result', 'age_desc', 'relation', 'Class/ASD'],
      dtype='object')

In [None]:
# Drop the 'age_desc' column from the dataset, then list the remaining columns.
autism_df.drop('age_desc', axis=1, inplace=True)
autism_df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
       'ethnicity', 'jaundice', 'autism_in_family', 'country_of_res',
       'used_app_before', 'result', 'relation', 'Class/ASD'],
      dtype='object')

In [None]:
# Count the missing (NaN) entries in each column.
autism_df.isna().sum()

A1_Score            0
A2_Score            0
A3_Score            0
A4_Score            0
A5_Score            0
A6_Score            0
A7_Score            0
A8_Score            0
A9_Score            0
A10_Score           0
age                 2
gender              0
ethnicity           0
jaundice            0
autism_in_family    0
country_of_res      0
used_app_before     0
result              0
relation            0
Class/ASD           0
dtype: int64

In [None]:
# 'age' has 2 missing values; look at its range before deciding how to fill them.
age_column = autism_df['age']
for label, value in (('Max Age: ', age_column.max()), ('Min Age: ', age_column.min())):
    print(label, value)

Max Age:  383.0
Min Age:  17.0


In [None]:
# The maximum age (383) is an extreme outlier that should be dropped.
# Print the row label(s) that carry it.
is_age_outlier = autism_df['age'].eq(383.0)
print(autism_df.loc[is_age_outlier].index)

Int64Index([52], dtype='int64')


In [None]:
# Drop the record with the implausible age of 383 (row label 52 in the raw file).
# The row is selected by its value rather than by a hard-coded label, so the cell
# cannot delete an unrelated record if the labels differ (re-ordered input, or the
# index having already been reset).
outlier_labels = autism_df[autism_df['age'] == 383.0].index
autism_df.drop(index=outlier_labels, inplace=True)

# Reset the index so the row labels are contiguous again.
# Without drop=True the old labels are kept in a new 'index' column, which is
# removed in a later cell.
autism_df.reset_index(inplace=True)

In [None]:
# Inspect the distinct age values.
# The original note here says age has a strange value of 29.1940085592011.
# NOTE(review): that value is not a whole-number age like the others; it looks like
# the unrounded column mean used to fill the missing ages, so it would only be
# visible here if this cell is run after imputation (otherwise NaN shows instead)
# - confirm the intended cell order.
autism_df['age'].unique()

array([26., 24., 27., 35., 40., 36., 17., 64., 29., 33., 18., 31., 30.,
       34., 38., 42., 43., 48., 37., 55., 50., 53., 20., 28., 21., 47.,
       32., 44., nan, 19., 58., 45., 22., 39., 25., 23., 54., 60., 41.,
       46., 56., 61., 59., 52., 49., 51.])

In [None]:
# Replace the weird fractional age (~29.194) with 29.
# The literal below is a truncated printout of the stored float, so an exact
# equality match (as Series.replace would do) is not guaranteed to hit it.
# np.isclose matches within a small tolerance instead; the tolerance is far
# smaller than the gap between whole-number ages, and NaN never matches.
weird_age = 29.1940085592011
is_weird_age = np.isclose(autism_df['age'], weird_age)
autism_df.loc[is_weird_age, 'age'] = 29

In [None]:
# Show the distinct age values again to review the column after the previous step.
autism_df['age'].unique()

array([26.        , 24.        , 27.        , 35.        , 40.        ,
       36.        , 17.        , 64.        , 29.        , 33.        ,
       18.        , 31.        , 30.        , 34.        , 38.        ,
       42.        , 43.        , 48.        , 37.        , 55.        ,
       50.        , 53.        , 20.        , 28.        , 21.        ,
       47.        , 32.        , 44.        , 29.19400856, 19.        ,
       58.        , 45.        , 22.        , 39.        , 25.        ,
       23.        , 54.        , 60.        , 41.        , 46.        ,
       56.        , 61.        , 59.        , 52.        , 49.        ,
       51.        ])

In [None]:
# Impute missing values in age with the mean of age, rounded to a whole year.
# All observed ages are whole numbers; filling with the raw mean would inject a
# lone fractional age (about 29.19) into the column. np.round keeps a NaN mean
# as NaN instead of raising, so an all-missing column is left unchanged.
imputed_age = np.round(autism_df['age'].mean())
autism_df['age'] = autism_df['age'].fillna(imputed_age)

In [None]:
# Confirm that no column has missing values left after imputation.
autism_df.isna().sum()

index               0
A1_Score            0
A2_Score            0
A3_Score            0
A4_Score            0
A5_Score            0
A6_Score            0
A7_Score            0
A8_Score            0
A9_Score            0
A10_Score           0
age                 0
gender              0
ethnicity           0
jaundice            0
autism_in_family    0
country_of_res      0
used_app_before     0
result              0
relation            0
Class/ASD           0
dtype: int64

In [None]:
# From EDA we know that 'ethnicity' has invalid values.
# List the distinct labels to see which ones need cleaning.
autism_df['ethnicity'].unique()

array(['White-European', 'Latino', '?', 'Others', 'Black', 'Asian',
       'Middle Eastern ', 'Pasifika', 'South Asian', 'Hispanic',
       'Turkish', 'others'], dtype=object)

In [None]:
# Fold the placeholder '?' and the lowercase variant 'others' into the single
# category 'Others'.
# NOTE(review): 'Middle Eastern ' keeps its trailing space after this step -
# confirm whether it should be stripped as well.
autism_df['ethnicity'] = autism_df['ethnicity'].replace(['?', 'others'], 'Others')

In [None]:
# Check the ethnicity values again: '?' and 'others' should no longer be listed.
autism_df['ethnicity'].unique()

array(['White-European', 'Latino', 'Others', 'Black', 'Asian',
       'Middle Eastern ', 'Pasifika', 'South Asian', 'Hispanic',
       'Turkish'], dtype=object)

In [None]:
# From EDA we know that 'relation' has invalid values.
# List the distinct labels to see which ones need cleaning.
autism_df['relation'].unique()

array(['Self', 'Parent', '?', 'Health care professional', 'Relative',
       'Others'], dtype=object)

In [None]:
# Map the placeholder '?' to the existing 'Others' category, then list the
# remaining distinct labels to verify the result.
relation_cleaned = autism_df['relation'].replace({'?': 'Others'})
autism_df['relation'] = relation_cleaned
autism_df['relation'].unique()

array(['Self', 'Parent', 'Others', 'Health care professional', 'Relative'],
      dtype=object)

In [None]:
# Drop the 'index' column (the old row labels left behind by reset_index).
# errors='ignore' makes the cell safe to re-run: a second execution is a no-op
# instead of raising KeyError because the column is already gone.
autism_df.drop(columns='index', inplace=True, errors='ignore')

In [None]:
# Check skewness after removing the extreme value in age.
# numeric_only=True restricts the calculation to the numeric columns. Without it,
# older pandas versions emit a deprecation warning about silently dropped
# non-numeric columns and pandas >= 2.0 raises TypeError on the string columns.
skewed = autism_df.skew(axis=0, skipna=True, numeric_only=True)
skewed

  """Entry point for launching an IPython kernel.


A1_Score    -0.988682
A2_Score     0.186115
A3_Score     0.168807
A4_Score     0.014256
A5_Score     0.002851
A6_Score     0.957352
A7_Score     0.332346
A8_Score    -0.630645
A9_Score     0.752161
A10_Score   -0.302755
age          1.036616
result       0.323562
dtype: float64

In [None]:
# Save the clean dataset as a new CSV and, when running in Google Colab, offer
# it as a browser download.
# The file is written first so that it is produced even outside Colab, where the
# google.colab import fails.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if downstream readers do not expect it - confirm.
autism_df.to_csv('autism_df.csv')
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; 'autism_df.csv' was written to the working directory.")
else:
    files.download('autism_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>