### Background

## 1. Set up drive

In [2]:
# Mount Google Drive at /content/gdrive so the project files stored there can be read
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
cd "gdrive/MyDrive/Projects/1 - Numericals/Autism Prediction Challenge/2 - Production/data"

/content/gdrive/MyDrive/Projects/1 - Numericals/Autism Prediction Challenge/2 - Production/data


## 2. Import Libraries

In [171]:
# Load data
import pandas as pd
import numpy as np
import io
import os
import glob

# Meta
import time

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sb

# Analysis
from scipy.stats import zscore

## 3. Data Assessment Class

## 3. Load Data

In [172]:
ls

[0m[01;34mAutism-prediction[0m/     sample_submission.csv  train.csv
autism-prediction.zip  test.csv


In [173]:
# Read the training set; the relative path resolves against the data directory
# entered by the earlier `cd` cell.
df = pd.read_csv('train.csv')

## 4. Clean Data

1. Age - It has at least 6 decimal places - Round it to 2 decimal places


Code:

In [174]:
# Keep two decimal places of age
df['age'] = df['age'].round(2) 

Test:

In [175]:
df['age']

0      18.61
1      13.83
2      14.68
3      61.04
4      14.26
       ...  
795    42.08
796    17.67
797    18.24
798    19.24
799    32.17
Name: age, Length: 800, dtype: float64

2. Result - The result column is of float type - Ensure result is rounded to 6 decimal places

Code:

In [176]:
# Normalise the score precision to six decimal places
df['result'] = df['result'].round(6) 

Test:

In [177]:
df['result']

0       7.819715
1      10.544296
2      13.167506
3       1.530098
4       7.949723
         ...    
795    13.390868
796     9.454201
797     6.805509
798     3.682732
799    12.060168
Name: result, Length: 800, dtype: float64

3 - Impute missing values in ethnicity and relation features

Code:

In [178]:
# Swap every '?' placeholder cell in the frame for the explicit label 'Unknown'
df = df.replace('?', 'Unknown')

Test:

In [179]:
df[df['ethnicity'] == '?']

Unnamed: 0,ID,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,...,gender,ethnicity,jaundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD


In [180]:
df[df['relation'] == '?']

Unnamed: 0,ID,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,...,gender,ethnicity,jaundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD


4 - The 'others' value in Ethnicity is in lower case. This needs to be changed to title case

Code:

In [181]:
# Title-case every ethnicity label so differently-cased spellings share one form
df['ethnicity'] = df['ethnicity'].str.title()

Test:

In [182]:
df['ethnicity'].str.istitle().unique()

array([ True])

5 - Change column names

Code:

In [183]:
# The 'austim' column name contains a typo and is not descriptive - rename it to 'fam_history_autism'
df = df.rename(columns={"austim":"fam_history_autism"})

In [184]:
# The 'contry_of_res' column name contains a typo and is not descriptive - rename it to 'country'
df = df.rename(columns={"contry_of_res":"country"})

In [185]:
# The 'result' column header is not descriptive - rename it to 'final_score'
df = df.rename(columns={"result":"final_score"})

In [186]:
# The 'relation' column header is not descriptive - rename it to 'survey_done_by'
df = df.rename(columns={"relation":"survey_done_by"})

Test:

In [187]:
df.columns

Index(['ID', 'A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score',
       'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age',
       'gender', 'ethnicity', 'jaundice', 'fam_history_autism', 'country',
       'used_app_before', 'final_score', 'age_desc', 'survey_done_by',
       'Class/ASD'],
      dtype='object')

6 - The AQ1-10 screening test is recommended for adults without moderate or severe learning disability, so this model is useful only for adults. The participants in the dataset are also adults - Drop the age_desc feature

Code:

In [188]:
# Remove the age_desc column; every other column is kept as-is
df = df.drop(columns=['age_desc'])

Test:

In [189]:
df.columns

Index(['ID', 'A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score',
       'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age',
       'gender', 'ethnicity', 'jaundice', 'fam_history_autism', 'country',
       'used_app_before', 'final_score', 'survey_done_by', 'Class/ASD'],
      dtype='object')

7 - Save data to csv

In [190]:
# Write the cleaned frame without its default integer index; otherwise the index is
# saved as an extra unnamed first column that reappears as 'Unnamed: 0' on reload.
df.to_csv('l1_clean_df.csv', index=False)
