## Task: Given attributes about a person, predict whether their income is <=50K or >50K

In [1]:
import pandas as pd
import numpy as np

## Introduce data

In [2]:
# Load the dataset; cells containing the literal string '#NAME?' are read as NaN
df = pd.read_csv(
    'adult.csv',
    na_values=['#NAME?'],
)

In [3]:
# Preview the first 10 rows to get a feel for the columns and their values
df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,,0,0,40,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37.0,Private,284582.0,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49.0,Private,160187.0,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52.0,Self-emp-not-inc,209642.0,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31.0,Private,45781.0,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42.0,Private,159449.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [4]:
# Dataset dimensions as (rows, columns)
df.shape

(5000, 15)

In [5]:
# Summary statistics of the numeric columns; 'count' only includes non-missing values
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,4952.0,4893.0,4943.0,5000.0,5000.0,5000.0
mean,38.58643,190975.2,10.080316,1033.6402,93.6968,40.519
std,13.582256,106574.7,2.535268,7051.802077,410.801418,12.109193
min,17.0,19302.0,1.0,0.0,0.0,1.0
25%,28.0,117747.0,9.0,0.0,0.0,40.0
50%,37.0,179533.0,10.0,0.0,0.0,40.0
75%,47.0,241895.0,12.0,0.0,0.0,45.0
max,90.0,1033222.0,16.0,99999.0,2547.0,99.0


In [6]:
# Dtype and non-null count per column; a count below the number of rows means missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
age               4952 non-null float64
workclass         5000 non-null object
fnlwgt            4893 non-null float64
education         5000 non-null object
education_num     4943 non-null float64
marital_status    5000 non-null object
occupation        5000 non-null object
relationship      5000 non-null object
race              4736 non-null object
sex               4953 non-null object
capital_gain      5000 non-null int64
capital_loss      5000 non-null int64
hours_per_week    5000 non-null int64
native_country    5000 non-null object
income            5000 non-null object
dtypes: float64(3), int64(3), object(9)
memory usage: 586.1+ KB


In [7]:
# Class balance of the target variable
df['income'].value_counts()

<=50K    3779
>50K     1221
Name: income, dtype: int64

## Data cleaning

### A. Handling missing data 
+ Models cannot handle missing data
+ Simplest solution
   + Remove samples/features that have missing data
+ But that will introduce many problems
   + Data is randomly missing: potentially lose a lot of data
   + Data is non-randomly missing: lose a lot of data + create bias
   + Usually, this is a poor solution
+ A better solution: imputation
   + Replace missing value with another value
   + Strategies: mean, median, highest-frequency category

In [8]:
# Number of missing values per column, largest first (top five shown)
(
    df.isnull()
    .sum()
    .sort_values(ascending=False)
    .head()
)

race             264
fnlwgt           107
education_num     57
age               48
sex               47
dtype: int64

In [9]:
# Inspect the 'race' categories (NaN included) to find the most frequent one,
# which is the value used to replace the missing entries
(
    df['race']
    .value_counts(dropna=False)
    .sort_values(ascending=False)
)

White                 4021
Black                  493
NaN                    264
Asian-Pac-Islander     145
Amer-Indian-Eskimo      48
Other                   29
Name: race, dtype: int64

In [10]:
# Impute missing 'race' values with the most frequent category ('White').
# The filled column is assigned back to df: calling fillna(inplace=True) on
# df['race'] acts on an intermediate object, which newer pandas versions warn
# about and which leaves df unchanged once Copy-on-Write is enabled.
df['race'] = df['race'].fillna(value='White')
df['race'].value_counts(dropna=False).sort_values(ascending=False)

White                 4285
Black                  493
Asian-Pac-Islander     145
Amer-Indian-Eskimo      48
Other                   29
Name: race, dtype: int64

In [11]:
# Inspect the 'sex' categories (NaN included) to find the most frequent one,
# which is the value used to replace the missing entries
(
    df['sex']
    .value_counts(dropna=False)
    .sort_values(ascending=False)
)

Male      3332
Female    1621
NaN         47
Name: sex, dtype: int64

In [12]:
# Impute missing 'sex' values with the most frequent category ('Male').
# The filled column is assigned back to df: calling fillna(inplace=True) on
# df['sex'] acts on an intermediate object, which newer pandas versions warn
# about and which leaves df unchanged once Copy-on-Write is enabled.
df['sex'] = df['sex'].fillna(value='Male')
df['sex'].value_counts(dropna=False).sort_values(ascending=False)

Male      3379
Female    1621
Name: sex, dtype: int64

In [13]:
# Replace numerical missing values by the median
# Step 1: compute the median of 'age' and display it; it is the fill value for the missing ages
median_age = df['age'].median()
median_age

37.0

In [14]:
# Fill the missing ages with the median computed above
df['age'] = df['age'].fillna(value=median_age)

In [15]:
# Replace numerical missing values by the median
# Step 1: compute the median of 'fnlwgt' and display it; it is the fill value for the missing entries
median_fnlwgt = df['fnlwgt'].median()
median_fnlwgt

179533.0

In [16]:
# Fill the missing 'fnlwgt' values with the median computed above
df['fnlwgt'] = df['fnlwgt'].fillna(value=median_fnlwgt)

In [17]:
# Replace numerical missing values by the median
# Step 1: compute the median of 'education_num' and display it; it is the fill value for the missing entries
median_education_num = df['education_num'].median()
median_education_num

10.0

In [18]:
# Fill the missing 'education_num' values with the median computed above
df['education_num'] = df['education_num'].fillna(value=median_education_num)

In [19]:
# Re-count missing values per column after imputation; every count should now be 0
(
    df.isnull()
    .sum()
    .sort_values(ascending=False)
    .head()
)

income            0
native_country    0
hours_per_week    0
capital_loss      0
capital_gain      0
dtype: int64

### B. Dealing with data types
+ There are 3 main data types:
   + Numeric, e.g.: income, age
   + Categorical, e.g.: gender, nationality
   + Ordinal, e.g.: low/medium/high
+ Models can only handle numeric features

In [20]:
# Peek at a categorical column: its values are strings, not numbers
df['education'].head()

0    Bachelors
1    Bachelors
2      HS-grad
3         11th
4    Bachelors
Name: education, dtype: object

In [21]:
# use get_dummies in pandas or use OneHotEncoder in sklearn
# Demo on the first 5 rows only: one indicator column is created per category present in that sample
pd.get_dummies(df['education'].head(5))

Unnamed: 0,11th,Bachelors,HS-grad
0,0,1,0
1,0,1,0
2,0,0,1
3,1,0,0
4,0,1,0


In [22]:
# Decide which categorical features to use in the model.
# A feature with too many categories (e.g. 99 categories in 100 samples) is useless for the model.
for col_name in df.columns:
    # Only object-typed (categorical) columns are of interest here
    if df[col_name].dtypes != 'object':
        continue
    unique_categories = len(df[col_name].unique())
    print(
        "Feature '{col_name}' has '{unique_categories}' unique categories".format(
            col_name=col_name,
            unique_categories=unique_categories,
        )
    )

Feature 'workclass' has '8' unique categories
Feature 'education' has '17' unique categories
Feature 'marital_status' has '7' unique categories
Feature 'occupation' has '15' unique categories
Feature 'relationship' has '6' unique categories
Feature 'race' has '5' unique categories
Feature 'sex' has '2' unique categories
Feature 'native_country' has '40' unique categories
Feature 'income' has '2' unique categories


In [23]:
# 'native_country' has many unique categories, and most of them have only a few samples
(
    df['native_country']
    .value_counts()
    .sort_values(ascending=False)
)

United-States                 4465
Mexico                         104
?                               97
Canada                          28
Philippines                     22
Germany                         22
El-Salvador                     16
England                         16
Puerto-Rico                     16
Jamaica                         15
China                           15
Cuba                            14
Dominican-Republic              13
India                           12
Italy                           12
Iran                            11
Taiwan                          10
Guatemala                       10
Vietnam                         10
Japan                            9
Poland                           9
South                            9
Portugal                         7
Greece                           6
Haiti                            6
Nicaragua                        5
Columbia                         5
Ecuador                          4
Trinadad&Tobago     

In [24]:
# Collapse every country other than 'United-States' into a single 'Other' category
df['native_country'] = [
    country if country == 'United-States' else 'Other'
    for country in df['native_country']
]

In [25]:
# Verify the regrouping: only 'United-States' and 'Other' should remain
(
    df['native_country']
    .value_counts()
    .sort_values(ascending=False)
)

United-States    4465
Other             535
Name: native_country, dtype: int64

In [26]:
# Create a list of features to dummy.
# Note: there must be no trailing comma after the closing bracket, otherwise
# Python wraps the list in a 1-tuple instead of producing a flat list of names.
to_dummy_list = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
]

In [27]:
# Function to dummy for all categorical features
def dummy_df(df, to_dummy_list):
    """One-hot encode the given columns and return the result as a new DataFrame.

    Each entry of ``to_dummy_list`` is removed from the frame and replaced by
    its indicator columns (named ``<column>_<category>``), which are appended
    on the right-hand side. The caller's DataFrame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    to_dummy_list : iterable
        Column names to encode. An entry may also be a list of categorical
        column names, in which case those columns are encoded in one step.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the encoded columns, followed by the indicator columns.
    """
    for x in to_dummy_list:
        # dtype is pinned so the indicators are 0/1 integers on every pandas
        # version (newer releases default to booleans).
        dummies = pd.get_dummies(df[x], prefix=x, dummy_na=False, dtype='uint8')
        # axis is passed by keyword: the positional form df.drop(x, 1) is
        # rejected by pandas >= 2.0.
        df = df.drop(x, axis=1)
        df = pd.concat([df, dummies], axis=1)
    return df

In [29]:
# One-hot encode the selected categorical columns.
# NOTE: df is overwritten, so this cell cannot be re-run on its own -- the original
# columns are gone after the first run and df[x] inside dummy_df would fail.
df = dummy_df(df,to_dummy_list)
df.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income,workclass_?,workclass_Federal-gov,workclass_Local-gov,...,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,native_country_Other,native_country_United-States
0,39.0,77516.0,13.0,2174,0,40,<=50K,0,0,0,...,0,0,0,0,0,1,0,1,0,1
1,50.0,83311.0,13.0,0,0,13,<=50K,0,0,0,...,0,0,0,0,0,1,0,1,0,1
2,38.0,215646.0,9.0,0,0,40,<=50K,0,0,0,...,0,0,0,0,0,1,0,1,0,1
3,53.0,234721.0,7.0,0,0,40,<=50K,0,0,0,...,0,0,0,1,0,0,0,1,0,1
4,28.0,338409.0,13.0,0,0,40,<=50K,0,0,0,...,1,0,0,1,0,0,1,0,1,0


### C. Detecting outliers
+ An outlier is an observation that deviates drastically from other observations in a dataset
+ Occurrence:
   + Natural: Jack Ma's income
   + Error: a man weighs 1000kg due to mistyping extra 0
+ Problematic:
   + Natural: 
      + Not necessarily problematic
      + But can skew your model by affecting the slope
   + Error:
      + Indicative of data quality issues
      + Treat in the same way as a missing value, i.e. use imputation
+ Method:
   + Tukey IQR
   + Kernel density estimation