<a href="https://colab.research.google.com/github/Melvinmcrn/DataScience/blob/master/ML1_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier

In [2]:
columns_name = ['age','workclass','fnlwgt','education','education-num','maritial-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','salary']
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', names=columns_name)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,maritial-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age                32561 non-null int64
workclass          32561 non-null object
fnlwgt             32561 non-null int64
education          32561 non-null object
education-num      32561 non-null int64
maritial-status    32561 non-null object
occupation         32561 non-null object
relationship       32561 non-null object
race               32561 non-null object
sex                32561 non-null object
capital-gain       32561 non-null int64
capital-loss       32561 non-null int64
hours-per-week     32561 non-null int64
native-country     32561 non-null object
salary             32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
data.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


# Data Preparation

First, strip every data in the dataframe because every data start with a space.

In [5]:
df_obj = data.select_dtypes(['object'])
data[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())
data['sex'][0]

'Male'

Set '?' value to null so we can deal with NaN easily later.

In [0]:
def CheckQuestionValue():
  for c in data.columns:
    if len(data.loc[data[c] == '?']):
      print(c)

In [7]:
CheckQuestionValue()

workclass
occupation
native-country


  result = method(y)


In [8]:
mapping_dict ={'?':np.nan}
data = data.replace(mapping_dict)
CheckQuestionValue()

  result = method(y)


Change sex data to (0 for Male) and (1 for Female)

In [9]:
mapping_dict = { 'sex': {'Male':0, 'Female':1}}
cleaned_data = data.replace(mapping_dict)
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age                32561 non-null int64
workclass          30725 non-null object
fnlwgt             32561 non-null int64
education          32561 non-null object
education-num      32561 non-null int64
maritial-status    32561 non-null object
occupation         30718 non-null object
relationship       32561 non-null object
race               32561 non-null object
sex                32561 non-null int64
capital-gain       32561 non-null int64
capital-loss       32561 non-null int64
hours-per-week     32561 non-null int64
native-country     31978 non-null object
salary             32561 non-null object
dtypes: int64(7), object(8)
memory usage: 3.7+ MB


Change target value to 0 (>50K) and 1 (<=50K)

In [10]:
mapping_dict = { 'salary': {'>50K':0, '<=50K':1}}
cleaned_data = cleaned_data.replace(mapping_dict)
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age                32561 non-null int64
workclass          30725 non-null object
fnlwgt             32561 non-null int64
education          32561 non-null object
education-num      32561 non-null int64
maritial-status    32561 non-null object
occupation         30718 non-null object
relationship       32561 non-null object
race               32561 non-null object
sex                32561 non-null int64
capital-gain       32561 non-null int64
capital-loss       32561 non-null int64
hours-per-week     32561 non-null int64
native-country     31978 non-null object
salary             32561 non-null int64
dtypes: int64(8), object(7)
memory usage: 3.7+ MB


Deal with cat data and make them numeric

In [11]:
cleaned_data['workclass'].value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

In [12]:
dummy = pd.get_dummies(cleaned_data['workclass'], drop_first=True)
dummy.head(5)

Unnamed: 0,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay
0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0
4,0,0,1,0,0,0,0


In [13]:
cleaned_data = pd.concat([cleaned_data, dummy], axis=1)
cleaned_data = cleaned_data.drop(['workclass'], axis=1)
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 21 columns):
age                 32561 non-null int64
fnlwgt              32561 non-null int64
education           32561 non-null object
education-num       32561 non-null int64
maritial-status     32561 non-null object
occupation          30718 non-null object
relationship        32561 non-null object
race                32561 non-null object
sex                 32561 non-null int64
capital-gain        32561 non-null int64
capital-loss        32561 non-null int64
hours-per-week      32561 non-null int64
native-country      31978 non-null object
salary              32561 non-null int64
Local-gov           32561 non-null uint8
Never-worked        32561 non-null uint8
Private             32561 non-null uint8
Self-emp-inc        32561 non-null uint8
Self-emp-not-inc    32561 non-null uint8
State-gov           32561 non-null uint8
Without-pay         32561 non-null uint8
dtypes: int64(8), object

In [14]:
cleaned_data['education'].value_counts()

HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: education, dtype: int64

In [0]:
dummy = pd.get_dummies(cleaned_data['education'], drop_first=True)
cleaned_data = pd.concat([cleaned_data, dummy], axis=1)
cleaned_data = cleaned_data.drop(['education'], axis=1)

In [16]:
cleaned_data['maritial-status'].value_counts()

Married-civ-spouse       14976
Never-married            10683
Divorced                  4443
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: maritial-status, dtype: int64

In [0]:
dummy = pd.get_dummies(cleaned_data['maritial-status'], drop_first=True)
cleaned_data = pd.concat([cleaned_data, dummy], axis=1)
cleaned_data = cleaned_data.drop(['maritial-status'], axis=1)

In [18]:
cleaned_data['occupation'].value_counts()

Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: occupation, dtype: int64

In [0]:
dummy = pd.get_dummies(cleaned_data['occupation'], drop_first=True)
cleaned_data = pd.concat([cleaned_data, dummy], axis=1)
cleaned_data = cleaned_data.drop(['occupation'], axis=1)

In [20]:
cleaned_data['relationship'].value_counts()

Husband           13193
Not-in-family      8305
Own-child          5068
Unmarried          3446
Wife               1568
Other-relative      981
Name: relationship, dtype: int64

In [0]:
dummy = pd.get_dummies(cleaned_data['relationship'], drop_first=True)
cleaned_data = pd.concat([cleaned_data, dummy], axis=1)
cleaned_data = cleaned_data.drop(['relationship'], axis=1)

In [22]:
cleaned_data['race'].value_counts()

White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

In [0]:
dummy = pd.get_dummies(cleaned_data['race'], drop_first=True)
cleaned_data = pd.concat([cleaned_data, dummy], axis=1)
cleaned_data = cleaned_data.drop(['race'], axis=1)

In [24]:
cleaned_data['native-country'].value_counts()

United-States                 29170
Mexico                          643
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
Greece                           29
France                      

In [0]:
#drop native-country because there are too many value
cleaned_data = cleaned_data.drop(['native-country'], axis=1)

In [26]:
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 58 columns):
age                      32561 non-null int64
fnlwgt                   32561 non-null int64
education-num            32561 non-null int64
sex                      32561 non-null int64
capital-gain             32561 non-null int64
capital-loss             32561 non-null int64
hours-per-week           32561 non-null int64
salary                   32561 non-null int64
Local-gov                32561 non-null uint8
Never-worked             32561 non-null uint8
Private                  32561 non-null uint8
Self-emp-inc             32561 non-null uint8
Self-emp-not-inc         32561 non-null uint8
State-gov                32561 non-null uint8
Without-pay              32561 non-null uint8
11th                     32561 non-null uint8
12th                     32561 non-null uint8
1st-4th                  32561 non-null uint8
5th-6th                  32561 non-null uint8
7th-8th          

In [27]:
cleaned_data.isnull().sum()

age                      0
fnlwgt                   0
education-num            0
sex                      0
capital-gain             0
capital-loss             0
hours-per-week           0
salary                   0
Local-gov                0
Never-worked             0
Private                  0
Self-emp-inc             0
Self-emp-not-inc         0
State-gov                0
Without-pay              0
11th                     0
12th                     0
1st-4th                  0
5th-6th                  0
7th-8th                  0
9th                      0
Assoc-acdm               0
Assoc-voc                0
Bachelors                0
Doctorate                0
HS-grad                  0
Masters                  0
Preschool                0
Prof-school              0
Some-college             0
Married-AF-spouse        0
Married-civ-spouse       0
Married-spouse-absent    0
Never-married            0
Separated                0
Widowed                  0
Armed-Forces             0
C

In [28]:
nonnull_data = cleaned_data.dropna()
nonnull_data.isnull().sum()

age                      0
fnlwgt                   0
education-num            0
sex                      0
capital-gain             0
capital-loss             0
hours-per-week           0
salary                   0
Local-gov                0
Never-worked             0
Private                  0
Self-emp-inc             0
Self-emp-not-inc         0
State-gov                0
Without-pay              0
11th                     0
12th                     0
1st-4th                  0
5th-6th                  0
7th-8th                  0
9th                      0
Assoc-acdm               0
Assoc-voc                0
Bachelors                0
Doctorate                0
HS-grad                  0
Masters                  0
Preschool                0
Prof-school              0
Some-college             0
Married-AF-spouse        0
Married-civ-spouse       0
Married-spouse-absent    0
Never-married            0
Separated                0
Widowed                  0
Armed-Forces             0
C

In [29]:
nonnull_data.shape

(32561, 58)

In [0]:
data = nonnull_data

# Divide data into train/test data

In [0]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('salary', axis=1), data['salary'], test_size=0.3, stratify=data['salary'], random_state=1234)

# Start training

## RandomForest

In [0]:
parameters = {'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10], 'max_depth':[None, 1,2,3,4,5,6,7,8,9,10], 'criterion':['gini','entropy'], 'min_samples_split':[2,3,4,5,6,7,8,9,10]}

In [53]:
clf = RandomizedSearchCV(RandomForestClassifier(random_state=1234), parameters, random_state=1234)
clf.fit(X_train, y_train)
clf.best_params_

{'criterion': 'entropy',
 'max_depth': None,
 'min_samples_leaf': 10,
 'min_samples_split': 8}

In [0]:
clf = GridSearchCV(RandomForestClassifier(), parameters)
clf.fit(X_train, y_train)
clf.best_params_