In [11]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, LeaveOneOut, RepeatedKFold, cross_val_score
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [2]:
df= pd.read_csv('income_evaluation.csv', na_values= ' ?')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
df.isna().sum()

age                   0
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week       0
 native-country     583
 income               0
dtype: int64

In [4]:
df.fillna('missing', inplace= True)

In [5]:
df.isna().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
dtype: int64

In [6]:
X= df.drop(' income', axis=1)
X

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [7]:
y= df[' income']
y

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
32556     <=50K
32557      >50K
32558     <=50K
32559     <=50K
32560      >50K
Name:  income, Length: 32561, dtype: object

In [8]:
y.value_counts()

 <=50K    24720
 >50K      7841
Name:  income, dtype: int64

In [9]:
X_train, X_test, y_train, y_test= train_test_split(X, y, 
                                                   train_size= 0.8, random_state= 0)

#KFold

In [10]:
kf= KFold(n_splits=5)

In [12]:
i=1
for train_set, test_set in kf.split(X_train):
    print('iteration', i)
    print(train_set, 'having length', len(train_set))
    print(test_set, 'having length', len(test_set))
    i+=1

iteration 1
[ 5210  5211  5212 ... 26045 26046 26047] having length 20838
[   0    1    2 ... 5207 5208 5209] having length 5210
iteration 2
[    0     1     2 ... 26045 26046 26047] having length 20838
[ 5210  5211  5212 ... 10417 10418 10419] having length 5210
iteration 3
[    0     1     2 ... 26045 26046 26047] having length 20838
[10420 10421 10422 ... 15627 15628 15629] having length 5210
iteration 4
[    0     1     2 ... 26045 26046 26047] having length 20839
[15630 15631 15632 ... 20836 20837 20838] having length 5209
iteration 5
[    0     1     2 ... 20836 20837 20838] having length 20839
[20839 20840 20841 ... 26045 26046 26047] having length 5209


In [13]:
num_cols= X_train.select_dtypes(include= np.number).columns
num_cols

Index(['age', ' fnlwgt', ' education-num', ' capital-gain', ' capital-loss',
       ' hours-per-week'],
      dtype='object')

In [14]:
cat_cols= X_train.select_dtypes(exclude= np.number).columns
cat_cols

Index([' workclass', ' education', ' marital-status', ' occupation',
       ' relationship', ' race', ' sex', ' native-country'],
      dtype='object')

In [15]:
ct= ColumnTransformer([
    ('num_cols', RobustScaler(), num_cols),
    ('cat_cols', OneHotEncoder(sparse= False, handle_unknown= 'ignore'), cat_cols)
])

In [16]:
pipe= Pipeline([
    ('preprocessing', ct),
    ('model', RandomForestClassifier(n_estimators= 10, random_state= 0))
])

In [17]:
scores= []
i= 1
for train_set, test_set in kf.split(X):
    pipe.fit(X.loc[train_set], y[train_set])
    sco= pipe.score(X.loc[test_set], y[test_set])
    scores.append(sco)
    print("Iteraiton ", i)
    i+=1
scores

Iteraiton  1
Iteraiton  2
Iteraiton  3
Iteraiton  4
Iteraiton  5


[0.8478427759864886,
 0.8452088452088452,
 0.8461302211302212,
 0.8485872235872236,
 0.8516584766584766]

In [18]:
# Stratified KFold

In [19]:
skf= StratifiedKFold()

In [20]:
scores_skf= []
i= 1
for train_set, test_set in skf.split(X=X, y=y):
    pipe.fit(X.loc[train_set], y[train_set])
    sco= pipe.score(X.loc[test_set], y[test_set])
    scores_skf.append(sco)
    print("Iteraiton ", i)
    i+=1
scores

Iteraiton  1
Iteraiton  2
Iteraiton  3
Iteraiton  4
Iteraiton  5


[0.8478427759864886,
 0.8452088452088452,
 0.8461302211302212,
 0.8485872235872236,
 0.8516584766584766]

In [26]:
start= time.time()
result_skf= cross_val_score(estimator= pipe, X= X_train, y= y_train,
                                   scoring= 'accuracy', cv= StratifiedKFold(n_splits= 5))
print('Time Taken=', time.time()- start)

Time Taken= 3.2351434230804443


In [27]:
result_skf

array([0.85143954, 0.85259117, 0.85163148, 0.84891534, 0.85141102])

In [28]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num_cols',
                                                  RobustScaler(copy=True,
                                                               quantile_range=(25.0,
                                                                               75.0),
                                                               with_centering=True,
                                                               with_scaling=True),
                                                  Index(['age', ' fnlwgt', ' education-num', ' capital-gain', ' capital-loss',
       ' hours-per-week'],
      dtype='obje...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        

In [29]:
pipe.score(X_test, y_test)

0.8446184553968985

In [None]:
start= time.time()
result_lkoo= cross_val_score(estimator= pipe, X= X_train, y= y_train,
                                   scoring= 'accuracy', cv= RepeatedKFold(n_splits= 5, n_repeats= 10))
print('Time Taken=', time.time()- start)

In [None]:
pipe.fit(X_)