In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import scipy as sp
import matplotlib as mpl
import seaborn as sns

# Setting up Pandas
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

# Setting up Seaborn
sns.set_style("whitegrid")
#sns.set_context("poster")

%matplotlib inline

In [22]:
train = pd.read_csv('training_data.csv')
test = pd.read_csv('test_data.csv')

In [23]:
training = train.drop('status_group', axis=1)

In [24]:
training = training.drop('Unnamed: 0', axis=1)

test = test.drop('Unnamed: 0', axis=1)

In [8]:
training.shape

(59400, 21)

In [14]:
training.head()

Unnamed: 0,amount_tsh,days_since_recorded,funder,gps_height,installer,basin,subvillage,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,payment_type,water_quality,quantity_group,source_type,source_class,waterpoint_type,waterpoint_type_group
0,6000.0,995,other,1390,other,Lake Nyasa,Mnyusi B,109,True,vwc,Roman,False,90s,gravity,annually,soft,enough,spring,groundwater,communal standpipe,communal standpipe
1,0.0,272,other,1399,other,Lake Victoria,Nyamara,280,Unknown,other,Others,True,10s,gravity,never pay,soft,insufficient,rainwater harvesting,surface,communal standpipe,communal standpipe
2,25.0,281,other,686,other,Pangani,Majengo,250,True,vwc,Nyumba ya mungu pipe scheme,True,00s,gravity,per bucket,soft,enough,dam,surface,communal standpipe multiple,communal standpipe
3,0.0,309,Unicef,263,other,Ruvuma / Southern Coast,Mahakamani,58,True,vwc,Others,True,80s,submersible,never pay,soft,dry,borehole,groundwater,communal standpipe multiple,communal standpipe
4,0.0,874,other,0,other,Lake Victoria,Kyanyamisa,0,True,other,Others,True,unknown,gravity,never pay,soft,seasonal,rainwater harvesting,surface,communal standpipe,communal standpipe


In [16]:
training['amount_tsh'].value_counts()

0.00         41639
500.00        3102
50.00         2472
1000.00       1488
20.00         1463
200.00        1220
100.00         816
10.00          806
30.00          743
2000.00        704
250.00         569
300.00         557
5000.00        450
5.00           376
25.00          356
3000.00        334
1200.00        267
1500.00        197
6.00           190
600.00         176
4000.00        156
2400.00        145
2500.00        139
6000.00        125
7.00            69
8000.00         61
750.00          59
40.00           59
10000.00        57
12000.00        51
             ...  
16000.00         2
16300.00         2
800.00           2
11000.00         2
590.00           2
520.00           2
26000.00         2
13000.00         2
9.00             1
170000.00        1
200000.00        1
900.00           1
0.25             1
53.00            1
350000.00        1
306.00           1
120000.00        1
26.00            1
59.00            1
60000.00         1
5400.00          1
70000.00    

In [7]:
test.shape

(14850, 21)

In [11]:
##Plot function for Confusion Matrix

#plt.rcParams['figure.figsize'] = (6.0, 6.0)
from sklearn.metrics import confusion_matrix
labels=['functional','functional needs repair','non functional']
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar(shrink=0.7)
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45, ha='right', fontsize=12)
    plt.yticks(tick_marks, labels , fontsize=12)
    plt.tight_layout()
    plt.ylabel('True label', fontsize=12)
    plt.xlabel('Predicted label', fontsize=12)

# Because we are having maximum amount of Categorical data let us transform the features, and convert them into list

In [12]:
def transform_feature(df, column_name):
    unique_values = set(df[column_name].tolist())
    transformer_dict = {}
    for index, value in enumerate(unique_values):
        transformer_dict[value] = index
    df[column_name] = df[column_name].apply(lambda y: transformer_dict[y])
    return df

In [17]:
ll=set(training['funder'].tolist())
print(ll)

{'rwssp', 'other', 'hesawa', 'world_bank', 'Kkkt', 'danida', 'District Council', 'Tasaf', 'gov', 'Unicef', 'World Vision'}


In [18]:
transform_dict={}

In [19]:
for index,value in enumerate(ll):
    print(index,value)
                        

0 rwssp
1 other
2 hesawa
3 world_bank
4 Kkkt
5 danida
6 District Council
7 Tasaf
8 gov
9 Unicef
10 World Vision


In [13]:
training.columns

Index(['amount_tsh', 'days_since_recorded', 'funder', 'gps_height', 'installer', 'basin', 'subvillage', 'population', 'public_meeting', 'scheme_management', 'scheme_name', 'permit', 'construction_year', 'extraction_type', 'payment_type', 'water_quality', 'quantity_group', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group'], dtype='object')

In [20]:
integer_columns = ['days_since_recorded', 'population','gps_height'] 
columns_to_transform = [col for col in training.columns if col not in integer_columns]
for column in columns_to_transform: 
    training = transform_feature(training, column)
    test = transform_feature(test, column)

In [21]:
training.head()

Unnamed: 0,amount_tsh,days_since_recorded,funder,gps_height,installer,basin,subvillage,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,payment_type,water_quality,quantity_group,source_type,source_class,waterpoint_type,waterpoint_type_group
0,77,995,1,1390,1,2,18779,109,1,7,1182,2,6,5,4,3,2,2,2,6,5
1,0,272,1,1399,1,8,12064,280,0,0,248,1,5,5,0,3,4,1,1,6,5
2,15,281,1,686,1,0,9296,250,1,7,926,1,2,5,6,3,2,6,1,3,5
3,0,309,9,263,1,3,11452,58,1,7,248,1,3,7,0,3,1,5,2,3,5
4,0,874,1,0,1,8,4317,0,1,0,248,1,0,5,0,3,3,1,1,6,5


# LabelEncoder also does the same thing

In [33]:
select=[cols for cols in training.columns if cols not in integer_columns]
select

['amount_tsh',
 'funder',
 'installer',
 'basin',
 'subvillage',
 'public_meeting',
 'scheme_management',
 'scheme_name',
 'permit',
 'construction_year',
 'extraction_type',
 'payment_type',
 'water_quality',
 'quantity_group',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group']

In [36]:
from sklearn.preprocessing import LabelEncoder
training[select]=training[select].apply(LabelEncoder().fit_transform)

In [37]:
training.head()

Unnamed: 0,amount_tsh,days_since_recorded,funder,gps_height,installer,basin,subvillage,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,payment_type,water_quality,quantity_group,source_type,source_class,waterpoint_type,waterpoint_type_group
0,62,995,8,1390,7,1,11807,109,1,6,2246,0,5,1,0,6,1,6,0,1,1
1,0,272,8,1399,7,4,15838,280,2,2,2171,1,1,1,2,6,2,3,1,1,1
2,13,281,8,686,7,5,9074,250,1,6,2121,1,0,1,5,6,1,1,1,2,1
3,0,309,3,263,7,7,8982,58,1,6,2171,1,4,8,2,6,0,0,0,2,1
4,0,874,8,0,7,4,7698,0,1,2,2171,1,6,1,2,6,3,3,1,1,1


In [72]:
test.head()

Unnamed: 0,amount_tsh,days_since_recorded,funder,gps_height,installer,basin,subvillage,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,payment_type,water_quality,quantity_group,source_type,source_class,waterpoint_type,waterpoint_type_group
0,0.0,302,other,1996,other,Internal,Magoma,321,True,Parastatal,,True,10s,other,never pay,soft,seasonal,rainwater harvesting,surface,other,other
1,0.0,302,gov,1569,dwe,Pangani,Kimnyak,300,True,vwc,TPRI pipe line,True,00s,gravity,never pay,soft,insufficient,spring,groundwater,communal standpipe,communal standpipe
2,0.0,305,other,1567,other,Internal,Msatu,500,True,vwc,P,Unknown,10s,other,never pay,soft,insufficient,rainwater harvesting,surface,other,other
3,0.0,315,other,267,other,Ruvuma / Southern Coast,Kipindimbi,250,Unknown,vwc,,True,80s,other,unknown,soft,dry,shallow well,groundwater,other,other
4,500.0,251,other,1260,other,Ruvuma / Southern Coast,Losonga,60,Unknown,wtr_brd,BRUDER,True,00s,gravity,monthly,soft,enough,spring,groundwater,communal standpipe,communal standpipe


In [73]:
from sklearn.preprocessing import LabelEncoder
test[select]=test[select].apply(LabelEncoder().fit_transform)

TypeError: ("'>' not supported between instances of 'str' and 'float'", 'occurred at index subvillage')

In [38]:
## Converting the Training dataframe into a matrix and predictor as y 
X = training.as_matrix()
y = train["status_group"].tolist()

In [39]:
X

array([[  62,  995,    8, ...,    0,    1,    1],
       [   0,  272,    8, ...,    1,    1,    1],
       [  13,  281,    8, ...,    1,    2,    1],
       ..., 
       [   0,  967,    8, ...,    0,    4,    3],
       [   0, 1001,    8, ...,    0,    4,    3],
       [   0,  986,   10, ...,    0,    4,    3]], dtype=int64)

In [40]:
import sklearn.model_selection 
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, 
                                                                             y, 
                                                                             test_size = 0.3, 
                                                                             random_state = 0)

In [41]:
import sklearn.ensemble

rfc = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, 
                                              min_samples_split=6,
                                              criterion='gini', 
                                              max_features='auto',
                                              oob_score=True,
                                              random_state=1,
                                              n_jobs=-1)

In [42]:
rfc.fit(X_train, y_train)

print('Random Forest Classifier Train Accuracy Score :', rfc.score(X_train, y_train))
print('Random Forest Classifier Test Score :', rfc.score(X_test, y_test))

Random Forest Classifier Train Accuracy Score : 0.938504088504
Random Forest Classifier Test Score : 0.804994388328


In [62]:
importances=list(zip(rfc.feature_importances_,training.columns))

In [63]:
importances.sort(reverse=True)

In [64]:
importances

[(0.15597222796778859, 'quantity_group'),
 (0.10315411867437624, 'days_since_recorded'),
 (0.099319325593114377, 'subvillage'),
 (0.077244034567599235, 'gps_height'),
 (0.057728222512099733, 'waterpoint_type'),
 (0.052747969638942328, 'population'),
 (0.050438965815730326, 'scheme_name'),
 (0.046020721013466492, 'waterpoint_type_group'),
 (0.040774628584932707, 'extraction_type'),
 (0.038011500051715878, 'construction_year'),
 (0.036073540144005996, 'amount_tsh'),
 (0.035948969233674501, 'basin'),
 (0.033284426337876316, 'payment_type'),
 (0.031822624506119911, 'scheme_management'),
 (0.031289319935021549, 'funder'),
 (0.026247125779416222, 'installer'),
 (0.026230609532461091, 'source_type'),
 (0.01850966324951276, 'water_quality'),
 (0.01652040398799736, 'permit'),
 (0.013661966885625293, 'public_meeting'),
 (0.0089996359885231806, 'source_class')]

In [50]:
#Using StandardScaler
from sklearn.preprocessing import StandardScaler

In [53]:
ss=StandardScaler()

In [54]:
X=ss.fit_transform(training)

In [55]:
X

array([[ 3.35282034,  1.14113571,  0.50520616, ..., -0.53841035,
        -0.8836145 , -0.76596738],
       [-0.55432861, -1.02215173,  0.50520616, ...,  1.76507387,
        -0.8836145 , -0.76596738],
       [ 0.2649123 , -0.99522284,  0.50520616, ...,  1.76507387,
        -0.32226541, -0.76596738],
       ..., 
       [-0.55432861,  1.05735695,  0.50520616, ..., -0.53841035,
         0.80043277,  0.68443707],
       [-0.55432861,  1.15908831,  0.50520616, ..., -0.53841035,
         0.80043277,  0.68443707],
       [-0.55432861,  1.11420682,  1.55956798, ..., -0.53841035,
         0.80043277,  0.68443707]])

In [56]:
import sklearn.model_selection 
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, 
                                                                             y, 
                                                                             test_size = 0.3, 
                                                                             random_state = 0)

In [57]:
import sklearn.ensemble

rfc = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, 
                                              min_samples_split=6,
                                              criterion='gini', 
                                              max_features='auto',
                                              oob_score=True,
                                              random_state=1,
                                              n_jobs=-1)

In [58]:
rfc.fit(X_train, y_train)

print('Random Forest Classifier Train Accuracy Score :', rfc.score(X_train, y_train))
print('Random Forest Classifier Test Score :', rfc.score(X_test, y_test))

Random Forest Classifier Train Accuracy Score : 0.938552188552
Random Forest Classifier Test Score : 0.805050505051


# Using PCA

In [66]:
from sklearn.decomposition import PCA

In [67]:
from sklearn.pipeline import Pipeline

In [69]:
pca=PCA()
pipe=Pipeline([('pca',pca),('rfc',rfc)])

In [70]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('rfc', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
         ...timators=1000, n_jobs=-1, oob_score=True, random_state=1,
            verbose=0, warm_start=False))])

In [71]:
pipe.score(X_test,y_test)

0.77676767676767677