In [72]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score as acc
from sklearn.preprocessing import StandardScaler as ss, RobustScaler as rs
from sklearn.model_selection import train_test_split as tts, GridSearchCV as GSCV
from sklearn.linear_model import LogisticRegression as LR
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier as RFC, GradientBoostingClassifier as GBC
import category_encoders as ce
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [73]:
train_features = pd.read_csv('https://drive.google.com/uc?export=download&id=14ULvX0uOgftTB2s97uS8lIx1nHGQIB0P')
train_labels = pd.read_csv('https://drive.google.com/uc?export=download&id=1r441wLr7gKGHGLyPpKauvCuUOU556S2f')
test_features = pd.read_csv('https://drive.google.com/uc?export=download&id=1wvsYl9hbRbZuIuoaLWCsW_kbcxCdocHz')
sample_submission = pd.read_csv('https://drive.google.com/uc?export=download&id=1kfJewnmhowpUo381oSn3XqsQ6Eto23XV')
train_features.shape, train_labels.shape, test_features.shape, sample_submission.shape

((59400, 40), (59400, 2), (14358, 40), (14358, 2))

In [74]:
train_labels['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [75]:
train_labels['baseline'] = 'functional'

In [76]:
train_features.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [77]:
train_features['year'] = train_features['construction_year'].map(lambda x: np.nan if x == 0 else int(x))
test_features['year'] = test_features['construction_year'].map(lambda x: np.nan if x == 0 else int(x))
train_year_mean = int(train_features['year'].mean(skipna=True))
test_year_mean = int(test_features['year'].mean(skipna=True))
train_features['year'] = train_features['year'].fillna(train_year_mean)
test_features['year'] = test_features['year'].fillna(test_year_mean)
train_features['year'] = train_features['year'].astype(int)
test_features['year'] = test_features['year'].astype(int)
train_features['year'] = pd.to_datetime(train_features['year'], format='%Y')
test_features['year'] = pd.to_datetime(test_features['year'], format='%Y')
train_features.shape, test_features.shape

((59400, 41), (14358, 41))

In [78]:
now = pd.Timestamp.now()
now

Timestamp('2019-06-05 08:49:08.554160')

In [79]:
train_features['age'] = (now - train_features['year']).dt.days
test_features['age'] = (now - test_features['year']).dt.days
train_features.shape, test_features.shape

((59400, 42), (14358, 42))

In [80]:
def replace_zero_mean(df, training, feature):
    df[feature] = df[feature].map(lambda x: np.nan if x == 0 else x)
    df[feature].fillna(training[feature].mean(), inplace=True)

In [81]:
train_features['population'] = train_features['population'].map(lambda x: np.nan if x == 0 else x)
test_features['population'] = test_features['population'].map(lambda x: np.nan if x == 0 else x)
train_features['population'].fillna(train_features['population'].mean(skipna=True), inplace=True)
test_features['population'].fillna(test_features['population'].mean(skipna=True), inplace=True)
train_features.shape, test_features.shape

((59400, 42), (14358, 42))

In [82]:
replace_zero_mean(train_features, train_features, 'population')
replace_zero_mean(test_features, train_features, 'population')

In [83]:
train_features['date_recorded'] = pd.to_datetime(train_features['date_recorded'])
test_features['date_recorded'] = pd.to_datetime(test_features['date_recorded'])
train_features['since_recording'] = (now - train_features['date_recorded']).dt.days
test_features['since_recording'] = (now - test_features['date_recorded']).dt.days
train_features.shape, test_features.shape

((59400, 43), (14358, 43))

In [84]:
train_features['since_recording'].head()

0    3005
1    2282
2    2291
3    2319
4    2884
Name: since_recording, dtype: int64

In [85]:
train_features = train_features.drop(columns='construction_year')
test_features = test_features.drop(columns='construction_year')
train_features.shape, test_features.shape

((59400, 42), (14358, 42))

In [86]:
replace_zero_mean(train_features, train_features, 'latitude')
replace_zero_mean(test_features, train_features, 'latitude')
replace_zero_mean(train_features, train_features, 'longitude')
replace_zero_mean(test_features, train_features, 'longitude')

In [87]:
train_features['latitude_bins'] = pd.cut(train_features['latitude'], 20, labels=[str(x) for x in range(20)])
test_features['latitude_bins'] = pd.cut(test_features['latitude'], 20, labels=[str(x) for x in range(20)])

train_features['longitude_bins'] = pd.cut(train_features['longitude'], 20, labels=[str(x) for x in range(20)])
test_features['longitude_bins'] = pd.cut(test_features['longitude'], 20, labels=[str(x) for x in range(20)])
train_features['latitude_bins'].dtype

CategoricalDtype(categories=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
                  '12', '13', '14', '15', '16', '17', '18', '19'],
                 ordered=True)

In [88]:
replace_zero_mean(train_features, train_features, 'amount_tsh')
replace_zero_mean(test_features, train_features, 'amount_tsh')

In [89]:
replace_zero_mean(train_features, train_features, 'gps_height')
replace_zero_mean(test_features, train_features, 'gps_height')

In [90]:
pca_features = ['amount_tsh',
                'gps_height',
                'num_private',
                'region_code',
                'district_code',
                'population',
                'age']
pca_data = PCA(n_components=4).fit_transform(train_features[pca_features])

pca_test = PCA(n_components=4).fit_transform(test_features[pca_features])

train_features = train_features.drop(columns=pca_features)
test_features = test_features.drop(columns=pca_features)

for i in range(pca_data.shape[1]):
    train_features[f'pc{i}'] = pca_data[:,i]
    test_features[f'pc{i}'] = pca_test[:,i]
test_features.head()

Unnamed: 0,id,date_recorded,funder,installer,longitude,latitude,wpt_name,basin,subvillage,region,...,waterpoint_type,waterpoint_type_group,year,since_recording,latitude_bins,longitude_bins,pc0,pc1,pc2,pc3
0,50785,2013-02-04,Dmdd,DMDD,35.290799,-4.059696,Dinamu Secondary School,Internal,Magoma,Manyara,...,other,other,2012-01-01,2312,12,10,-5465.910555,86.177522,-982.96332,236.129194
1,51630,2013-02-04,Government Of Tanzania,DWE,36.656709,-3.309214,Kimnyak,Pangani,Kimnyak,Arusha,...,communal standpipe,communal standpipe,2000-01-01,2312,14,13,-1085.119667,19.170761,-551.084542,131.675126
2,17168,2013-02-01,,,34.767863,-5.004344,Puma Secondary,Internal,Msatu,Singida,...,other,other,2010-01-01,2315,11,9,-4738.526681,71.357465,-520.989872,314.45889
3,45559,2013-01-22,Finn Water,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,Ruvuma / Southern Coast,Kipindimbi,Lindi,...,other,other,1987-01-01,2325,3,15,3657.092297,-62.917491,728.887,-200.173634
4,49871,2013-03-27,Bruder,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,Ruvuma / Southern Coast,Losonga,Ruvuma,...,communal standpipe,communal standpipe,2000-01-01,2261,1,10,-1093.372501,-547.030269,-311.115368,-170.185416


In [96]:
cat_features = [
                'source_type',
#                 'quality_group',
                'extraction_type',
                'quantity_group',
#                 'management_group',
                'basin',
                'payment_type',
#                 'permit',
#                 'scheme_management'
                ]
# num_features = [
#                 'amount_tsh',
#                 'gps_height',
#                 'longitude',
#                 'latitude',
#                 'num_private',
#                 'region_code',
#                 'district_code',
#                 'population',
#                 'age',
# #                 'since_recording'
#                ]
num_features = ['pc0', 'pc1', 'pc2', 'pc3']
features = cat_features + num_features

train = train_features[features]
y_train = train_labels['status_group']

test = test_features[features]

X_train, X_val, y_train, y_val = tts(train,
                                     y_train,
                                     train_size=.75,
                                     test_size=.25,
                                     stratify=y_train,
                                     random_state=42)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape, test.shape)

encoder = ce.OneHotEncoder(use_cat_names=True)
X_train_enc = encoder.fit_transform(X_train)
X_val_enc = encoder.transform(X_val)
X_test_features = encoder.transform(test)
print(X_train_enc.shape, X_val_enc.shape, X_test_features.shape)


(44550, 9) (14850, 9) (44550,) (14850,) (14358, 9)
(44550, 55) (14850, 55) (14358, 55)


In [98]:
%%time
model = RFC(
            n_estimators=250,
            min_samples_leaf=5,
#             criterion='entropy',
            max_features=.9,
            min_samples_split=9,
            random_state=42,
            bootstrap=True
            )
# parameters = {'n_estimators': range(100, 251, 50),
#               'min_samples_leaf': range(3, 7),
#               'criterion': ('entropy', 'gini'),
#               'max_features': [x / 100 for x in range(85, 100, 5)],
#               'min_samples_split': range(5, 13),
#               'bootstrap': (True, False),
#               }
# clf = GSCV(model,
#            parameters, 
#            cv=3,
#            n_jobs=-1,
#            verbose=3,
#            )

# clf.fit(X_train_enc, y_train)
model.fit(X_train_enc, y_train)
print(model.score(X_val_enc, y_val))
# print(clf.best_estimator_, clf.best_score_, clf.best_params_)

0.7839057239057239
Wall time: 2min


In [93]:
# model.score(X_val_enc, y_val)

In [94]:
acc(train_labels['status_group'], train_labels['baseline'])

0.543080808080808

In [95]:
# predicted = model.predict(X_test_features)
# submission = sample_submission.copy()
# submission['status_group'] = predicted
# submission.to_csv('sub_7.csv', index=False)