In [215]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score as acc
from sklearn.preprocessing import StandardScaler as ss, RobustScaler as rs
from sklearn.model_selection import train_test_split as tts, GridSearchCV as GSCV
from sklearn.linear_model import LogisticRegression as LR
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier as RFC, GradientBoostingClassifier as GBC
import category_encoders as ce
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [216]:
train_features = pd.read_csv('https://drive.google.com/uc?export=download&id=14ULvX0uOgftTB2s97uS8lIx1nHGQIB0P')
train_labels = pd.read_csv('https://drive.google.com/uc?export=download&id=1r441wLr7gKGHGLyPpKauvCuUOU556S2f')
test_features = pd.read_csv('https://drive.google.com/uc?export=download&id=1wvsYl9hbRbZuIuoaLWCsW_kbcxCdocHz')
sample_submission = pd.read_csv('https://drive.google.com/uc?export=download&id=1kfJewnmhowpUo381oSn3XqsQ6Eto23XV')
train_features.shape, train_labels.shape, test_features.shape, sample_submission.shape

((59400, 40), (59400, 2), (14358, 40), (14358, 2))

In [217]:
train_labels['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [218]:
train_labels['baseline'] = 'functional'

In [219]:
train_features.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [220]:
train_features['year'] = train_features['construction_year'].map(lambda x: np.nan if x == 0 else int(x))
test_features['year'] = test_features['construction_year'].map(lambda x: np.nan if x == 0 else int(x))
train_year_mean = int(train_features['year'].mean(skipna=True))
test_year_mean = int(test_features['year'].mean(skipna=True))
train_features['year'] = train_features['year'].fillna(train_year_mean)
test_features['year'] = test_features['year'].fillna(test_year_mean)
train_features['year'] = train_features['year'].astype(int)
test_features['year'] = test_features['year'].astype(int)
train_features['year'] = pd.to_datetime(train_features['year'], format='%Y')
test_features['year'] = pd.to_datetime(test_features['year'], format='%Y')
train_features.shape, test_features.shape

((59400, 41), (14358, 41))

In [221]:
now = pd.Timestamp.now()
now

Timestamp('2019-06-05 13:02:52.086935')

In [222]:
train_features['age'] = (now - train_features['year']).dt.days
test_features['age'] = (now - test_features['year']).dt.days
train_features.shape, test_features.shape

((59400, 42), (14358, 42))

In [223]:
def replace_zero_mean(df, training, feature):
    df[feature] = df[feature].map(lambda x: np.nan if x == 0 else x)
    df[feature].fillna(training[feature].mean(), inplace=True)

In [224]:
train_features['population'] = train_features['population'].map(lambda x: np.nan if x == 0 else x)
test_features['population'] = test_features['population'].map(lambda x: np.nan if x == 0 else x)
train_features['population'].fillna(train_features['population'].mean(skipna=True), inplace=True)
test_features['population'].fillna(test_features['population'].mean(skipna=True), inplace=True)
train_features.shape, test_features.shape

((59400, 42), (14358, 42))

In [225]:
replace_zero_mean(train_features, train_features, 'population')
replace_zero_mean(test_features, train_features, 'population')

In [226]:
train_features['date_recorded'] = pd.to_datetime(train_features['date_recorded'])
test_features['date_recorded'] = pd.to_datetime(test_features['date_recorded'])
train_features['since_recording'] = (now - train_features['date_recorded']).dt.days
test_features['since_recording'] = (now - test_features['date_recorded']).dt.days
train_features.shape, test_features.shape

((59400, 43), (14358, 43))

In [227]:
train_features['since_recording'].head()

0    3005
1    2282
2    2291
3    2319
4    2884
Name: since_recording, dtype: int64

In [228]:
train_features = train_features.drop(columns='construction_year')
test_features = test_features.drop(columns='construction_year')
train_features.shape, test_features.shape

((59400, 42), (14358, 42))

In [229]:
replace_zero_mean(train_features, train_features, 'latitude')
replace_zero_mean(test_features, train_features, 'latitude')
replace_zero_mean(train_features, train_features, 'longitude')
replace_zero_mean(test_features, train_features, 'longitude')

In [230]:
# train_features['latitude_bins'] = pd.cut(train_features['latitude'], 20, labels=[str(x) for x in range(20)])
# test_features['latitude_bins'] = pd.cut(test_features['latitude'], 20, labels=[str(x) for x in range(20)])

# train_features['longitude_bins'] = pd.cut(train_features['longitude'], 20, labels=[str(x) for x in range(20)])
# test_features['longitude_bins'] = pd.cut(test_features['longitude'], 20, labels=[str(x) for x in range(20)])
# train_features['latitude_bins'].dtype

In [231]:
replace_zero_mean(train_features, train_features, 'amount_tsh')
replace_zero_mean(test_features, train_features, 'amount_tsh')

In [232]:
# replace_zero_mean(train_features, train_features, 'gps_height')
# replace_zero_mean(test_features, train_features, 'gps_height')

In [233]:
train_features = train_features.drop(columns=['date_recorded', 'year'])
test_features = test_features.drop(columns=['date_recorded', 'year'])

In [234]:
# pca_features = ['amount_tsh',
#                 'gps_height',
#                 'num_private',
#                 'region_code',
#                 'district_code',
#                 'population',
#                 'age']
# pca_data = PCA(n_components=4).fit_transform(train_features[pca_features])

# pca_test = PCA(n_components=4).fit_transform(test_features[pca_features])

# train_features = train_features.drop(columns=pca_features)
# test_features = test_features.drop(columns=pca_features)

# for i in range(pca_data.shape[1]):
#     train_features[f'pc{i}'] = pca_data[:,i]
#     test_features[f'pc{i}'] = pca_test[:,i]
# test_features.head()

In [235]:
cat_features = [
                'source_type',
                'quality_group',
                'extraction_type',
                'quantity_group',
                'management_group',
                'basin',
                'payment_type',
                'permit',
                'installer'
#                 'scheme_management'
                ]
num_features = [
                'amount_tsh',
                'gps_height',
                'longitude',
                'latitude',
                'num_private',
                'region_code',
                'district_code',
                'population',
                'age',
#                 'since_recording'
               ]
# num_features = ['pc0', 'pc1', 'pc2', 'pc3']
# features = cat_features + num_features

train = train_features
y_train = train_labels['status_group']

test = test_features

X_train, X_val, y_train, y_val = tts(train,
                                     y_train,
                                     train_size=.7,
                                     test_size=.3,
                                     stratify=y_train,
                                     random_state=42)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape, test.shape)

encoder = ce.OrdinalEncoder()
X_train_enc = encoder.fit_transform(X_train)
X_val_enc = encoder.transform(X_val)
X_test_features = encoder.transform(test)
print(X_train_enc.shape, X_val_enc.shape, X_test_features.shape)


(41580, 40) (17820, 40) (41580,) (17820,) (14358, 40)
(41580, 40) (17820, 40) (14358, 40)


In [214]:
%%time
model = RFC(n_jobs=-1,
            n_estimators=250,
            min_samples_leaf=3,
#             min_samples_split=.6,
            max_features=.75,
            criterion='entropy',
#             random_state=42,
            bootstrap=True,
            max_depth=24,
#             class_weight='balanced',
            verbose=1,
            )
# parameters = {'n_estimators': range(100, 251, 50),
#               'min_samples_leaf': range(3, 7),
#               'criterion': ('entropy', 'gini'),
#               'max_features': [x / 100 for x in range(85, 100, 5)],
#               'min_samples_split': range(5, 13),
#               'bootstrap': (True, False),
#               }
# clf = GSCV(model,
#            parameters, 
#            cv=3,
#            n_jobs=-1,
#            verbose=3,
#            )

# clf.fit(X_train_enc, y_train)
model.fit(X_train_enc, y_train)
print(model.score(X_val_enc, y_val))
# print(clf.best_estimator_, clf.best_score_, clf.best_params_)

TypeError: float() argument must be a string or a number, not 'Timestamp'

In [93]:
# model.score(X_val_enc, y_val)

In [94]:
acc(train_labels['status_group'], train_labels['baseline'])

0.543080808080808

In [95]:
# predicted = model.predict(X_test_features)
# submission = sample_submission.copy()
# submission['status_group'] = predicted
# submission.to_csv('sub_7.csv', index=False)