In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score as acc
from sklearn.preprocessing import StandardScaler as ss, RobustScaler as rs
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression as LR
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier as RFC, GradientBoostingClassifier as GBC, ExtraTreesClassifier as ETC
from sklearn.neural_network import MLPClassifier as MLP
import category_encoders as ce
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
# Download the four competition CSVs in one pass; the order of the URLs
# matches the order of the names on the left-hand side.
train_features, train_labels, test_features, sample_submission = (
    pd.read_csv(url)
    for url in (
        'https://drive.google.com/uc?export=download&id=14ULvX0uOgftTB2s97uS8lIx1nHGQIB0P',
        'https://drive.google.com/uc?export=download&id=1r441wLr7gKGHGLyPpKauvCuUOU556S2f',
        'https://drive.google.com/uc?export=download&id=1wvsYl9hbRbZuIuoaLWCsW_kbcxCdocHz',
        'https://drive.google.com/uc?export=download&id=1kfJewnmhowpUo381oSn3XqsQ6Eto23XV',
    )
)
# Show the shape of every frame as a sanity check.
tuple(frame.shape
      for frame in (train_features, train_labels, test_features, sample_submission))

((59400, 40), (59400, 2), (14358, 40), (14358, 2))

In [3]:
# Columns handed to the imputation step: the row identifier followed by the
# six numeric measurements.
num_features = [
    'id',
    'latitude', 'longitude',
    'population', 'construction_year',
    'amount_tsh', 'gps_height',
]

In [4]:
# Pull the numeric subset (plus `id`) out of the train and test frames.
num_data, num_test = (frame[num_features]
                      for frame in (train_features, test_features))
num_data.head()

Unnamed: 0,id,latitude,longitude,population,construction_year,amount_tsh,gps_height
0,69572,-9.856322,34.938093,109,1999,6000.0,1390
1,8776,-2.147466,34.698766,280,2010,0.0,1399
2,34310,-3.821329,37.460664,250,2009,25.0,686
3,67743,-11.155298,38.486161,58,1986,0.0,263
4,19728,-1.825359,31.130847,0,0,0.0,0


In [5]:
# Move `id` into the index so it is carried along but is not one of the
# value columns of either frame.
num_data, num_test = (frame.set_index('id') for frame in (num_data, num_test))
num_data.head()

Unnamed: 0_level_0,latitude,longitude,population,construction_year,amount_tsh,gps_height
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
69572,-9.856322,34.938093,109,1999,6000.0,1390
8776,-2.147466,34.698766,280,2010,0.0,1399
34310,-3.821329,37.460664,250,2009,25.0,686
67743,-11.155298,38.486161,58,1986,0.0,263
19728,-1.825359,31.130847,0,0,0.0,0


In [6]:
# Impute the numeric columns, treating the value 0 as the missing marker.
# Both `sample_posterior=True` and `imputation_order='random'` are
# stochastic, so `random_state` is pinned (to the same seed used for the
# train/validation split and the model) to make the imputed values, and
# therefore every downstream score, reproducible across runs.
imputer = IterativeImputer(missing_values=0,
                           initial_strategy='most_frequent',
                           imputation_order='random',
                           sample_posterior=True,
                           random_state=42)
# Fit on the training rows only, then apply the fitted imputer to the test
# rows so no test information leaks into the imputation model.
num_data = imputer.fit_transform(num_data)
num_test = imputer.transform(num_test)

In [7]:
# Remove the raw numeric columns from both frames; their imputed versions
# are joined back in by `id` further down.
raw_numeric_columns = ['latitude', 'longitude', 'population',
                       'construction_year', 'amount_tsh', 'gps_height']
train_features = train_features.drop(columns=raw_numeric_columns)
test_features = test_features.drop(columns=raw_numeric_columns)

In [8]:
# Wrap the imputed arrays back into DataFrames. The column order is the
# order of the numeric columns that were fed to the imputer (`id` was the
# index, so it is not among them).
imputed_columns = ['latitude', 'longitude', 'population',
                   'construction_year', 'amount_tsh', 'gps_height']
data = pd.DataFrame(num_data, columns=imputed_columns)
# FIX: the test frame must be built from the imputed *test* array. It was
# previously built from `num_data`, which silently gave the test rows the
# training set's values (and the training set's row count).
test = pd.DataFrame(num_test, columns=imputed_columns)

In [9]:
# Re-attach the row identifiers (aligned on the row index) so the imputed
# values can be joined back onto the feature frames.
data = data.assign(id=train_features['id'])
test = test.assign(id=test_features['id'])

In [10]:
# Join the imputed numeric columns back onto the feature frames. No key is
# given, so pandas joins on the column names the two frames share.
train_features = pd.merge(train_features, data)
test_features = pd.merge(test_features, test)

In [11]:
# Cast the imputed construction year to an integer in both frames.
for frame in (train_features, test_features):
    frame['construction_year'] = frame['construction_year'].astype(int)

In [12]:
def _add_age(frame, reference):
    """Return `frame` with `construction_year` replaced by an `age` column.

    `construction_year` is parsed with the `%Y` format, and `age` is the
    whole number of days between that date and `reference`.
    """
    built = pd.to_datetime(frame['construction_year'], format='%Y')
    return (frame
            .assign(construction_year=built,
                    age=(reference - built).dt.days)
            .drop(columns='construction_year'))


# One shared reference timestamp so train and test ages are comparable.
now = pd.Timestamp.now()
train_features = _add_age(train_features, now)
test_features = _add_age(test_features, now)

In [13]:
# Categorical model inputs. Candidates currently left out:
# quality_group, management_group, permit, scheme_management.
cat_features = ['source_type', 'extraction_type', 'quantity_group',
                'basin', 'payment_type']

# Numeric model inputs.
num_features = ['amount_tsh', 'gps_height', 'longitude', 'latitude',
                'num_private', 'region_code', 'district_code',
                'population', 'age']

# Full input column list: categorical columns first, then numeric.
features = [*cat_features, *num_features]

# Restrict both frames to the modelling columns and pick out the target.
train = train_features[features]
test = test_features[features]
y_train = train_labels['status_group']

# Stratified 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = tts(
    train, y_train,
    train_size=.8, test_size=.2,
    stratify=y_train,
    random_state=42,
)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape, test.shape)

# Fit the one-hot encoder on the training split only, then reuse the fitted
# encoder for the validation split and the test set.
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train_enc = encoder.fit_transform(X_train)
X_val_enc, X_test_features = (encoder.transform(part) for part in (X_val, test))
print(X_train_enc.shape, X_val_enc.shape, X_test_features.shape)


(47520, 14) (11880, 14) (47520,) (11880,) (14358, 14)
(47520, 60) (11880, 60) (14358, 60)


In [14]:
%%time
model = RFC(n_estimators=250,
            min_samples_leaf=5,
            criterion='entropy',
            max_features=.9,
            min_samples_split=9,
            random_state=42,
            bootstrap=True
            )

model.fit(X_train_enc, y_train)

print(model.score(X_val_enc, y_val))

0.7968855218855219

In [15]:
# Submission export (currently disabled). Uncomment the lines below to
# predict on the encoded test set, place the predictions in the
# `status_group` column of a copy of the sample submission, and write the
# result to 'sub_4.csv'.
# predicted = model.predict(X_test_features)
# submission = sample_submission.copy()
# submission['status_group'] = predicted
# submission.to_csv('sub_4.csv', index=False)