In [18]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [19]:
# Read the training table from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer=r'pump_model_data.csv')

In [20]:
# Display the column labels of the freshly loaded training frame.
df.columns

Index(['funder', 'installer', 'scheme_management', 'extraction_type_group',
       'quality_group', 'source_type', 'quantity', 'payment', 'basin',
       'waterpoint_type', 'age_at_record', 'id', 'status_group',
       'public_meeting', 'permit'],
      dtype='object')

In [21]:
# The two drops below are intentionally disabled: 'permit' and 'public_meeting'
# stay in the frame and are one-hot encoded with the other categorical columns
# in the following cell.
#df.drop(columns='permit', inplace=True)
#df.drop(columns='public_meeting', inplace=True)
# Print the column labels to confirm that nothing has been removed.
print(df.columns)

Index(['funder', 'installer', 'scheme_management', 'extraction_type_group',
       'quality_group', 'source_type', 'quantity', 'payment', 'basin',
       'waterpoint_type', 'age_at_record', 'id', 'status_group',
       'public_meeting', 'permit'],
      dtype='object')


In [22]:
# Categorical columns to expand into one indicator column per category.
one_hot_columns = [
    'funder',
    'installer',
    'scheme_management',
    'extraction_type_group',
    'quality_group',
    'source_type',
    'quantity',
    'payment',
    'basin',
    'waterpoint_type',
    'public_meeting',
    'permit',
]

# Columns that are not listed (age_at_record, id, status_group) are carried over unchanged.
dum_df = pd.get_dummies(df, columns=one_hot_columns)

In [23]:
# Re-attach the raw target, then move 'status_group' and 'id' to the front so the
# final layout is: id, status_group, <all remaining columns>.
dum_df['status_group'] = df['status_group']

# Processing 'status_group' first and 'id' second leaves 'id' at position 0.
for front_col in ('status_group', 'id'):
    dum_df.insert(0, front_col, dum_df.pop(front_col))

print(dum_df.columns)

# Note: this list contains every column label, including 'id' and 'status_group'.
feature_names = list(dum_df.columns)

Index(['id', 'status_group', 'age_at_record', 'funder_Danida',
       'funder_Government Of Tanzania', 'funder_Hesawa', 'funder_Kkkt',
       'funder_Other', 'funder_Rwssp', 'funder_Unicef', 'funder_World Bank',
       'funder_World Vision', 'installer_Commu', 'installer_DANIDA',
       'installer_DWE', 'installer_Government', 'installer_Hesawa',
       'installer_KKKT', 'installer_Other', 'installer_RWE',
       'scheme_management_Company', 'scheme_management_Other',
       'scheme_management_Parastatal', 'scheme_management_Private operator',
       'scheme_management_VWC', 'scheme_management_WUA',
       'scheme_management_WUG', 'scheme_management_Water Board',
       'scheme_management_Water authority', 'extraction_type_group_afridev',
       'extraction_type_group_gravity', 'extraction_type_group_india mark ii',
       'extraction_type_group_india mark iii', 'extraction_type_group_mono',
       'extraction_type_group_nira/tanira', 'extraction_type_group_other',
       'extraction_t

In [24]:
# Swap the textual target for its integer category codes (int8, as shown in the output).
status_codes = dum_df['status_group'].astype('category').cat.codes
dum_df['status_group'] = status_codes
print(dum_df['status_group'])

0        0
1        0
2        0
3        2
4        0
        ..
59395    0
59396    0
59397    0
59398    0
59399    0
Name: status_group, Length: 59400, dtype: int8


In [25]:
# Encoded target column as a Series, plus a flat NumPy view of the same values.
labels = dum_df['status_group']
# A Series is already one-dimensional, so squeeze() was a no-op, and Series.ravel()
# is deprecated in recent pandas; to_numpy() returns the same 1-D array.
labels_array = labels.to_numpy()
print(labels_array)

# Maps the integer codes back to the original class names. The codes were assigned
# by astype('category'), which orders string categories lexically, hence 0/1/2 below.
dict_cats = {0: 'functional', 1: 'functional needs repair', 2: 'non functional'}


[0 0 0 ... 0 0 0]


In [26]:
# Positional layout of dum_df: column 0 is 'id', column 1 is the encoded target,
# everything from column 2 onwards is a model input.
X_data = dum_df.iloc[:, 2:]
y_data = dum_df.iloc[:, 1:2]  # kept as a one-column DataFrame

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

# One shape per line: X_train, y_train, X_test, y_test.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

# Estimators expect a flat 1-D target, so flatten the training labels.
y_train = y_train.to_numpy().ravel()

(47520, 87)
(47520, 1)
(11880, 87)
(11880, 1)


In [27]:
# Random-forest baseline.
# random_state pins the bootstrap samples and per-split feature draws so the score
# printed below is reproducible on Restart & Run All (42 matches the split seed).
# n_jobs=-1 only parallelises tree building across cores; it does not alter results.
forest_model = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)
forest_model.fit(X_train, y_train)

# Predicted class codes for the hold-out rows, then mean accuracy on the hold-out set.
print(forest_model.predict(X_test))
print(forest_model.score(X_test, y_test))

[2 0 0 ... 2 0 0]
0.783080808080808


In [28]:
!pip install xgboost




In [29]:
from xgboost import XGBClassifier

In [30]:
# XGBoost classifier created with no arguments, i.e. the library's default settings.
xg_model = XGBClassifier()

In [31]:
# Train on the same training split used for the random forest; y_train was already
# flattened to a 1-D array in the split cell.
xg_model.fit(X_train, y_train)

In [34]:
# Evaluate the XGBoost model on the hold-out split: predicted class codes first,
# then the score reported by the estimator.
xg_holdout_pred = xg_model.predict(X_test)
xg_holdout_score = xg_model.score(X_test, y_test)
print(xg_holdout_pred)
print(xg_holdout_score)

[2 0 0 ... 2 2 0]
0.7835016835016835


In [35]:
# Score the competition test set with the trained XGBoost model and write the
# submission file.
test_df = pd.read_csv(r'test_pump_model_data.csv')

# One-hot encode the same categorical columns that were encoded for training.
test_dum_df = pd.get_dummies(test_df, columns=['funder', 'installer', 'scheme_management', 'extraction_type_group',
                                               'quality_group', 'source_type', 'quantity', 'payment', 'basin',
                                               'waterpoint_type', 'public_meeting', 'permit'])
print(test_dum_df.columns)
print(len(test_dum_df))

# Add an empty placeholder target and move 'status_group' / 'id' to the front so the
# frame mirrors the training layout: id, status_group, <features...>.
test_dum_df['status_group'] = ''
coll = test_dum_df.pop('status_group')
test_dum_df.insert(0, 'status_group', coll)
col_to_reorder = test_dum_df.pop('id')
test_dum_df.insert(0, 'id', col_to_reorder)
print(test_dum_df.columns)

print(test_dum_df.id)
# Snapshot of the encoded test data (status_group is still the empty placeholder here).
test_dum_df.to_csv(r'test_data_for_ML_with_dummies.csv')

# Align the test features with the training features BY NAME instead of by position.
# get_dummies only creates columns for categories present in the file it is given, so
# the test frame can miss a column, gain an unseen one, or order them differently.
# reindex() puts the columns in the training order, fills indicator columns that are
# absent from the test data with 0, and drops columns the model was never trained on.
test_X_data = test_dum_df.reindex(columns=X_train.columns, fill_value=0)
print(len(test_dum_df))

# Predicted integer class codes, one per test row.
b = xg_model.predict(test_X_data)
print(len(b))
print(b)
test_dum_df.status_group = b
print(test_dum_df.status_group)

# Submission template; loaded here but not used further in this cell.
sub = pd.read_csv(r'SubmissionFormat.csv')

# Keep only the id and the prediction, then translate codes back to class names.
result_df = test_dum_df[['id', 'status_group']].copy()
print(result_df)
result_df.status_group = result_df.status_group.replace(dict_cats)
print(result_df)
result_df.to_csv(r'waterpump_submission.csv', index=False)

Index(['age_at_record', 'id', 'funder_Danida', 'funder_Government Of Tanzania',
       'funder_Hesawa', 'funder_Kkkt', 'funder_Other', 'funder_Rwssp',
       'funder_Unicef', 'funder_World Bank', 'funder_World Vision',
       'installer_Commu', 'installer_DANIDA', 'installer_DWE',
       'installer_Government', 'installer_Hesawa', 'installer_KKKT',
       'installer_Other', 'installer_RWE', 'scheme_management_Company',
       'scheme_management_Other', 'scheme_management_Parastatal',
       'scheme_management_Private operator', 'scheme_management_VWC',
       'scheme_management_WUA', 'scheme_management_WUG',
       'scheme_management_Water Board', 'scheme_management_Water authority',
       'extraction_type_group_afridev', 'extraction_type_group_gravity',
       'extraction_type_group_india mark ii',
       'extraction_type_group_india mark iii', 'extraction_type_group_mono',
       'extraction_type_group_nira/tanira', 'extraction_type_group_other',
       'extraction_type_group_other 