# Train on All Data

## Load Items

In [1]:
import sys
sys.path.insert(0, '../preprocess')

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)

import importlib
import output_visualization as ov
importlib.reload(ov)
import feature_engineering as fe
import model_helper_funcs as mhf
importlib.reload(fe)

import xgboost as xgb

In [2]:
# Load the complete labelled training set; valid_size=0.0 means we train on
# all the data. valid_X / valid_y are unpacked only to match the return
# arity -- they are not referenced by any cell shown in this notebook.
train_X, valid_X, train_y, valid_y = fe.get_data(
    '../data/train.csv', '../data/train_labels.csv', valid_size=0.0
)

memory used before preprocess:  19.00808

date time size before: 3.9798799999999996
date time size after:  0.47528 

converting funder                         size:  3.86	->	 0.33
converting installer                      size:  3.64	->	 0.34
converting basin                          size:  4.03	->	 0.06
converting subvillage                     size:  3.85	->	 2.03
converting region                         size:  3.78	->	 0.06
converting lga                            size:  3.83	->	 0.07
converting ward                           size:  3.83	->	 0.34
converting public_meeting                 size:  2.1	->	 0.06
converting recorded_by                    size:  4.75	->	 0.06
converting scheme_management              size:  3.55	->	 0.06
converting scheme_name                    size:  3.15	->	 0.4
converting permit                         size:  2.06	->	 0.06
converting extraction_type                size:  3.84	->	 0.06
converting extraction_type_group          size:  3.85	->	 0.06
con

In [3]:
# Fit the cleaning transformer on the training frame. The same `trans`
# instance is reused (transform only) on the test set further down.
trans = fe.DataCleaning()
transformed_train_X = trans.fit_transform(train_X)

# The cleaned frame carries the target column: pop() returns it as train_y
# (replacing the train_y unpacked from fe.get_data) and removes it from the
# feature frame in place.
train_y = transformed_train_X.pop('status_group')

# Feature column labels, used later to rebuild DataFrames from encoded arrays.
col_names = transformed_train_X.columns

In [4]:
# Preview the cleaned feature frame; 'status_group' has already been split
# off into train_y by the previous cell.
transformed_train_X.head()
# (use transformed_train_X.shape instead to check the row / column counts)

Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,region,district_code,population,public_meeting,scheme_management,permit,construction_year,extraction_type_class,management,payment_type,quality_group,quantity_group,source,waterpoint_type_group
2980,0.0,3648,Rural Water Supply And Sanitat,0,DWE,31.985658,-3.59636,Shinyanga,5,0,True,WUG,True,40.0,other,wug,unknown,good,dry,shallow well,other
5246,0.0,3673,OTHER,0,OTHER,32.832815,-4.944937,Tabora,6,0,True,VWC,True,40.0,handpump,vwc,never pay,milky,insufficient,shallow well,hand pump
22659,2.397895,3787,OTHER,1675,DWE,35.488289,-4.242048,Manyara,1,148,True,Water Board,True,48.0,gravity,water board,per bucket,good,insufficient,spring,communal standpipe
39888,0.0,3225,Kkkt,0,KKKT,33.140828,-9.059386,Mbeya,6,0,False,VWC,False,40.0,handpump,vwc,never pay,good,seasonal,shallow well,hand pump
13361,3.931826,3770,Wateraid,1109,SEMA,34.217077,-4.430529,Singida,1,235,True,WUA,True,51.0,motorpump,wua,per bucket,good,enough,machine dbh,communal standpipe


In [5]:
# "numeric" categorical encoding. The encoded output is wrapped straight back
# into a DataFrame so the columns keep their names; this is the matrix the
# XGBoost model below is fitted on. pipeline_ord is kept to encode the test set.
train_X_numeric, pipeline_ord = fe.transform_df(
    transformed_train_X, cat_encode_type="numeric"
)
train_X_numeric = pd.DataFrame(train_X_numeric, columns=col_names)

# "one hot" encoding of the same cleaned frame, with its own pipeline.
train_X_1hot, pipeline_1hot = fe.transform_df(
    transformed_train_X, cat_encode_type="one hot"
)

# Display the one-hot matrix dimensions.
train_X_1hot.shape

(59400, 292)

In [6]:
# Dimensions of the numerically encoded training frame (one column per entry
# in col_names, since it was rebuilt with columns=col_names).
train_X_numeric.shape

(59400, 21)

In [7]:
# Encode the status_group labels for training. The returned encoder is kept
# because its inverse_transform is used in "Final Results" to turn the model's
# predictions back into the original label strings.
y_encoder_num, train_y_num = fe.encode_labels(train_y, convert_type="numeric")

## XGB

In [8]:
%%time
# Best Model

xgc = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=8,
    base_score=0.5,
    objective='multi:softprob', 
    random_state=42
)
xgc.fit(train_X_numeric, train_y_num)
print (xgc)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=8, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=42,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
CPU times: user 4min 56s, sys: 2.57 s, total: 4min 59s
Wall time: 5min 15s


## Final Test Data (Unlabeled)

In [9]:
# Load the unlabeled test set through the project loader rather than a plain
# pd.read_csv. As the preview below shows, the returned frame includes an
# empty 'status_group' column, which is dropped again after cleaning.
test = fe.get_test('../data/test.csv')

memory used before preprocess:  4.752079999999999

date time size before: 0.99503
date time size after:  0.11888 

converting funder                         size:  0.96	->	 0.14
converting installer                      size:  0.91	->	 0.14
converting basin                          size:  1.01	->	 0.02
converting subvillage                     size:  0.96	->	 0.91
converting region                         size:  0.95	->	 0.02
converting lga                            size:  0.96	->	 0.03
converting ward                           size:  0.96	->	 0.24
converting public_meeting                 size:  0.53	->	 0.02
converting recorded_by                    size:  1.19	->	 0.02
converting scheme_management              size:  0.89	->	 0.02
converting scheme_name                    size:  0.78	->	 0.24
converting permit                         size:  0.51	->	 0.02
converting extraction_type                size:  0.96	->	 0.02
converting extraction_type_group          size:  0.96	->	 0.02
con

In [10]:
# Preview the raw test frame before any cleaning is applied.
test.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,Internal,Magoma,Manyara,21,3,Mbulu,Bashay,321,True,GeoData Consultants Ltd,Parastatal,,True,2012,other,other,other,parastatal,parastatal,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other,
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,Pangani,Kimnyak,Arusha,2,2,Arusha Rural,Kimnyaki,300,True,GeoData Consultants Ltd,VWC,TPRI pipe line,True,2000,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe,
2,17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,Internal,Msatu,Singida,13,2,Singida Rural,Puma,500,True,GeoData Consultants Ltd,VWC,P,,2010,other,other,other,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other,
3,45559,0.0,2013-01-22,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,Ruvuma / Southern Coast,Kipindimbi,Lindi,80,43,Liwale,Mkutano,250,,GeoData Consultants Ltd,VWC,,True,1987,other,other,other,vwc,user-group,unknown,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other,
4,49871,500.0,2013-03-27,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,Ruvuma / Southern Coast,Losonga,Ruvuma,10,3,Mbinga,Mbinga Urban,60,,GeoData Consultants Ltd,Water Board,BRUDER,True,2000,gravity,gravity,gravity,water board,user-group,pay monthly,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,


In [11]:
# Apply the cleaner that was fitted on the training data (transform only,
# no re-fit) to the test frame.
transformed_test = trans.transform(test)

# Remove the placeholder target column in place so the feature columns line
# up with the training features.
del transformed_test['status_group']

transformed_test.head()

Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,region,district_code,population,public_meeting,scheme_management,permit,construction_year,extraction_type_class,management,payment_type,quality_group,quantity_group,source,waterpoint_type_group
0,0.0,4333,OTHER,1996,OTHER,35.290799,-4.059696,Manyara,3,321,True,Parastatal,True,52.0,other,parastatal,never pay,good,seasonal,rainwater harvesting,other
1,0.0,4333,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Arusha,2,300,True,VWC,True,40.0,gravity,vwc,never pay,good,insufficient,spring,communal standpipe
2,0.0,4330,OTHER,1567,OTHER,34.767863,-5.004344,Singida,2,500,True,VWC,True,50.0,other,vwc,never pay,good,insufficient,rainwater harvesting,other
3,0.0,4320,OTHER,267,OTHER,38.058046,-9.418672,Lindi,43,250,True,VWC,True,27.0,other,vwc,unknown,good,dry,shallow well,other
4,6.216606,4384,OTHER,1260,OTHER,35.006123,-10.950412,Ruvuma,3,60,True,Water Board,True,40.0,gravity,water board,monthly,good,enough,spring,communal standpipe


In [12]:
# Encode the cleaned test features with the pipelines returned by
# fe.transform_df for the training frame.
#
# The numeric pipeline is applied exactly once (the original cell ran the same
# transform twice and discarded the first result) and the output is wrapped in
# a DataFrame with the training column labels, mirroring train_X_numeric.
test_numeric = pd.DataFrame(
    data=pipeline_ord.transform(transformed_test),
    columns=col_names,
)

# One-hot encoding of the test set; only its shape is inspected here.
test_1hot = pipeline_1hot.transform(transformed_test)
test_1hot.shape

(14850, 292)

## Test Prediction

In [13]:
# Predict a class for every test row with the fitted XGBoost model. The
# predictions are in the encoded label space and are decoded with
# y_encoder_num.inverse_transform in "Final Results".
y_pred = xgc.predict(test_numeric)

## Submission Format

In [15]:
# Load the provided submission template to check the expected layout
# (columns and id order) before building our own results frame.
submission_ex = pd.read_csv('../data/SubmissionFormat.csv')
submission_ex.head()

Unnamed: 0,id,status_group
0,50785,predicted label
1,51630,predicted label
2,17168,predicted label
3,45559,predicted label
4,49871,predicted label


## Final Results

In [61]:
# Decode the predictions back to the original status_group strings.
cat_pred = y_encoder_num.inverse_transform(y_pred)

# Pair each prediction with the id from the raw test frame, using the same
# two columns as SubmissionFormat.csv.
# NOTE(review): this assumes trans.transform / the encoding pipeline kept the
# test rows in their original order and count -- confirm in feature_engineering.
final_results = pd.DataFrame(
    dict(id=test['id'].values, status_group=cat_pred)
)

In [62]:
# final_results = pd.to_numeric(final_results)
# final_results = final_results.astype('category')

In [63]:
# Eyeball the first rows: ids should match the submission template and
# status_group should contain label strings, not encoded integers.
final_results.head()

Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [64]:
# Write the predictions to disk; index=False keeps the row index out of the
# file so it contains only the id and status_group columns.
final_results.to_csv('../data/test_predictions.csv', index=False)

In [43]:
# final_results.dtypes

id              category
status_group    category
dtype: object

In [48]:
# y_encoder_num.classes_

array(['functional', 'functional needs repair', 'non functional'],
      dtype=object)

In [58]:
# # from sklearn.preprocessing.LabelEncoder import inverse_transform
# from sklearn.preprocessing import LabelEncoder
# # LabelEncoder.inverse_transform (final_results['status_group'])
# y_encoder_num.inverse_transform (final_results['status_group'])

array(['functional', 'functional', 'functional', ..., 'functional',
       'functional', 'non functional'], dtype=object)

In [57]:
# type (final_results['status_group'])

pandas.core.series.Series