In [1]:
# Feature normalisation
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')
import pickle
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the pickled model object from disk (per the file name, a random-forest classifier).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load an artifact that comes from a trusted source.
with open("RandomForestClassifier.pickle", mode="rb") as model_file:
    random_forest_classifier = pickle.load(model_file)

In [3]:
# Load the CSV file into the working DataFrame that every later cell transforms.
test_df = pd.read_csv("Test_Set_Values.csv")

In [4]:
# Keep the 'id' column aside before the frame is transformed; it becomes the
# 'ID' column of the output frame built in the last cell.
ID_data = test_df['id']

In [5]:
# Build '<coordinate>_imputation' columns: the sentinel value of each coordinate
# is treated as missing and replaced by the mean of the remaining values that
# share the row's 'region'. The original 'longitude'/'latitude' columns are left as-is.
for coordinate, sentinel in (('longitude', 0), ('latitude', -2.000000e-08)):
    masked = test_df[coordinate].replace(sentinel, np.nan)
    regional_mean = masked.groupby(test_df['region']).transform('mean')
    test_df[coordinate + '_imputation'] = masked.fillna(regional_mean)

In [6]:
# Derive 'age' as (year of 'date_recorded') - 'construction_year'.
# Any value that is not strictly below 100 (including NaN) is replaced by -1.
# 'recorded_year' is only a temporary column and is dropped again at the end.
test_df = (
    test_df
    .assign(recorded_year=pd.DatetimeIndex(test_df['date_recorded']).year)
    .assign(age=lambda frame: frame['recorded_year'] - frame['construction_year'])
    .assign(age=lambda frame: frame['age'].where(frame['age'] < 100, -1))
    .drop(columns='recorded_year')
)

In [7]:
# Canonical funder name -> spellings/variants that are collapsed into it.
# The entries are applied one after another, in the order written here.
funder_aliases = {
    'Kkkt': ['Kkkt_makwale', 'Kkkt Church', 'Kkkt-dioces Ya Pare', 'Kkkt Dme', 'Kkkt Ndrumangeni',
             'Kkkt Canal', 'Kkkt Mareu', 'Kkkt Usa', 'Kkkt Leguruki'],
    'Hesawa': ['Hhesawa'],
    'Rwssp': ['Nrwssp', 'Rwssp Shinyanga', 'Drwssp'],
    'Tasaf': ['Tasafu', 'Tasaf 1', 'Tasaf Ii'],
    'District Council': ['Kilindi District Co', 'Songea District Council', 'Sengerema District Council',
                         'Mbozi District Council', 'Sangea District Council', 'Cdtfdistrict Council',
                         'Mkinga  Distric Cou'],
    'Dhv': ['Dhv Moro'],
    'Germany': ['Germany Republi', 'Germany'],
    'Tcrs': ['Tcrs Kibondo', 'Tcrst'],
    'Netherlands': ['Netherland'],
    'Lga': ['Ruangwa Lga'],
    'Amref': ['Amrefe'],
    'Oxfam': ['Oxfam Gb'],
    'Unicef': ['Unicet'],
    'RC': ['Rc Church', 'Rc', 'Rc Ch', 'Rc Churc', 'Rc Mission', 'Rc Cathoric', 'Rc Njoro', 'Rc Mofu',
           'Rc Mi', 'Rc Missionary', 'Rc Missi', 'Rc Msufi'],
    'Mission': ['Missionaries', 'Missionary', 'Neemia Mission', 'German Missionary', 'Germany Missionary',
                'Cpps Mission', 'Heri Mission'],
    'Private': ['Private Owned', 'Private Institutions', 'Private Co', 'Private Person', 'Private Individul'],
    'Roman': ['Roman Catholic', 'Roman Cathoric-same', 'Roman Cathoric Same', 'Roman Church',
              'Roman Catholic Rulenge Diocese', 'Roman Cathoric -kilomeni', 'Roman Ca', 'Roman Cathoric',
              'Roman Cathoric Church'],
    'Rural Water Supply': ['Rural Water Supply And Sanitat', 'Rural Water Supply And Sanita',
                           'Rural Water Supply', 'Rural Water Department'],
    'Ces(gmbh)': ['Ces (gmbh)'],
    'WSDP': ['Dwsdp'],
    'Finw': ['Finwater'],
    'Plan Int': ['Plan International', 'Plan Internatio'],
    'Oikos': ['Oikos E.Afrika'],
    'Concern': ['Concern World Wide'],
    'African': ['African Development Bank', 'African Relie', 'African Muslim Agency',
                'African Development Foundation', 'African Barrick Gold', 'African Realief Committe Of Ku',
                'African 2000 Network', 'African Reflections Foundation'],
    'Snv': ['Snv Ltd', 'Snv-swash'],
    'Villagers': ['Villagers Mpi'],
    'Halmashauri': ['Halmashauri Ya Wilaya Sikonge', 'Halmashauri Ya Manispa Tabora', 'Halmashauri Ya Wilaya',
                    'Halmashauri Wil', 'Gra Na Halmashauri'],
}

# Rewrite every listed variant in 'funder' to its canonical name; values that
# appear in no list are left untouched.
funder_normalised = test_df['funder']
for canonical_name, variants in funder_aliases.items():
    funder_normalised = funder_normalised.replace(variants, canonical_name)
test_df['funder'] = funder_normalised

In [8]:
# (funder value, group label) pairs, checked in this order by label_funder.
_FUNDER_GROUP_LABELS = (
    ('Government Of Tanzania', "GOV"),
    ('Unknown', "Unknown"),
    ('Danida', "DANIDA"),
    ('Hesawa', "HESAWA"),
    ('Rwssp', "RWSSP"),
    ('World Bank', "WORLD BANK"),
    ('Kkkt', "ELCT"),
    ('World Vision', "WORLD VISION"),
    ('Unicef', "UNICEF"),
    ('District Council', "DISTRICT COUNCIL"),
)


def label_funder(row):
    """Return the funder group label for one record.

    Parameters
    ----------
    row : mapping-like
        Any object supporting ``row['funder']`` (e.g. a DataFrame row).

    Returns
    -------
    str
        The label paired with the funder value in the lookup table, or
        ``"OTHER"`` when the value matches none of the listed funders
        (matching is exact and case-sensitive).
    """
    funder_name = row['funder']
    for known_funder, group_label in _FUNDER_GROUP_LABELS:
        if funder_name == known_funder:
            return group_label
    return "OTHER"

# Label every row with its funder group (label_funder is called once per row).
test_df['funder_grouped'] = test_df.apply(label_funder, axis=1)

In [9]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    A fresh ``LabelEncoder`` is fitted on each column every time ``transform``
    runs, so the integer codes are derived from the values present in the
    frame being transformed; nothing is remembered between calls.

    NOTE(review): because no fitted mapping is stored, codes produced here are
    not guaranteed to match codes produced for another dataset (e.g. the data
    the model was trained on) - confirm the category sets are identical.
    """

    def __init__(self, columns=None):
        """Store the column names to encode; ``None`` means every column."""
        self.columns = columns  # array of column names to encode

    def fit(self, X, y=None):
        """No-op kept for scikit-learn style chaining; returns ``self``."""
        return self  # encoders are fitted inside transform()

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Columns listed in ``self.columns`` are encoded with ``LabelEncoder()``;
        if ``self.columns`` is ``None`` every column of ``X`` is encoded.
        ``X`` itself is not modified.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # supported equivalent. Materialise the pairs before assigning.
            for colname, col in list(output.items()):
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        return self.fit(X, y).transform(X)

In [10]:
# Columns whose values are replaced by integer codes.
columns_to_encode = [
    'quantity',
    'extraction_type_class',
    'waterpoint_type_group',
    'payment_type',
    'source_type',
    'funder_grouped',
]
column_encoder = MultiColumnLabelEncoder(columns=columns_to_encode)
test_df = column_encoder.fit_transform(test_df)

In [11]:
# Columns passed on to scaling and prediction, in this exact order.
feature_columns = [
    'longitude_imputation',
    'latitude_imputation',
    'quantity',
    'extraction_type_class',
    'gps_height',
    'age',
    'waterpoint_type_group',
    'population',
    'payment_type',
    'source_type',
    'amount_tsh',
    'funder_grouped',
]
feature_selected_df = test_df[feature_columns]

In [12]:
# Standardise the selected features with a scaler fitted on this same frame.
# NOTE(review): the scaler's statistics come from the prediction data itself.
# If the classifier was trained on features scaled by a different fitted
# scaler, these inputs will not line up with it - confirm, and reuse the
# training-time scaler if one was saved.
scaler = StandardScaler().fit(feature_selected_df)
Xs_predict = scaler.transform(feature_selected_df)

In [13]:
# Run the loaded model on the scaled feature matrix; the result is used as the
# 'status_group' column of the output frame.
testing_predictions = random_forest_classifier.predict(Xs_predict)

In [15]:
# Numeric prediction code -> text label written to the output file.
status_labels = {0: "functional", 1: "functional needs repair", 2: "non functional"}

# Pair each stored id with its prediction, then swap the codes for their labels.
target_df = pd.DataFrame({'ID': ID_data, 'status_group': testing_predictions})
target_df = target_df.replace({'status_group': status_labels})

# NOTE(review): writing the legacy ".xls" format needs the xlwt engine, which
# pandas 2.0 removed - confirm the pandas version in use or switch the output
# format. to_excel also writes the DataFrame index as an extra first column by
# default; confirm that is wanted in the submission file.
target_df.to_excel("Submission.xls")