# Pipeline

This notebook is the pipeline that combines all of our work. You are able to supply the pipeline with a csv file of inputs and the pipeline will output a csv file that is the predictions for the supplied inputs.

In [33]:
import pandas as pd
import numpy as np
import string

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import LabelEncoder

import pickle

We don't include any pre-processing as this is all the test set. We only include feature engineering so that our model can output it's predictions.

In [5]:
def read_arrays():

    f = open("./funder_installer.csv", "r")
    lines = f.readlines()
    values = lines[1:]
    header = lines[0].strip('\n').split(",")

    data = {}

    for i in range(len(values)):

        data[header[i]] = values[i].strip('\n').split(",")

    gov = data['GOV']
    charity = data['CHARITY']
    local_gov = data['LOCAL_GOV']
    private = data['PRIVATE']
    foreign = data['FOREIGN']
    school = data['SCHOOL']
    religious = data['RELIGIOUS']
    
    return (gov, local_gov, private, religious, charity, school, foreign)

In [15]:
def cat_inst_fund(x, f, data):
    gov, local_gov, private, religious, charity, school, foreign = data
    try:
        x = f(x)
        if x in gov:
            return 'Government'
        elif x in local_gov:
            return 'Local Government'
        elif x in private:
            return 'Private'
        elif x in religious:
            return 'Religious'
        elif x in charity:
            return 'Charity'
        elif x in school:
            return 'School'
        elif x in foreign:
            return 'Foreign Aid'
        
        else:
            return 'Unknown'
    except AttributeError as e:
        return 'Unknown'

In [6]:
def label_construction_year (row):
    if row['construction_year'] in range(1960,1970):
        return "60s"
    if row['construction_year'] in range(1970,1980):
        return "70s"
    if row['construction_year'] in range(1980,1990):
        return "80s"
    if row['construction_year'] in range(1990,2000):
        return "90s"
    if row['construction_year'] in range(2000,2010):
        return "2000s"
    if row['construction_year'] in range(2010, 2020):
        return "2010s"
    return "Unknown"

In [7]:
def label_region (row):
    if row['region'] in ['Arusha','Manyara','Kilimanjaro','Tanga']:
        return "Northern Zone"
    if row['region'] in ['Kagera','Mwanza','Shinyanga','Mara']:
        return "Lake Zone"
    if row['region'] in ['Lindi','Ruvuma','Mtwara']:
        return "Southern Zone"
    if row['region'] in ['Rukwa','Mbeya','Iringa']:
        return "Southern Highlands"
    if row['region'] in ['Morogoro','Pwani','Dar es Salaam']:
        return "Coastal Zone"
    if row['region'] in ['Kigoma']:
        return "Western Zone"
    if row['region'] in ['Tabora','Singida','Dodoma']:
        return "Central Zone"
    return "Unknown"

In [8]:
# Author [Tom]

def minmax_normalisation(df, col):
    df[col + "_minmaxnormalised"] = ((df[col] - df[col].min()) / (df[col].max() / df[col].min()))
    return df

In [28]:
def feature_engineering(df):
    
    data = read_arrays()
    f = lambda x: x.upper().replace(" ", "").translate(str.maketrans('', '', string.punctuation))
    
    df['cat_funder'] = df['funder'].apply(lambda x: cat_inst_fund(x, f, data))
    df['cat_installer'] = df['installer'].apply(lambda x: cat_inst_fund(x, f, data))
    df['construction_decade'] = df.apply(lambda row: label_construction_year(row), axis=1)
    df['zones'] = df.apply(lambda row: label_region(row), axis=1)
    df = minmax_normalisation(df, "gps_height")
    df['permit'] = df['permit'].astype(str)
    df['public_meeting'] = df['public_meeting'].astype(str)
    
    
    return df

In [10]:
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [44]:
def feature_generation(df):
    df['recorded_year'] = pd.DatetimeIndex(df ['date_recorded']).year
    df['age'] = df['recorded_year'] - df['construction_year']
    df['age'] = df['age'].apply(lambda x: x if x < 100 else -1)
    df = df.drop('recorded_year',axis=1)
    
    
    df['month'] = pd.DatetimeIndex(df['date_recorded']).month

    # season encoder
    # 1: 'short dry', 2: 'long rain', 3: 'long dry', 4: 'short rain'
    season_mapper = {1: 1,2: 1, 3: 2, 4: 2, 5: 2, 6: 3, 7: 3,
                     8: 3, 9: 3, 10: 3, 11: 4, 12: 4}
    #.p feature values to scale
    df['season']=df['month'].replace(season_mapper)
    df=df.drop('month', axis=1)
    
    df['consistent_water'] = np.where(df['quantity'] == 1, 1, 0)
    df['source_below_sea_level'] = np.where(df['gps_height'] < df['amount_tsh'], 1, 0)
    
    # These features are almost definitely being dropped!
    df.drop(columns = ['date_recorded', 'wpt_name','num_private','subvillage','region_code','district_code','lga','ward','recorded_by',
                       'scheme_name','extraction_type','extraction_type_group','payment','quality_group','quantity_group',
                       'source','waterpoint_type','construction_year','region','funder','installer'], inplace=True)
    
    # This list of features to use might change at some point.
    df = df[['id', 'extraction_type_class', 'payment_type', 'quantity', 'source_type',
               'waterpoint_type_group', 'cat_funder', 'construction_decade', 'consistent_water',
               'age', 'season', 'water_quality', 'cat_installer', 'gps_height_minmaxnormalised',
               'longitude', 'latitude', 'population']]
    
    
    
    
    return df

In [26]:
def preprocess(filename):
    """
    This function takes an input filename (dataframe), and outputs a dataframe
    that can be processed by our model.
    
    TODO: Parameterize this -> we need to get the columns from a json file
                            -> we need to get the new features from json? is this possible
                            
    Args:
        filename (String): The .csv file to read
    
    Returns:
        DataFrame : the output pandas DataFrame
    """
    df = pd.read_csv(filename)
    fe_transformer = FunctionTransformer(feature_engineering)
    mcl_encoder = MultiColumnLabelEncoder(columns = ['basin','public_meeting','scheme_management','permit','extraction_type_class',
                                        'management','management_group','payment_type','water_quality','quantity',
                                        'source_type','source_class','waterpoint_type_group',
                                        'construction_decade','zones', 'cat_funder', 'cat_installer'])
    fg_transformer = FunctionTransformer(feature_generation)
    preprocessing_pipeline = Pipeline([('fe', fe_transformer), ('mcl', mcl_encoder), ('fg', fg_transformer)])
    df = preprocessing_pipeline.fit_transform(df)
       
    return df

In [71]:
def main(filename, model_file):
    
    df = preprocess(filename)
    
    
    with open(model_file, 'rb') as f:
        xgb_clf = pickle.load(f)
        
    id_col = df['id']
    
    df.drop('id', axis=1, inplace=True)
    
    preds = xgb_clf.predict(df)
    
    final_df = pd.DataFrame(preds, columns=['status_group'])
    
    final_df = pd.concat([id_col, final_df], axis=1)
    
    final_df = final_df.replace({'status_group' : { 0 : "functional", 1 : "functional needs repair", 2 : "non functional" } } )
    
    final_df.to_csv("test_results.csv", index=False)
    
    return final_df
    
    

In [72]:
main("./datasets/test_dataset.csv", "./models/XGB.pickle")

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional
...,...,...
14845,39307,non functional
14846,18990,functional
14847,28749,functional
14848,33492,functional
