In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 

In [2]:
afr = pd.read_csv(r"../share/train.csv")
afr_test = pd.read_csv(r"../share/test.csv")

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, LabelBinarizer, Imputer, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

import re
import warnings
warnings.filterwarnings('ignore')

#from fuzzywuzzy import fuzz
#from fuzzywuzzy import process

import math
import pickle

In [4]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names = attribute_names
        
    def fit(self,X, y=None):
        self.column_names = self.attribute_names
        return self
    def transform(self,X):
        return X[self.attribute_names]

In [5]:
class HandleCategoricalNulls(BaseEstimator, TransformerMixin):
    def __init__(self,substitute='unknown',convert_all_to_small=True):
        '''
           cols must be a list
        '''
        self.substitute = substitute
        self.convert_all_to_small = convert_all_to_small
        self.column_names = []
    def get_alpha_numeric(self,x):
        temp_l = list()
        for i in list(x):
            temp = re.sub(r'[^a-zA-Z0-9]',' ', i)
            temp = re.sub(' +'," ",temp)
            temp_l.append(temp)
        return temp_l
    def fit(self,X, y=None):
        HandleCategorical_columns = list(X.columns) 
        self.column_names = list(X.columns) 
        return self
    def transform(self,X):
        X=X.fillna(self.substitute)
        if self.convert_all_to_small:
            X = X.apply(lambda x: x.astype(str).str.lower())
            #Include only alpha-numeric chars
            #Remove any contiguous spaces    
            for i in X.columns:
                X[i] = self.get_alpha_numeric(X[i])
        return X        

In [6]:
class FreqBasedCategoricalBinning( BaseEstimator, TransformerMixin):
    def __init__(self,buckets=20,apply=True):
        '''
        buckets - Desired number of classes. 
        you can get less than or equal to the desired buckets,
        if the data is heavily skewed or if we have missing ranges
        '''
        #Calculate the number of bins
        try:
            self.bin_size=np.floor(100.0/(buckets-1))
        except:
            self.bin_size=5.0
        self.freq_dict={}
        self.column_names = []
        self.apply = apply
    def get_freq(self,df):
        #Get the frequency of each level in col 
        for col in df.columns:
            total_count=df.groupby([col]).size().reset_index()
            total_count.columns=[col,col+'_freq_bin']
            total_count.index = list(total_count[col])
            total_count = total_count.drop([col],axis=1)        
            #Save the result to a dictionary
            self.freq_dict[col] = \
            np.ceil(total_count.iloc[:,[0]]/np.max(total_count.iloc[:,0])*100)
    
    def join(self,X,key,value):
        #add a column called 'sorted' to save the order.
        X['sorted'] = np.arange(len(X))
        X.index = list(X[key])
        X.drop([key],axis=1,inplace=True)
        X=X.join(value,how='left')
        #Left join may result in NaN, 
        #if we have unseen levels in the variable
        #We will pad such values with median
        temp_val = np.median(value.iloc[:,[0]])
        X[[value.columns[0]]] = X[[value.columns[0]]].fillna(temp_val)
        
        #Bin the data
        X[[value.columns[0]]] = np.ceil(X[[value.columns[0]]]/self.bin_size)
        
        #Convert the bins to int and later to str
        X[[value.columns[0]]] = X[[value.columns[0]]].astype('int') 
        #X[[value.columns[0]]] = 'class_'+X[[value.columns[0]]].astype('str')
        X[[value.columns[0]]] = X[[value.columns[0]]].astype('str')
        #Reset and drop the first (new) column resulted in reset_index()
        X=X.reset_index()
        X.drop(X.columns[0],axis=1,inplace=True)
        #Restore the order:
        X.sort_values(['sorted'],inplace=True)
        X.drop(['sorted'],axis=1,inplace=True)
        return X
        
    def fit(self,X, y=None):
        if self.apply:
            self.get_freq(X)
            #FreqBasedCategoricalBinning = []
            #Do not set the column names here, to make the logic simple.
            self.column_names = []
            return self
        else:
            self.column_names = []
            return self      
       
    def transform(self,X,y=None):
        if self.apply:
            X = X.copy()
            for key, value in self.freq_dict.items():
                X = self.join(X,key,value)
            global FreqBasedCategoricalBinning_cols
            #Set the column names here
            self.column_names = list(X.columns)
            return X    
        else:
            self.column_names = list(X.columns)
            return X

In [7]:
class RespBasedCategoricalBinning( BaseEstimator, TransformerMixin):
    def __init__(self,buckets=20,apply=True):
        '''
        buckets - Desired number of classes. 
        you can get less than or equal to the desired buckets,
        if the data is heavily skewed or if we have missing ranges
        '''
        ##Determine the bin size
        try:
            self.bin_size=np.floor(100.0/(buckets-1))
        except:
            self.bin_size= 5.0
        #Declare a dictionary, which will save the details
        #in the fit() function
        self.freq_dict={}
        self.column_names = []
        self.apply = apply
        
    ##This function will be called by fit() function.    
    def get_probs(self,X,y):
        #Get the columns of the X data frame
        X_columns = list(X.columns)
        #Combine X and y into a single data frame
        df = X
        
        #_target_variable is the target column, which is a categorical column
        df['_target_variable'] = list(y)
        
        #For each column in the X data frame, perform the following:
        for col in X_columns:
        
            #Create a data frame with counts, group by the values of the column col
            total_count=df.groupby([col]).size().reset_index()
            
            #Assign the column names to the data frame with group counts
            total_count.columns=[col,'total']
            
            #Make the col as the index.
            #Pandas join is not behaving well if we are joining on a column.
            #So I am assigning the index using the to be joined col values
            total_count.index = list(total_count[col])
            
            #Drop the col as its values are in the index already.
            total_count = total_count.drop([col],axis=1)        
            
            #Now create another data frame, that contains the counts of 
            #values on col, group by target variable
            total_grp_count = df.groupby([col,'_target_variable']).size().reset_index()
            
            total_grp_count.columns=[col,'_target_variable','status_total']
            total_grp_count.index = total_grp_count[col]
            total_grp_count = total_grp_count.drop([col],axis=1)
            
            #Join the two data frames: total_count and total_grp_count
            #We have to use inner join, as we are certain that there will be NO mis-matches
            joined_df = total_count.join(total_grp_count,how='inner')
            joined_df = joined_df.reset_index()
            
            #Make sure that we have proper column names, after reset_index
            columns = list(joined_df.columns)
            columns[0] = col
            joined_df.columns = columns
            #print(joined_df)
            
            #Calculating the proportions
            joined_df['proportion'] = joined_df['status_total']/joined_df['total']
            
            #Pivot the table
            joined_df = pd.pivot_table(joined_df,values='proportion',\
                                       columns='_target_variable',index=col)
            #DO NOT reset the index on the pivot table.
            #As the pivoted table will have col values as the index, 
            #and this will be useful later in transform()
            
            #Fill the NaN values with 0
            joined_df = joined_df.fillna(0)
            for i in joined_df.columns:
                joined_df[i] = joined_df[i]/np.max(joined_df[i])
            #Rename the column names, by appending with col name. 
            #This will make sure that we do not have any duplicate columns            
            joined_df.columns = [col + '-'+str(i).replace(" ", "-") for i in joined_df.columns]
            #Save the col and pivoted table in a dictionary.
            #This dictionary will be used in transform() logic.
            self.freq_dict[col] = joined_df    
        #You have to drop the _target_variable, so that we revert back the changes to X
        df.drop(["_target_variable"],axis=1,inplace=True)
    #Define a function to help with the custom join            
    def join(self,X,key,value):
        '''
           X will be a data frame, on which we have to apply the transform()
           key will be a key in the self.freq_dict
           value will be the pivoted table corresponding to the key
        '''
        ##Create a column called 'sorted'  X. This will help
        ##us to restore the order of X later (or else there might be 
        ##change that the order of X might be disturbed)
        X['sorted'] = np.arange(len(X))
        #Make sure that X is indexed on the key column
        try:
            X.index = list(X[key])
            X.drop([key],axis=1,inplace=True)
            X=X.join(value,how='left')
        except:
            print("EXCEPTION/EROR: The input data frame does not have "+key+" column")
            print("Terminating the program")
         
        #Make sure that you fill the  NaN values with median
        #This is needed, to gracefully add unseen values in the categorical variables
        
        
        for i in value.columns:
            temp_val = np.median(value[i])
            X[i] = X[i].fillna(temp_val)
            X[i] = np.ceil(X[i]*100/self.bin_size)
            X[i] = X[i].astype('int') 
            X[i] = X[i].astype('str')
            
        X=X.reset_index()
        #Drop the first column, which is obtained by reset_index, as it is not needed
        X.drop(X.columns[0],axis=1,inplace=True)
        X.sort_values(['sorted'],inplace=True)
        X.drop(['sorted'],axis=1,inplace=True)
        return X
    #fit() will just build the self.freq_dict         
    #In the following func def, the y parameter is NOT optional
    def fit(self,X, y):
        if self.apply:
            self.get_probs(X,y)
            #To make the logic simple, we will set the column names in transform()
            self.column_names = []
            return self 
        else:
            self.column_names = []
            return self 
                 
    #transform() will iterate over the dictionary keys,
    #and build the transformation.
    #As we are using the self.freq_dict items,
    #even though the input data frame supplied to transform has 
    #extra values, we will not get any errors, and such columns will remain 
    #undisturbed. 
    def transform(self,X,y=None):
        if self.apply:
            X = X.copy()
            for key, value in self.freq_dict.items():
                X = self.join(X,key,value)
            self.column_names = list(X.columns) 
            return X
        else:
            self.column_names = list(X.columns) 
            return X    

In [8]:
class CatMultiLabelTransformer(BaseEstimator, TransformerMixin):
    def __init__(self,apply=True): 
        self.column_names = [] 
        self.apply = apply
        self.binarizers={}
    
    def check_input_obj(self,X,location):
        ##Check if input object is a pandas df, else raise exception
        try:
            if not isinstance(X,pd.DataFrame):
                raise ValueError
        except:
            print("**EXCEPTION/ERROR**: In "+ location + \
                  " function of "+self.__name__+ ". Input must be a Pandas dataframe")
            exit(10)
        
    
    def fit(self, X, y=None):
        self.column_names = []
        if self.apply:         
            ##Check if input object is a pandas df, else raise exception
            self.check_input_obj(X,'fit()')
            ##Create an empty dict, 
            ##which will be updated with LabelBinarizer for each column        
            self.binarizers={}

            for col in X.columns:
                uniq_elements = list(set(X[col]))
                #print(uniq_elements)
                if len(uniq_elements) == 2:
                   ##Add a dummy class
                   #We have to name this class in a 
                   #weird fashion,so that no data has this class
                   uniq_elements.append('d#u/m*m-y+class_991-+xya')
                lb = LabelBinarizer()
                self.binarizers[col] = lb.fit(uniq_elements)
                #print(X)
                #self.column_names.append([str(col) + "_" + str(j) \ 
                #for j in list(lb.classes_) if j != 'd#u/m*m-y+class_991-+xya'])
                self.column_names = self.column_names + \
                     [str(col) + "_" + str(j) \
                       for j in list(lb.classes_) \
                         if j != 'd#u/m*m-y+class_991-+xya']
            #print("in transform")
            #print("len of self.binarizers",len(self.binarizers))
            #print("len of self.column_names",len(self.column_names))
            return self
        else:
            return self
    
    def transform(self, X, y=None):
         #print("in transform")
         #print("len of self.binarizers",len(self.binarizers))
         #print(self.apply)
        if self.apply:
            self.check_input_obj(X,'transform()')
            #X_transform = np.empty()
            temp_transformed_data = []
            transformed_column_names = []
            for key, value in self.binarizers.items():
                #print("key=",key)
                #print("value",value)
                #print("len of temp_transformed_data",len(temp_transformed_data))
                try:
                    temp_transformed_data.append(value.transform(X[key]))
                    transformed_column_names = \
                   transformed_column_names + \
                   [str(key) + "_" + str(j) \
                    for j in list(value.classes_)]
                except:
                    continue                
            return pd.DataFrame(np.concatenate(temp_transformed_data, axis=1),\
                                columns=transformed_column_names)[self.column_names]
            #transformed_column_names = self.column_names + [str(col) + "_" + \
            #str(j) for j in list(lb.classes_) if j != 'd#u/m*m-y+class_991-+xya']
            #transformed_X.columns = self.columns        
        else:
            self.column_names = list(X.columns)
            return X            

In [9]:
class AmountTSHTransformer(BaseEstimator, TransformerMixin):
    def __init__(self,method='custom'): 
        self.column_names = [] 
        self.method = method
    
    def check_input_obj(self,X,location):
        ##Check if input object is a pandas df, else raise exception
        try:
            if not isinstance(X,pd.DataFrame):
                raise ValueError
        except:
            print("**EXCEPTION/ERROR**: In "+ location + \
                  " function of "+self.__name__+ \
                  ". Input must be a Pandas dataframe")
            exit(10)
        
    def fit(self, X, y=None):
        self.column_names = ['amount_tsh']
        if self.method == 'custom':
            X['amount_tsh'] = X['amount_tsh'].astype(float)
            self.check_input_obj(X,"fit()") 
            #Make sure that you have all the required columns:
            if len(set(X.columns) - set(['amount_tsh','source_class', \
                                         'basin', 'waterpoint_type_group'])) == 0:
                
                #Get the required dictionaries...
                amount_tsh_df = X[X['amount_tsh'] != 0].groupby(['source_class', \
                                                                 'basin', \
                                                                 'waterpoint_type_group'])\
                                                                  ['amount_tsh'].median()
                self.amount_tsh_dict_all_level = dict(amount_tsh_df)
                amount_tsh_df = X[X['amount_tsh'] != 0].\
                                groupby(['waterpoint_type_group'])\
                                ['amount_tsh'].median()
                self.amount_tsh_dict_wp = dict(amount_tsh_df)
            else:
                raise ValueError("Check the supplied columns. Must supply 'source_class', \
                                 'basin', 'waterpoint_type_group', 'amount_tsh' only")
                exit(10)
            self.column_names = ['amount_tsh']
            return self
        if self.method == 'median':
            X['amount_tsh'] = X['amount_tsh'].astype(float)
            self.median = np.median(list(X[X['amount_tsh'] != 0]['amount_tsh']))
            if math.isnan(self.median):
                self.median = 0
            self.column_names = ['amount_tsh']
            return self
        if self.method == 'mean':
            X['amount_tsh'] = X['amount_tsh'].astype(float)
            self.mean = np.mean(list(X[X['amount_tsh'] > 0]['amount_tsh']))
            if math.isnan(self.mean):
                self.mean = 0
            self.column_names = ['amount_tsh']
            return self
        if self.method == 'ignore':
            self.column_names = ['amount_tsh']
            return self   

    def transform(self,X):
        if self.method == 'custom':
            X['amount_tsh'] = X['amount_tsh'].astype(float)
            self.check_input_obj(X,"transform()")
            transformed_amount_tsh = []
            for i, j, k, l in list(zip(X['amount_tsh'].\
                                       fillna(0),X['source_class'], \
                                       X['basin'], X['waterpoint_type_group'])):
                if i == 0:
                    try:
                        transformed_amount_tsh.append(self.amount_tsh_dict_all_level[(j,k,l)])

                    except:
                        try:
                            transformed_amount_tsh.append(self.amount_tsh_dict_wp[l])
                        except:
                                transformed_amount_tsh.append(i)
                                continue
                else:
                        transformed_amount_tsh.append(i)
            X['amount_tsh'] = transformed_amount_tsh
            return X[['amount_tsh']]
        if self.method == 'median':
            X['amount_tsh'] = X['amount_tsh'].astype(float)
            X['amount_tsh'] = X['amount_tsh'].fillna(0)
            amount_tsh = np.array(list(X['amount_tsh']))
            amount_tsh[amount_tsh == 0] = self.median
            X['amount_tsh']  = amount_tsh
            return X[['amount_tsh']]
        if self.method == 'mean':
            X['amount_tsh'] = X['amount_tsh'].astype(float)
            X['amount_tsh'] = X['amount_tsh'].fillna(0)
            amount_tsh = np.array(list(X['amount_tsh']))
            amount_tsh[amount_tsh == 0] = self.mean
            X['amount_tsh']  = amount_tsh
            return X[['amount_tsh']]
        if self.method == 'ignore':
            X['amount_tsh'] = X['amount_tsh'].astype(float)
            X['amount_tsh'] = X['amount_tsh'].fillna(0)          
            return X[['amount_tsh']]

In [10]:
class GPSHeightTransformer(BaseEstimator, TransformerMixin):
    def __init__(self,init_radius=0.1,increment_radius=0.3,method = 'custom'): 
        self.column_names = [] 
        self.init_radius = init_radius
        self.increment_radius = increment_radius
        self.method = method

    
    def get_subset_records(self, latitude,longitude,df,radius):
        latitude_from = latitude - radius
        latitude_to = latitude + radius
        longitude_from = longitude - radius
        longitude_to = longitude + radius
        
        df_temp = df[(df['latitude'] >= latitude_from) & (df['latitude'] <= latitude_to) & \
                  (df['longitude'] >= longitude_from) & (df['longitude'] <= longitude_to)]
        return df_temp
       
    
    def check_input_obj(self,X,location):
        ##Check if input object is a pandas df, else raise exception
        try:
            if not isinstance(X,pd.DataFrame):
                raise ValueError
        except:
            print("**EXCEPTION/ERROR**: In "+ \
                  location + " function of "+\
                  self.__name__+ \
                  ". Input must be a Pandas dataframe")
            exit(10)
        
    def fit(self, X, y=None):
        if self.method == 'custom':
            X['gps_height'] = X['gps_height'].astype(float)
            X['latitude'] = X['latitude'].astype(float)
            X['longitude'] = X['longitude'].astype(float)
            self.df = X[X['gps_height'] != 0]
            self.column_names = ['gps_height']
            return self
        if self.method == 'median':
            X['gps_height'] = X['gps_height'].astype(float)
            #X['gps_height'] = X['gps_height'].fillna(0)
            self.median = np.median(list(X[X['gps_height'] != 0]['gps_height']))

            if math.isnan(self.median):
                self.median = 0
            self.column_names = ['gps_height']
            return self
        
        if self.method == 'mean':
            X['gps_height'] = X['gps_height'].astype(float)
            #X['gps_height'] = X['gps_height'].fillna(0)
            self.mean = np.mean(list(X[X['gps_height'] != 0]['gps_height']))
            if math.isnan(self.mean):
                self.mean = 0
            self.column_names = ['gps_height']
            return self
        if self.method == 'ignore':
            self.column_names = ['gps_height']
            return self      
        
    def transform(self,X):
        if self.method == 'custom':
            X['gps_height'] = X['gps_height'].astype(float)
            X['latitude'] = X['latitude'].astype(float)
            X['longitude'] = X['longitude'].astype(float)
            
            gps_height_transformed = []
            for latitude, longitude, gps_height in \
                zip(X['latitude'],X['longitude'],X['gps_height']):
                radius = self.init_radius
                if gps_height == 0:
                    gps_height_temp = gps_height
                    while gps_height_temp == 0 and radius <= 2:
                        df_temp = self.get_subset_records\
                                  (latitude,longitude,self.df,radius)
                        
                        gps_height_temp = np.mean(df_temp[df_temp['gps_height']!=0]\
                                                  ['gps_height'])
                        if math.isnan(gps_height_temp):
                            gps_height_temp = 0 
                        radius = self.increment_radius + radius
                else:
                    gps_height_temp =gps_height
                gps_height_transformed.append(gps_height_temp)
            X['gps_height'] = gps_height_transformed
            self.column_names = ['gps_height']
            #self.column_names = list(X.columns)
            #return X[['latitude','longitude','gps_height']]
            return X[['gps_height']]
        if self.method == 'median':
            X['gps_height'] = X['gps_height'].astype(float)
            X['gps_height'] = X['gps_height'].fillna(0)
            gps_height = np.array(list(X['gps_height']))
            gps_height[gps_height == 0] = self.median
            self.column_names = ['gps_height']
            #self.column_names = list(X.columns)
            #return X[['latitude','longitude','gps_height']]
            X['gps_height'] = gps_height
            return X[['gps_height']]
        if self.method == 'mean':
            X['gps_height'] = X['gps_height'].astype(float)
            X['gps_height'] = X['gps_height'].fillna(0)
            gps_height = np.array(list(X['gps_height']))
            gps_height[gps_height == 0] = self.mean
            self.column_names = ['gps_height']
            #self.column_names = list(X.columns)
            #return X[['latitude','longitude','gps_height']]
            X['gps_height'] = gps_height
            return X[['gps_height']]
        if self.method == 'ignore':
            self.column_names = ['gps_height']
            X['gps_height'] = X['gps_height'].astype(float)
            X['gps_height'] = X['gps_height'].fillna(0)
            return X[['gps_height']]

In [11]:
class PopulationTransformer(BaseEstimator, TransformerMixin):
    def __init__(self,init_radius=0.1,increment_radius=0.3,method = 'custom'): 
        self.column_names = [] 
        self.init_radius = init_radius
        self.increment_radius = increment_radius
        self.method = method

    
    def get_subset_records(self, latitude,longitude,df,radius):
        latitude_from = latitude - radius
        latitude_to = latitude + radius
        longitude_from = longitude - radius
        longitude_to = longitude + radius
        
        df_temp = df[(df['latitude'] >= \
                      latitude_from) & (df['latitude'] <= latitude_to) & \
                  (df['longitude'] >= longitude_from) & \
                  (df['longitude'] <= longitude_to)]
        return df_temp
       
    
    def check_input_obj(self,X,location):
        ##Check if input object is a pandas df, else raise exception
        try:
            if not isinstance(X,pd.DataFrame):
               raise ValueError
        except:
            print("**EXCEPTION/ERROR**: In "+ location + \
                  " function of "+self.__name__+ ". Input must be a Pandas dataframe")
            exit(10)
        
    def fit(self, X, y=None):
        if self.method == 'custom':
            X['latitude'] = X['latitude'].astype(float)
            X['longitude'] = X['longitude'].astype(float)
            X['population'] = X['population'].astype(float)
            self.df = X[X['population'] > 1]
            self.column_names = ['population']
            return self
        if self.method == 'median':
            X['population'] = X['population'].astype(float)
            #X['gps_height'] = X['gps_height'].fillna(0)
            self.median = np.median(list(X[X['population'] != 0]['population']))
            if math.isnan(self.median):
                self.median = 0
            self.column_names = ['population']
            return self
        if self.method == 'mean':
            X['population'] = X['population'].astype(float)
            #X['gps_height'] = X['gps_height'].fillna(0)
            self.mean = np.mean(list(X[X['population'] != 0]['population']))
            if math.isnan(self.mean):
                self.mean = 0
            self.column_names = ['population']   
            return self
        if self.method == 'ignore':
            self.column_names = ['population']
            return self      
       
    def transform(self,X):
        self.column_names = ['population']
        if self.method == 'custom':      
            X['latitude'] = X['latitude'].astype(float)
            X['longitude'] = X['longitude'].astype(float)
            X['population'] = X['population'].astype(float)
        
            population_transformed = []
            for latitude, longitude, population in \
                zip(X['latitude'],X['longitude'],X['population']):
                radius = self.init_radius
                if population <= 1:
                    population_temp = population
                    while population_temp <= 1 and radius <= 2:
                        df_temp = self.get_subset_records\
                                  (latitude,longitude,self.df,radius)
                        
                        population_temp = np.mean(df_temp['population'])
                        if math.isnan(population_temp):
                            population_temp = population 
                        radius = self.increment_radius + radius
                else:
                    population_temp =population
                population_transformed.append(population_temp)
            X['population'] = population_transformed
            #self.column_names = ['population']
            #self.column_names = list(X.columns)
            self.column_names = ['population']
            return X[['population']]

        if self.method == 'median':      
                X['population'] = X['population'].astype(float)
                X['population'] = X['population'].fillna(0)
                population = np.array(list(X['population']))
                population[population == 0] = self.median
                self.column_names = ['population']
                #self.column_names = list(X.columns)
                #return X[['latitude','longitude','gps_height']]
                X['population'] = population
                return X[['population']]

        if self.method == 'mean':
                X['population'] = X['population'].astype(float)
                X['population'] = X['population'].fillna(0)
                population = np.array(list(X['population']))
                population[population == 0] = self.mean
                self.column_names = ['population']
                #self.column_names = list(X.columns)
                #return X[['latitude','longitude','gps_height']]
                X['population'] = population
                return X[['population']]
        
        if self.method == 'ignore':      
                X['population'] = X['population'].astype(float)
                X['population'] = X['population'].fillna(0)
                return X[['population']]

In [12]:
class YearTransformer(BaseEstimator, TransformerMixin):
    def __init__(self,method = 'custom'): 
        self.column_names = [] 
        #self.init_radius = init_radius
        #self.increment_radius = increment_radius
        self.method = method
        pass ##Nothing else to do
        
    def fit(self, X, y=None):
        X['construction_year'] = X['construction_year'].astype(float)
        if self.method == 'custom':
            year_recorded = X[X['construction_year'] > 0]\
                            ['date_recorded'].\
                            apply(lambda x: int(x.split("-")[0]))
            year_constructed = X[X['construction_year'] > 0]['construction_year']
            self.median_age = np.median(year_recorded - year_constructed)
            self.column_names = ['age']
            return self
        if self.method == 'median':
            X['construction_year'] = X['construction_year'].astype(float)
            #X['gps_height'] = X['gps_height'].fillna(0)
            self.median = \
                          np.median(list(X[X['construction_year'] != 0]['construction_year']))
            if math.isnan(self.median):
                self.median = 0
            self.column_names = ['construction_year']
            return self
        if self.method == 'mean':
            X['construction_year'] = X['construction_year'].astype(float)
            #X['gps_height'] = X['gps_height'].fillna(0)
            self.mean = np.mean(list(X[X['construction_year'] != 0]['construction_year']))
            if math.isnan(self.mean):
                self.mean = 0
            self.column_names = ['construction_year']
            return self

        if self.method == 'ignore':
            self.column_names = ['construction_year']
            return self
          
    def transform(self,X):
        if self.method == 'custom':
            year_recorded = list(X['date_recorded'].apply(lambda x: int(x.split("-")[0])))
            year_constructed = list(X['construction_year'])
            age = []
            for i,j in enumerate(year_constructed):
                if j == 0:
                    age.append(self.median_age)
                else:
                    temp_age = year_recorded[i] - year_constructed[i]
                    if temp_age < 0:
                        temp_age = self.median_age
                    age.append(temp_age)   
            X['age'] = age
            self.column_names = ['age']
            #self.column_names = X.columns
            return X[['age']]
        if self.method == 'median':      
                X['construction_year'] = X['construction_year'].astype(float)
                X['construction_year'] = X['construction_year'].fillna(0)
                construction_year = np.array(list(X['construction_year']))
                construction_year[construction_year == 0] = self.median
                self.column_names = ['construction_year']
                X['construction_year'] = construction_year
                return X[['construction_year']]

        if self.method == 'mean':
                X['construction_year'] = X['construction_year'].astype(float)
                X['construction_year'] = X['construction_year'].fillna(0)
                construction_year = np.array(list(X['construction_year']))
                construction_year[construction_year == 0] = self.mean
                self.column_names = ['construction_year']
                X['construction_year'] = construction_year
                return X[['construction_year']]
        
        if self.method == 'ignore':      
                X['construction_year'] = X['construction_year'].astype(float)
                X['construction_year'] = X['construction_year'].fillna(0)
                self.column_names = ['construction_year']
                return X[['construction_year']]

In [13]:
class LatitudeLongitudeProcess( BaseEstimator, TransformerMixin):
    def __init__(self,strategy='median'):
        '''
           type = 'median' is the default.
           other values of type can be 'custom' 
        ''' 
        self.strategy = strategy
        self.median_longitude = 0
        self.custom_longitude = 0
        self.median_latitude = 0
        self.custom_latitude = 0
        self.avg_lat_ward_dict = {}
        self.avg_long_ward_dict = {}
        self.avg_lat_lga_dict = {}
        self.avg_long_lga_dict = {}
        self.avg_lat_region_dict = {}
        self.avg_long_region_dict = {}
        self.avg_lat_country_dict = {}
        self.avg_long_country_dict = {}
        self.column_names = [] 
        
    def get_level_means(self,X):
        if 'ward' in X.columns:
            #Get average of lats and longs at the ward level
            #First delete rows that have unknown ward values:
            df = X[~((X['ward'].isnull()) | (X['ward'] == 'unknown'))]
            avg_lat_long_by_ward_df = df[df['longitude'] != 0]. \
            groupby(['ward'])['latitude','longitude'].mean().reset_index()
            if len(avg_lat_long_by_ward_df) > 0:
                avg_lat_long_by_ward_df.columns=['ward','avg_latitude','avg_longitude']
                self.avg_lat_ward_dict = dict(zip(list(avg_lat_long_by_ward_df['ward']),\
                                                      list(avg_lat_long_by_ward_df['avg_latitude'])))
                self.avg_long_ward_dict = dict(zip(list(avg_lat_long_by_ward_df['ward']),\
                                                       list(avg_lat_long_by_ward_df['avg_longitude'])))
        if 'lga' in X.columns:        
            #Get average of lats and longs at the lga level
            #First delete rows that have unknown region values:
            df = X[~((X['lga'].isnull()) | (X['lga'] == 'unknown'))]
            avg_lat_long_by_lga_df = df[df['longitude'] != 0]. \
                groupby(['lga'])['latitude','longitude'].mean().reset_index()
            if len(avg_lat_long_by_lga_df) > 0:
                avg_lat_long_by_lga_df.columns=['lga','avg_latitude','avg_longitude']
                self.avg_lat_lga_dict = dict(zip(list(avg_lat_long_by_lga_df['lga']),
                                                     list(avg_lat_long_by_ward_df['avg_latitude'])))
                self.avg_long_lga_dict = dict(zip(list(avg_lat_long_by_lga_df['lga']),
                                                      list(avg_lat_long_by_ward_df['avg_longitude'])))
        if 'region' in X.columns:                
            #Get average of lats and longs at the region level
            #First delete rows that have unknown region values:
            df = X[~((X['region'].isnull()) | (X['region'] == 'unknown'))]
            avg_lat_long_by_region_df = df[df['longitude'] != 0]. \
                groupby(['region'])['latitude','longitude'].mean().reset_index()
            if len(avg_lat_long_by_region_df) > 0:
                avg_lat_long_by_region_df.columns=['region','avg_latitude','avg_longitude']
                self.avg_lat_region_dict = dict(zip(list(avg_lat_long_by_region_df['region']),\
                                   list(avg_lat_long_by_region_df['avg_latitude'])))
                self.avg_long_region_dict = dict(zip(list(avg_lat_long_by_region_df['region']),\
                                    list(avg_lat_long_by_region_df['avg_longitude'])))
                
        #Get average of lats and longs at the country level
        avg_long = np.mean(X[X['longitude'] != 0]['longitude'])
        avg_lat = np.mean(X[X['latitude'] != 0]['latitude'])
        self.avg_lat_country_dict['country'] = avg_lat
        self.avg_long_country_dict['country'] = avg_long
        
    def fit(self,X, y=None):
        self.column_names = ['latitude','longitude']
        X['latitude'] = X['latitude'].astype(float)
        X['longitude'] = X['longitude'].astype(float)
        if self.strategy == 'custom':
            self.get_level_means(X)
        elif self.strategy == 'mean':
            #Impute using mean
            self.mean_longitude = np.mean(X[X['longitude'] != 0]['longitude'])
            self.mean_latitude = np.mean(X[X['latitude'] != 0]['latitude'])           
            #X.longitude = [i for i in X.longitude if np.abs(i) <= 0 self.mean_longitude else i]
            #X.latitude  = [i for i in X.latitude if np.abs(i) <= 0 self.mean_latitude else i]
        elif self.strategy == 'median':
            #Impute using median
            self.median_longitude = np.median(X[X['longitude'] != 0]['longitude'])
            self.median_latitude = np.median(X[X['latitude'] != 0]['latitude'])           
            #X.longitude = [i for i in X.longitude if np.abs(i) <= 0 self.median_longitude else i]
            #X.latitude  = [i for i in X.latitude if np.abs(i) <= 0 self.median_latitude else i]
        else:
            print("Invalid strategy supplied for LatitudeLongitudeProcess.")
            print("Valid values are 'mean', 'median' or 'custom'. Terminating the program")
            exit(10)
        return self
    def make_up_lat_long(self,X):
        #Handle the situation gracefully, if the incoming data does not have any required columns
        try:
            latitude_list = list(X['latitude'].fillna(0))
        except:
            latitude_list = list(np.zeros(len_X))
            #continue
        try:          
            longitude_list = list(X['longitude'].fillna(0))
        except:
            longitude_list = list(np.zeros(len_X))
            #continue
        return latitude_list, longitude_list   
        
    def custom_transform(self, X):
        len_X = len(X)
          
        #Declare lists to hold the transformed lat and long
        latitude_transformed = []
        longitude_transformed = []
          
        #Handle the situation gracefully, if the incoming data does not have any required columns
        latitude_list, longitude_list =  self.make_up_lat_long(X)
          
        try:
            ward_list = list(X['ward'].fillna('unknown'))
        except:
            ward_list = ['unknown'] * len_X
            #continue
              
        try:    
            lga_list = list(X['lga'].fillna('unknown'))
        except:
            lga_list = ['unknown'] * len_X
            #continue
              
        try:    
            region_list = list(X['region'].fillna('unknown'))
        except:
            region_list = ['unknown'] * len_X
            #continue
              
        for (i, j, k, l, m) in zip(latitude_list,longitude_list, \
                                    ward_list,lga_list,region_list):
            if np.round(i) == 0 or np.round(j) == 0:
                try:
                    latitude_transformed.append(self.avg_lat_ward_dict[k])
                    longitude_transformed.append(self.avg_long_ward_dict[k])
                except:
                    try:
                        latitude_transformed.append(self.avg_lat_lga_dict[l])
                        longitude_transformed.append(self.avg_long_lga_dict[l])
                        continue
                    except:
                        try:
                            latitude_transformed.append(avg_lat_region_dict[m])
                            longitude_transformed.append(avg_long_region_dict[m])
                            continue
                        except:   
                            latitude_transformed.append(self.avg_lat_country_dict['country'])
                            longitude_transformed.append(self.avg_long_country_dict['country'])
                            continue
            else:
                latitude_transformed.append(i)
                longitude_transformed.append(j)     
        X['latitude'] = latitude_transformed
        X['longitude'] = longitude_transformed          
        return X
    def transform(self,X):
        X['latitude'] = X['latitude'].astype(float)
        X['longitude'] = X['longitude'].astype(float)
        self.column_names = ['latitude','longitude'] 
        if self.strategy == 'custom':
            X = self.custom_transform(X)
            return X[['latitude','longitude']]
        elif self.strategy == 'mean':
            latitude_list, longitude_list =  self.make_up_lat_long(X)
            latitude_list = np.array(latitude_list)
            longitude_list = np.array(longitude_list)
            latitude_list[latitude_list == 0] = self.mean_latitude
            longitude_list[longitude_list == 0] = self.mean_longitude
            X['latitude'] = latitude_list
            X['longitude'] = longitude_list
            return X[['latitude','longitude']]
        elif self.strategy == 'median':
            latitude_list, longitude_list =  self.make_up_lat_long(X)
            latitude_list = np.array(latitude_list)
            longitude_list = np.array(longitude_list)
            latitude_list[latitude_list == 0] = self.median_latitude
            longitude_list[longitude_list == 0] = self.median_longitude
            X['latitude'] = latitude_list
            X['longitude'] = longitude_list
            return X[['latitude','longitude']]
             #self.column_names = list(X.columns)
             #return X
        else:
            print("EXCEPTION: The supplied strategy",self.strategy," is incorrect")
            exit(10)

In [14]:
class FunderInstTransformer( BaseEstimator, TransformerMixin):
    def __init__(self,initial_chars=3,groups=15,apply=True):
        self.initial_chars = initial_chars
        self.groups = groups
        self.apply = apply
        self.group_dict = dict()
    def fetch_first_n_chars(self,l):
        temp_l = [str(j).lower()[0:self.initial_chars] for j in list(l)]
        return pd.Series(temp_l)
                
    def fit(self,X, y=None):
        if self.apply:
            self.column_names = ['funder','installer'] 
            self.group_dict = dict()
            for i in X.columns:
                    temp_series = self.fetch_first_n_chars(X[i]).value_counts()
                    temp_series.sort_values(ascending=False,inplace=True)
                    top_groups = list(temp_series[0:self.groups].index)
                    self.group_dict[i] = top_groups

            return self
        else:
            return self
        
    def transform(self,X):
        X = X.copy()
        if self.apply:
            for i in X.columns:
                    temp_series = self.fetch_first_n_chars(X[i])
                    temp_l = []
                    for j in temp_series.values:
                        try:
                            #print(self.group_dict[i])
                            if j in self.group_dict[i]:
                                #print(j)
                                temp_l.append(j)  
                            else: 
                                temp_l.append('other')  
                        except:
                            continue
                    X[i] = temp_l
            #X['funder_installer_same'] = X['funder']==X['installer']
            #self.column_names = ['funder','installer','funder_installer_same']
            #return X[['funder','installer','funder_installer_same']]         
            self.column_names = ['funder','installer']
            return X[['funder','installer']]         
        else:
            #self.column_names = ['funder','installer','funder_installer_same']
            #return X[['funder','installer','funder_installer_same']]
            self.column_names = ['funder','installer']
            return X[['funder','installer']]         
         
class Numpy2DFTransformer(BaseEstimator, TransformerMixin):
    def __init__(self,columns): 
        self.column_names = columns
        
        
    def fit(self, X, y=None):
        return self

    def transform(self,X):

        return pd.DataFrame(X,columns=self.column_names)
    ##############
    ##IMPORTANT###
    ##############
    #https://stackoverflow.com/questions/41837261/data-not-persistent-in-scikit-learn-transformers
    #get_params() is very important to get the data persistent between
    #the CV evaluation. If NOT defined, then the init params are NOT set and results in 
    #CV (GridSearch) failure  
    def get_params(self, deep=False):
        return {'columns': self.column_names}

#Scale numeric data        
class ScaleData(BaseEstimator, TransformerMixin): 
     def __init__(self,std_scaler=True,apply=True):
            self.std_scaler = std_scaler
            self.apply = apply
 
     def fit(self,X, y=None):
        if self.apply: 
            if self.std_scaler == True: 
                self.scaler = StandardScaler() 
            else: 
                self.scaler = MinMaxScaler() 
            return self.scaler.fit(X) 
        else:
            return X        
 
     def transform(self,X): 
        if self.apply:
            return self.scaler.transform(X) 
        else:
            return X

In [15]:
class ChooseCatPipelineType(BaseEstimator, TransformerMixin): 
    def __init__(self,freq_pipeline,resp_pipeline,method='both'):
            self.freq_pipeline = freq_pipeline
            self.resp_pipeline = resp_pipeline
            self.method = method
            self.column_names = []
    def fit(self,X, y=None): 
        if self.method == 'resp':
            self.resp_pipeline.fit(X,y) 
        if self.method == 'freq':
            self.freq_pipeline.fit(X,y) 
        if self.method == 'both':
            self.both_pipeline = FeatureUnion(transformer_list = [ \
                                    ("freq_pipeline",self.freq_pipeline), \
                                    ("resp_pipeline",self.resp_pipeline) \
                                                ])
            self.both_pipeline.fit(X,y)                                                
        return self

    def transform(self,X): 
        if self.method == 'resp':
            self.column_names = self.resp_pipeline.named_steps['CatMultiLabelTransformer'].column_names
            return self.resp_pipeline.transform(X) 
        if self.method == 'freq':
            self.column_names = self.freq_pipeline.named_steps['CatMultiLabelTransformer'].column_names
            return self.freq_pipeline.transform(X) 
        if self.method == 'both':
            self.column_names = self.freq_pipeline.named_steps['CatMultiLabelTransformer'].column_names + \
                              self.resp_pipeline.named_steps['CatMultiLabelTransformer'].column_names
            return self.both_pipeline.transform(X)                         

In [16]:
from sklearn.metrics import confusion_matrix

In [17]:
# cnf_test
def error_met(y_test, y_check):
    
    cnf = confusion_matrix(y_test, y_check)
    
    fpr = cnf[1,0]/cnf[1,:].sum()
    fnr = cnf[0,1]/cnf[0,:].sum()
    return 0.9*fpr+0.1*fnr

In [18]:
# error_met(cnf_train)
# cnf_test
def error(y_test, y_check):
    
    cnf = confusion_matrix(y_test, y_check)
    
    fpr = cnf[1,0]/cnf[1,:].sum()
    fnr = cnf[0,1]/cnf[0,:].sum()
    return -(0.9*fpr+0.1*fnr)*accuracy_score(y_test, y_check)

In [19]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

## Method 1: Random Forest

In [20]:
train_data_type = {
'new_ids':int,
'amount_tsh': float,
'date_recorded': str,
'funder': str,
'gps_height': float,
'installer': str,
'longitude': float,
'latitude': float,
'wpt_name': str,
'num_private': float,
'basin': str,
'subvillage': str,
'region': str,
'region_code': str,
'district_code': str,
'lga': str,
'ward': str,
'population': float,
'public_meeting': str,
'recorded_by': str,
'scheme_management': str,
'scheme_name': str,
'permit': str,
'construction_year': int,
'extraction_type': str,
'extraction_type_group': str,
'extraction_type_class': str,
'management': str,
'management_group': str,
'payment': str,
'payment_type': str,
'water_quality': str,
'quality_group': str,
'quantity': str,
'quantity_group': str,
'source': str,
'source_type': str,
'source_class': str,
'waterpoint_type': str,
'waterpoint_type_group': str
}                      

train_label_type = {
'new_ids':int,
'defective': str
}

In [21]:
high_levels_cat_columns = ['subvillage','lga','ward']#,'scheme_name']

#Columns with less number of levels.
low_levels_cat_columns = ['basin','region','district_code',
                          'public_meeting',
                          'scheme_management',
                          'permit',
                          #'extraction_type',
                          'extraction_type_group',
                          #'extraction_type_class',
                          'management',
                          #'management_group',
                          'payment_type',
                          'water_quality',
                          #'quality_group',
                          'quantity_group',
                          'source',
                          #'source_type','source_class',
                          'waterpoint_type'
                          #,'waterpoint_type_group'
                          ]

#Columns which need fuzzy matching. These columns have many levels.
fuzzy_logic_columns = ['funder','installer']


#Pipelines definition
fuzz_pipeline = Pipeline([ \
                         ('selector',DataFrameSelector(fuzzy_logic_columns)), \
                         ('cat_nulls', HandleCategoricalNulls()), \
                         ('FunderInstTransformer', \
                          FunderInstTransformer(initial_chars = 3, \
                                                groups = 15,apply=True)), \
                         ('CatMultiLabelTransformer',\
                          CatMultiLabelTransformer(apply=True)) \
                         ])

cat_pipeline_high_level_freq_based = Pipeline([ \
                         ('selector',DataFrameSelector(high_levels_cat_columns)), \
                         ('cat_nulls', HandleCategoricalNulls()), \
                         ('FreqBasedCategoricalBinning', \
                          FreqBasedCategoricalBinning(buckets=20,apply=True)), \
                         ('CatMultiLabelTransformer',CatMultiLabelTransformer()) \
                         ])
                         
cat_pipeline_high_level_resp_based = Pipeline([ \
                         ('selector',DataFrameSelector(high_levels_cat_columns)), \
                         ('cat_nulls', HandleCategoricalNulls()), \
                         ('RespBasedCategoricalBinning', \
                          RespBasedCategoricalBinning(buckets=20,apply=True)), \
                         ('CatMultiLabelTransformer',CatMultiLabelTransformer()) \
                         ])

choose_high_level_cat_pipeline = Pipeline([ \
                                         ('ChooseCatPipelineType', ChooseCatPipelineType( \
                                         freq_pipeline = cat_pipeline_high_level_freq_based, \
                                         resp_pipeline = cat_pipeline_high_level_resp_based, \
                                         method = 'freq')) \
                                         ])
                                         
cat_pipeline_low_level = Pipeline([ \
                         ('selector',DataFrameSelector(low_levels_cat_columns)), \
                         ('cat_nulls', HandleCategoricalNulls()), \
                         ('CatMultiLabelTransformer',CatMultiLabelTransformer(apply=True)) \
                        ])

#Combine all categorical pipelines first:
full_categorical_pipeline = FeatureUnion(transformer_list = [ \
                                         ("fuzz_pipeline",fuzz_pipeline), \
                                         ("choose_high_level_cat_pipeline",\
                                          choose_high_level_cat_pipeline), \
                                         ("cat_pipeline_low_level", cat_pipeline_low_level) \
                                         ])       

#amount_tsh pipelines
amount_tsh_prep_pipeline = Pipeline([ \
                         ('selector',\
                          DataFrameSelector(['source_class', 'basin', \
                                             'waterpoint_type_group'])), \
                         ('cat_nulls', HandleCategoricalNulls()) \
                         ])

amount_tsh_selector = Pipeline([('selector',DataFrameSelector(['amount_tsh']))])

amount_tsh_transformer = Pipeline([ ('amount_tsh_prep',FeatureUnion(transformer_list = [ \
                                    ("amount_tsh_prep_pipeline",amount_tsh_prep_pipeline), \
                                    ("amount_tsh_selector",amount_tsh_selector)])) \
                                    ,("Numpy2DFTransformer", \
                                      Numpy2DFTransformer(['source_class', \
                                                           'basin', \
                                                           'waterpoint_type_group',\
                                                           'amount_tsh'])) \
                                    ,('AmountTSHTransformer', \
                                      AmountTSHTransformer()) \
                                ])

#age pipeline
age_pipeline = Pipeline([('selector',DataFrameSelector(['date_recorded',\
                                                        'construction_year'])), \
                         ('YearTransformer',YearTransformer())])

#Lat Long pipelines
lat_long_prep_pipeline = Pipeline([('selector',DataFrameSelector(['lga',\
                                                        'region',\
                                                        'ward'])), \
                         ('cat_nulls', HandleCategoricalNulls())])

lat_long_selector = Pipeline([('selector',DataFrameSelector(['longitude','latitude']))])

lat_long_transformer = Pipeline([ ('lat_long_prep',\
                                   FeatureUnion(transformer_list = [ \
                                    ("lat_long_prep_pipeline",\
                                     lat_long_prep_pipeline), \
                                    ("lat_long_selector",\
                                     lat_long_selector)])) \
                                    ,("Numpy2DFTransformer", \
                                      Numpy2DFTransformer(['lga','region',\
                                                           'ward','longitude',\
                                                           'latitude'])) \
                                    ,('LatitudeLongitudeProcess', \
                                      LatitudeLongitudeProcess(strategy="custom")) \
                                ])
#lat_long_transformer always return the 
#data in latitude, longitude order

#gps_height pipelines.
#This is dependent on the lat_long pipeline
gps_height_transformer = Pipeline([('gps_height_prep',\
                                    FeatureUnion(transformer_list=[('lat_long_transformer', \
                                                                   lat_long_transformer), \
                                   ('gps_selector',DataFrameSelector(['gps_height']))]))
                                   ,("Numpy2DFTransformer", \
                                      Numpy2DFTransformer(['latitude','longitude','gps_height']))
                                   ,('GPSHeightTransformer',GPSHeightTransformer(method='median'))
#                                    ,('GPSHeightTransformer',GPSHeightTransformer(method='custom'))
                                  ])
#population pipelines.
#This is dependent on the lat_long pipeline
population_transformer = Pipeline([('population_prep',\
                                    FeatureUnion(transformer_list=[('lat_long_transformer', \
                                                                   lat_long_transformer), \
                                   ('population_selector',DataFrameSelector(['population']))]))
                                   ,("Numpy2DFTransformer", \
                                      Numpy2DFTransformer(['latitude','longitude','population']))
                                   ,('PopulationTransformer',PopulationTransformer(method = 'ignore'))
                                   #,('PopulationTransformer',PopulationTransformer(method = 'custom')) #NOT worth
                                   # ,('PopulationTransformer',PopulationTransformer(method = 'median')) #NOT Worth
                                  ])

full_numeric_transformations = Pipeline([('all_numeric_transformations', \
                                        FeatureUnion(transformer_list = \
                                                     [('amount_tsh_transformer',amount_tsh_transformer), \
                                        ('lat_long_transformer',lat_long_transformer), \
                                        ('gps_height_transformer',gps_height_transformer), \
                                        ('population_transformer',population_transformer), \
                                        ('age_pipeline',age_pipeline) \
                                       ])) \
                                       ,('scaler',ScaleData())
                                        ])

all_transformations = Pipeline([ \
                                ('all_transformations', \
                                    FeatureUnion(transformer_list = \
                                                 [('full_categorical_pipeline',\
                                                   full_categorical_pipeline), \
                                                  ('full_numeric_transformations',\
                                                   full_numeric_transformations)])) \
                                 ])

predict_pipeline = Pipeline([('all_transformations',all_transformations), \
                             ('rf',RandomForestClassifier(n_estimators=5000,n_jobs=-1))])


In [22]:
X = afr.drop(['defective',"new_ids"],axis=1)
y = afr["defective"]
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y)

predict_pipeline = Pipeline([('all_transformations',all_transformations), \
                             ('rf',RandomForestClassifier())])
params = {"rf__n_estimators": [500,1000,1500], "rf__max_depth": [10,13,15], "rf__n_jobs":[-1], "rf__class_weight": ["balanced",{"no":1,"yes":9}]}
score = make_scorer(error)

search = GridSearchCV(predict_pipeline, params, iid=False, cv=5, scoring=score, return_train_score=False)
search.fit(X_train, y_train)

print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

In [23]:
predict_pipeline = Pipeline([('all_transformations',all_transformations), \
                             ('rf',RandomForestClassifier(max_depth = 15, n_estimators = 1500,
                                                          n_jobs = -1, class_weight = {'no': 1, 'yes': 9}))])
predict_pipeline.fit(X_train,y_train)
y_train_predict = predict_pipeline.predict(X_train)
y_test_predict = predict_pipeline.predict(X_test)
# y_train_predict = [1 if i > 0.5 else 0 for i in y_train_predict]
# y_test_predict = [1 if i > 0.5 else 0 for i in y_test_predict]
print("train error = ", error_met(y_train, y_train_predict))
print("test error = ", error_met(y_test, y_test_predict))
print("train acc = ", accuracy_score(y_train, y_train_predict))
print("test acc = ", accuracy_score(y_test, y_test_predict))

train error =  0.06336194325180117
test error =  0.08089358944908057
train acc =  0.6425
test acc =  0.6189


In [24]:
y_train_predict = predict_pipeline.predict_proba(X_train)[:,1]
y_test_predict = predict_pipeline.predict_proba(X_test)[:,1]
y_train_predict = ["yes" if i > 0.5 else "no" for i in y_train_predict]
y_test_predict = ["yes" if i > 0.5 else "no" for i in y_test_predict]
print("train error = ", error_met(y_train, y_train_predict))
print("test error = ", error_met(y_test, y_test_predict))
print("train acc = ", accuracy_score(y_train, y_train_predict))
print("test acc = ", accuracy_score(y_test, y_test_predict))

train error =  0.06336194325180117
test error =  0.08089358944908057
train acc =  0.6425
test acc =  0.6189


In [25]:
rf_tr = y_train_predict
rf_val = y_test_predict

In [26]:
##final rf_model
predict_pipeline.fit(X,y)
test = afr_test.drop("new_ids",axis=1)
test = predict_pipeline.predict_proba(test)[:,1]
test = ["yes" if i > 0.50 else "no" for i in test]
rf_ts = test

## Method 2: Catboost

In [27]:
cat_columns = ['subvillage','lga','ward','basin','region','district_code',
                          'public_meeting','scheme_management',
                          'permit','extraction_type_group',
                          'management','payment_type',
                          'water_quality','quantity_group',
                          'source','waterpoint_type','funder','installer']

all_transformations = Pipeline([ \
                                ('all_transformations', \
                                    FeatureUnion(transformer_list = \
                                                 [('full_numeric_transformations',\
                                                   full_numeric_transformations)])) \
                                 ])

In [28]:
import catboost as ct

In [29]:
training_transformed = all_transformations.fit_transform(X_train,y_train)
all_columns = ['amount_tsh',"latitude",'longitude',
               'gps_height','population','age']
training_transformed = pd.DataFrame(training_transformed,columns = all_columns)

training_transformed[cat_columns] = X_train.reset_index()[cat_columns]
training_labels = y_train.map(lambda x: 0 if x == "no" else 1)

training_transformed.fillna("NaN",inplace=True)

test_transformed = all_transformations.transform(X_test)
test_transformed = pd.DataFrame(test_transformed,columns = all_columns)

test_transformed[cat_columns] = X_test.reset_index()[cat_columns]
test_labels = y_test.map(lambda x: 0 if x == "no" else 1)
test_transformed.fillna("NaN",inplace=True)

In [30]:
## for parameter selection
cat_features = list(range(6,24))
pool = ct.Pool(training_transformed, training_labels, cat_features)
s = []

for d in [9]:
    for trees in [250]:
        for lr in [0.1]:
            params = {"iterations": trees,
                    'depth': d,
                    'verbose': False,
                    "loss_function": "Logloss",
                    "task_type": 'GPU',
                     "learning_rate": lr,
                     "class_weights": [1,8],
                     "stratified":True,
                     "fold_count":5}
            scores = ct.cv(pool, params)
            s = [trees, d, lr, scores.iloc[-1,:].values]

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/ubuntu/.conda/envs/myenv/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-30-7e525438f186>", line 18, in <module>
    scores = ct.cv(pool, params)
  File "/home/ubuntu/.conda/envs/myenv/lib/python3.7/site-packages/catboost/core.py", line 2951, in cv
    as_pandas, max_time_spent_on_fixed_cost_ratio, dev_max_iterations_batch_size)
  File "_catboost.pyx", line 2665, in _catboost._cv
  File "_catboost.pyx", line 2683, in _catboost._cv
_catboost.CatboostError: catboost/cuda/cuda_lib/cuda_base.h:268: CUDA error 35: CUDA driver version is insufficient for CUDA runtime version

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/.conda/envs/myenv/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2018, in showtraceback
    stb = value._render_t

CatboostError: catboost/cuda/cuda_lib/cuda_base.h:268: CUDA error 35: CUDA driver version is insufficient for CUDA runtime version

In [31]:
cat_features = list(range(6,24))
model = ct.CatBoostClassifier(iterations = 250, depth = 9, verbose = False, loss_function = "Logloss",
                              use_best_model = False, learning_rate = 0.1, class_weights = [1,8])
# Fit model
model.fit(training_transformed, training_labels, cat_features)
# Get predicted classes
y_train_predict = model.predict(training_transformed)
y_test_predict = model.predict(test_transformed)
print("train error = ", error_met(training_labels, y_train_predict))
print("test error = ", error_met(test_labels, y_test_predict))
print("train acc = ", accuracy_score(training_labels, y_train_predict))
print("test acc = ", accuracy_score(test_labels, y_test_predict))

train error =  0.05155352287606155
test error =  0.08177263357846798
train acc =  0.756725
test acc =  0.7016


In [32]:
y_train_predict = [1 if i > 0.4 else 0 for i in model.predict_proba(training_transformed)[:,1]]
y_test_predict = [1 if i > 0.4 else 0 for i in model.predict_proba(test_transformed)[:,1]]
print("train error = ", error_met(training_labels, y_train_predict))
print("test error = ", error_met(test_labels, y_test_predict))
print("train acc = ", accuracy_score(training_labels, y_train_predict))
print("test acc = ", accuracy_score(test_labels, y_test_predict))

train error =  0.055596726566498514
test error =  0.07679182989804885
train acc =  0.699675
test acc =  0.6558


In [33]:
ct_tr = y_train_predict
ct_val = y_test_predict

In [34]:
train = afr.drop("new_ids",axis=1)
train = all_transformations.transform(train)
train = pd.DataFrame(train,columns=all_columns)
train[cat_columns] = afr.reset_index()[cat_columns]
train.fillna("NaN",inplace=True)

model.fit(train, afr.defective.map(lambda x: 1 if x == "yes" else 0), cat_features)

test = afr_test.drop("new_ids",axis=1)
test = all_transformations.transform(test)
test = pd.DataFrame(test,columns=all_columns)
test[cat_columns] = afr_test.reset_index()[cat_columns]
test.fillna("NaN",inplace=True)
ct_ts = [1 if i > 0.4 else 0 for i in model.predict_proba(test)[:,1]]

In [35]:
# t = 0
# t = [t+1 if ((x == "yes") and (y == 1)) or ((x == "no") and (y == 0)) else t for x,y in zip(rf_ts,ct_ts)]


In [36]:
# t = [1 if x == "yes" else 0 for x in rf_ts]
# # (np.array(t)*np.array(ct_ts)).sum()
# c = 0
# for i in range(len(ct_ts)):
#     if ct_ts[i] == t[i]:
#         c+=1
# c

In [37]:
p1 = predict_pipeline.predict_proba(afr_test.drop("new_ids",axis=1))[:,1]
p2 = model.predict_proba(test)[:,1]
# l = [1 if p > 0.5 else 0 for p in (p1+p2)/2]

## Method 3: CatBoost with different imputations and features

In [38]:
raw_df = afr
test = afr_test

In [39]:
cat_columns_missing = ['funder', 'installer', 'subvillage', 'scheme_management', 'scheme_name']
bool_columns_missing = ['public_meeting', 'permit']

In [40]:
## Replacing missing in categorical by field "Missing"
for col in cat_columns_missing:
    raw_df[col] = raw_df[col].fillna("Missing")
    test[col] = test[col].fillna("Missing")

In [41]:
## Replacing missing in boolean columns by 99
raw_df[bool_columns_missing] *= 1
test[bool_columns_missing] *= 1

for col in bool_columns_missing:
    raw_df[col] = raw_df[col].fillna(99)
    test[col] = test[col].fillna(99)

In [42]:
## Getting year, month, day out of date recorded
date_col = ['date_recorded']

for col in date_col:
    raw_df[col] = pd.to_datetime(raw_df[col])
    test[col] = pd.to_datetime(test[col])
    
    raw_df[col+'year'] = raw_df[col].dt.year
    raw_df[col+'month'] = raw_df[col].dt.month
    raw_df[col+'day'] = raw_df[col].dt.day
    
    test[col+'year'] = test[col].dt.year
    test[col+'month'] = test[col].dt.month
    test[col+'day'] = test[col].dt.day

In [43]:
## Coverting target to 1,0
raw_df["defective"] = raw_df["defective"].map(lambda x: 1 if x=="yes" else 0)
test["defective"] = 0

In [44]:
## Converting num_private to boolean as 99% are 0's
raw_df["num_private"] = raw_df["num_private"].map(lambda x: 0 if x==0 else 1)
test["num_private"] = test["num_private"].map(lambda x: 0 if x==0 else 1)

In [45]:
## Derived column = construction year - recorded year
raw_df["construction_minus_recorded"] = raw_df["date_recordedyear"] - raw_df["construction_year"]
raw_df.loc[raw_df["construction_year"]==0,'construction_minus_recorded'] = 99999
test["construction_minus_recorded"] = test["date_recordedyear"] - test["construction_year"]

In [46]:
perc_95 = pd.DataFrame(raw_df.amount_tsh.describe(percentiles=[0.95])).iloc[5][0]
raw_df.loc[raw_df["amount_tsh"]>perc_95,'amount_tsh'] = perc_95

In [47]:
cols_to_drop = ['new_ids', 'region_code', 'district_code', 'recorded_by', 'construction_year', 
                'date_recordedyear', 'date_recordedmonth', 'date_recordedday', 'wpt_name'] + date_col
raw_df = raw_df.drop(cols_to_drop,axis=1)
test = test.drop(cols_to_drop,axis=1)

In [48]:
raw_df.columns

Index(['amount_tsh', 'funder', 'gps_height', 'installer', 'longitude',
       'latitude', 'num_private', 'basin', 'subvillage', 'region', 'lga',
       'ward', 'population', 'public_meeting', 'scheme_management',
       'scheme_name', 'permit', 'extraction_type', 'extraction_type_group',
       'extraction_type_class', 'management', 'management_group', 'payment',
       'payment_type', 'water_quality', 'quality_group', 'quantity',
       'quantity_group', 'source', 'source_type', 'source_class',
       'waterpoint_type', 'waterpoint_type_group', 'defective',
       'construction_minus_recorded'],
      dtype='object')

In [49]:
X = raw_df.drop("defective", axis=1)
y = raw_df["defective"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [50]:
temp = pd.DataFrame(X_train.dtypes).reset_index()
cat_features = list(temp.index[temp[0]=="object"])

In [51]:
model = ct.CatBoostClassifier(iterations=250, 
                           depth=9, 
                           class_weights = [1,8],
                           learning_rate=0.1,
                           loss_function='Logloss')

model.fit(X_train, y_train,cat_features=cat_features)

preds_cat_val = model.predict(X_val)
preds_cat_train = model.predict(X_train)

#print(confusion_matrix(y_val, preds_cat))
print(error_met(y_val, preds_cat_val))
print(error_met(y_train, preds_cat_train))

0:	learn: 0.5895366	total: 203ms	remaining: 50.7s
1:	learn: 0.5259532	total: 374ms	remaining: 46.4s
2:	learn: 0.4663044	total: 606ms	remaining: 49.9s
3:	learn: 0.4320168	total: 845ms	remaining: 52s
4:	learn: 0.4037437	total: 940ms	remaining: 46.1s
5:	learn: 0.3771304	total: 1.19s	remaining: 48.5s
6:	learn: 0.3598724	total: 1.38s	remaining: 48s
7:	learn: 0.3445778	total: 1.63s	remaining: 49.2s
8:	learn: 0.3342532	total: 1.86s	remaining: 49.9s
9:	learn: 0.3258657	total: 2.11s	remaining: 50.7s
10:	learn: 0.3196463	total: 2.22s	remaining: 48.2s
11:	learn: 0.3113043	total: 2.47s	remaining: 49s
12:	learn: 0.3055328	total: 2.71s	remaining: 49.5s
13:	learn: 0.3001747	total: 2.95s	remaining: 49.8s
14:	learn: 0.2951696	total: 3.05s	remaining: 47.9s
15:	learn: 0.2908993	total: 3.29s	remaining: 48s
16:	learn: 0.2871501	total: 3.53s	remaining: 48.3s
17:	learn: 0.2844886	total: 3.8s	remaining: 49s
18:	learn: 0.2822904	total: 3.97s	remaining: 48.3s
19:	learn: 0.2806250	total: 4.12s	remaining: 47.3s
2

161:	learn: 0.2352546	total: 29s	remaining: 15.7s
162:	learn: 0.2349918	total: 29.2s	remaining: 15.6s
163:	learn: 0.2345400	total: 29.5s	remaining: 15.5s
164:	learn: 0.2344454	total: 29.7s	remaining: 15.3s
165:	learn: 0.2342619	total: 30s	remaining: 15.2s
166:	learn: 0.2340629	total: 30.3s	remaining: 15s
167:	learn: 0.2338317	total: 30.5s	remaining: 14.9s
168:	learn: 0.2334834	total: 30.8s	remaining: 14.7s
169:	learn: 0.2334626	total: 31s	remaining: 14.6s
170:	learn: 0.2332320	total: 31.3s	remaining: 14.4s
171:	learn: 0.2331167	total: 31.5s	remaining: 14.3s
172:	learn: 0.2330491	total: 31.7s	remaining: 14.1s
173:	learn: 0.2329220	total: 32s	remaining: 14s
174:	learn: 0.2326123	total: 32.2s	remaining: 13.8s
175:	learn: 0.2321452	total: 32.5s	remaining: 13.7s
176:	learn: 0.2318759	total: 32.8s	remaining: 13.5s
177:	learn: 0.2317455	total: 33s	remaining: 13.3s
178:	learn: 0.2316074	total: 33.3s	remaining: 13.2s
179:	learn: 0.2315217	total: 33.5s	remaining: 13s
180:	learn: 0.2314142	total:

In [71]:
y_train_predict = model.predict_proba(X_train)[:,1]
y_test_predict = model.predict_proba(X_val)[:,1]
y_train_predict = [1 if i > 0.4 else 0 for i in y_train_predict]
y_test_predict = [1 if i > 0.4 else 0 for i in y_test_predict]
print("train error = ", error_met(y_train, y_train_predict))
print("test error = ", error_met(y_val, y_test_predict))
print("train acc = ", accuracy_score(y_train, y_train_predict))
print("test acc = ", accuracy_score(y_val, y_test_predict))

train error =  0.0548474580194481
test error =  0.07710409873865429
train acc =  0.7029333333333333
test acc =  0.65256


In [52]:
model.fit(raw_df.drop("defective",axis=1), raw_df.defective,cat_features=cat_features)
p3 = model.predict_proba(test)[:,1]
ct2_ts = [1 if i > 0.4 else 0 for i in p3]

0:	learn: 0.5898661	total: 126ms	remaining: 31.4s
1:	learn: 0.5177157	total: 241ms	remaining: 29.8s
2:	learn: 0.4732768	total: 277ms	remaining: 22.8s
3:	learn: 0.4393138	total: 308ms	remaining: 19s
4:	learn: 0.4013645	total: 412ms	remaining: 20.2s
5:	learn: 0.3742588	total: 522ms	remaining: 21.2s
6:	learn: 0.3604675	total: 572ms	remaining: 19.9s
7:	learn: 0.3420366	total: 695ms	remaining: 21s
8:	learn: 0.3283501	total: 815ms	remaining: 21.8s
9:	learn: 0.3170553	total: 936ms	remaining: 22.5s
10:	learn: 0.3084364	total: 1.04s	remaining: 22.6s
11:	learn: 0.3049144	total: 1.07s	remaining: 21.2s
12:	learn: 0.2983574	total: 1.19s	remaining: 21.6s
13:	learn: 0.2920301	total: 1.31s	remaining: 22.1s
14:	learn: 0.2878740	total: 1.42s	remaining: 22.2s
15:	learn: 0.2854811	total: 1.46s	remaining: 21.3s
16:	learn: 0.2818350	total: 1.57s	remaining: 21.6s
17:	learn: 0.2782348	total: 1.69s	remaining: 21.8s
18:	learn: 0.2755050	total: 1.8s	remaining: 21.9s
19:	learn: 0.2731668	total: 1.92s	remaining: 2

162:	learn: 0.2025790	total: 19.1s	remaining: 10.2s
163:	learn: 0.2024872	total: 19.3s	remaining: 10.1s
164:	learn: 0.2022924	total: 19.4s	remaining: 9.99s
165:	learn: 0.2020293	total: 19.5s	remaining: 9.88s
166:	learn: 0.2017983	total: 19.6s	remaining: 9.76s
167:	learn: 0.2014788	total: 19.8s	remaining: 9.65s
168:	learn: 0.2010933	total: 19.9s	remaining: 9.53s
169:	learn: 0.2008900	total: 20s	remaining: 9.41s
170:	learn: 0.2008412	total: 20.1s	remaining: 9.29s
171:	learn: 0.2006006	total: 20.2s	remaining: 9.18s
172:	learn: 0.2004064	total: 20.4s	remaining: 9.06s
173:	learn: 0.2002010	total: 20.5s	remaining: 8.95s
174:	learn: 0.1998810	total: 20.6s	remaining: 8.84s
175:	learn: 0.1995788	total: 20.8s	remaining: 8.73s
176:	learn: 0.1994259	total: 20.9s	remaining: 8.62s
177:	learn: 0.1992435	total: 21s	remaining: 8.5s
178:	learn: 0.1990360	total: 21.1s	remaining: 8.38s
179:	learn: 0.1987461	total: 21.3s	remaining: 8.26s
180:	learn: 0.1985351	total: 21.4s	remaining: 8.15s
181:	learn: 0.198

In [103]:
r = [1 if i == "yes" else 0 for i in rf_ts]
c = [1 if i>=2 else 0 for i in (np.array(r) + np.array(ct_ts) + np.array(ct2_ts))]

In [108]:
# ((np.array(r) + np.array(ct_ts) + np.array(ct2_ts)) == 1).sum()
# (lab_act.defective == "no").sum()
(np.array(c) == 0).sum()

1591

In [111]:
(np.array(c) == np.array(ct2_ts)).sum()

4936

In [114]:
df = pd.DataFrame(columns=["id","defective"])
df.id = afr_test.new_ids
df.defective = ct2_ts
df.to_csv("F02691_F02155.csv",index=False)

In [115]:
# pd.read_csv("F02691_F02155.csv")

Unnamed: 0,id,defective
0,4168,1
1,24074,1
2,19227,1
3,44693,0
4,22751,1
5,51896,1
6,8556,1
7,48413,1
8,5853,0
9,36654,1
