In [28]:
import pathlib
# absolute path of the current working directory (pathlib.Path() with no arguments is '.')
this_path = pathlib.Path().absolute()
# "data" folder located next to the working directory, i.e. inside its parent folder
data_path = this_path.parent / "data"

import custom_transformers as ct
import data_functions

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [4]:
def get_dataframes():
    '''
    loads the three project csv files and hands them back as dataframes

    each file is opened through open_local_csv, in the order: training values,
    test values, training labels

    --returns:
    a tuple of three items in the format (x_train, x_test, y_train); an item is
    whatever open_local_csv returned for the corresponding file
    '''
    # order matters: it fixes both the load order and the positions in the returned tuple
    csv_names = (
        'Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_values.csv',
        'Pump_it_Up_Data_Mining_the_Water_Table_-_Test_set_values.csv',
        'Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_labels.csv',
    )
    return tuple(open_local_csv(csv_name) for csv_name in csv_names)

def open_local_csv(filename):
    '''
    reads a csv that lives under data_path into a dataframe indexed by its 'id' column

    --parameters:

    filename: string with the name of the csv file, resolved relative to data_path

    --returns:

    pandas DataFrame object when the resolved path exists; otherwise an error msg is
    printed and None is returned
    '''
    target = data_path / filename

    # guard clause: report the missing file and bail out instead of raising
    if not target.exists():
        print(f'the specified filepath does not exist: {target}')
        return None

    return pd.read_csv(target, index_col='id')

In [29]:
# load the project dataframes through the data_functions module
# NOTE(review): the get_dataframes defined in this notebook documents its return as
# (x_train, x_test, y_train); if data_functions.get_dataframes matches, the third value
# is the training labels and the name y_test is misleading -- confirm before relying on it
x_train, x_test, y_test = data_functions.get_dataframes()

In [5]:
# load training values, test values and training labels with the get_dataframes defined in this notebook
x_tr, x_te, y_tr = get_dataframes()

In [6]:
# bare expression: the notebook renders the training values dataframe as the cell output
x_tr

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60739,10.0,2013-05-03,Germany Republi,1210,CES,37.169807,-3.253847,Area Three Namba 27,0,Pangani,...,per bucket,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
27263,4700.0,2011-05-07,Cefa-njombe,1212,Cefa,35.249991,-9.070629,Kwa Yahona Kuvala,0,Rufiji,...,annually,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe
37057,0.0,2011-04-11,,0,,34.017087,-8.750434,Mashine,0,Rufiji,...,monthly,fluoride,fluoride,enough,enough,machine dbh,borehole,groundwater,hand pump,hand pump
31282,0.0,2011-03-08,Malec,0,Musa,35.861315,-6.378573,Mshoro,0,Rufiji,...,never pay,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump


Define helper functions that return lists of feature names: the strict and loose feature sets, and the categorical and numerical subsets of a given feature list.

In [12]:
# columns that are left out of the strict feature list
not_strict_features = [
    'date_recorded', 'funder', 'installer', 'wpt_name', 'subvillage',
    'recorded_by', 'scheme_name', 'extraction_type', 'extraction_type_class',
    'payment', 'management', 'management_group', 'source_class',
    'waterpoint_type_group', 'latitude', 'longitude', 'num_private',
    'region_code', 'district_code',
]

# every other column of x_tr, kept in the dataframe's column order
strict_features = [
    column
    for column in x_tr.columns
    if column not in not_strict_features
]

print(strict_features)

['amount_tsh', 'gps_height', 'basin', 'region', 'lga', 'ward', 'population', 'public_meeting', 'scheme_management', 'permit', 'construction_year', 'extraction_type_group', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'waterpoint_type']


In [16]:
def get_strict_features():
    '''
    names of the columns that make up the strict feature set

    --returns:
    a new list of column name strings on every call
    '''
    return [
        'amount_tsh',
        'gps_height',
        'installer',
        'basin',
        'region',
        'lga',
        'population',
        'construction_year',
        'extraction_type_group',
        'payment_type',
        'quality_group',
        'quantity',
        'source_type',
        'waterpoint_type',
    ]

def get_loose_features():
    '''
    names of the columns that make up the loose feature set

    --returns:
    a new list of column name strings on every call
    '''
    return [
        'amount_tsh',
        'gps_height',
        'installer',
        'basin',
        'region',
        'lga',
        'ward',
        'population',
        'public_meeting',
        'scheme_management',
        'permit',
        'construction_year',
        'extraction_type_group',
        'payment_type',
        'water_quality',
        'quantity',
        'source',
        'waterpoint_type',
    ]

def get_numeric_features(f_names):
    '''
    picks the numeric feature names out of a list of feature names

    --parameters:

    f_names: iterable of feature name strings (e.g. the result of get_strict_features)

    --returns:

    list with the names from f_names that are numeric features, in the order they
    appear in f_names
    '''
    numeric = ['amount_tsh', 'population', 'construction_year', 'gps_height']
    # keep the names that ARE numeric; the earlier version kept the complement and
    # then returned an undefined variable, so every call raised NameError
    return [name for name in f_names if name in numeric]

def get_categorical_features(f_names):
    '''
    picks the categorical feature names out of a list of feature names

    --parameters:

    f_names: iterable of feature name strings (e.g. the result of get_loose_features)

    --returns:

    list with the names from f_names that are categorical features, in the order they
    appear in f_names
    '''
    categorical = ['installer', 'basin', 'region', 'lga', 'ward',
                   'public_meeting', 'scheme_management', 'permit', 'extraction_type_group',
                   'payment_type', 'water_quality', 'quantity', 'source',
                   'waterpoint_type', 'source_type', 'quality_group']
    # keep the names that ARE categorical; the earlier version used `not in` and so
    # returned the non-categorical names instead
    return [name for name in f_names if name in categorical]

0       20709
2010     2645
2008     2613
2009     2533
2000     2091
2007     1587
2006     1471
2003     1286
2011     1256
2004     1123
2012     1084
2002     1075
1978     1037
1995     1014
2005     1011
1999      979
1998      966
1990      954
1985      945
1980      811
1996      811
1984      779
1982      744
1994      738
1972      708
1974      676
1997      644
1992      640
1993      608
2001      540
1988      521
1983      488
1975      437
1986      434
1976      414
1970      411
1991      324
1989      316
1987      302
1981      238
1977      202
1979      192
1973      184
2013      176
1971      145
1960      102
1967       88
1963       85
1968       77
1969       59
1964       40
1962       30
1961       21
1965       19
1966       17
Name: construction_year, dtype: int64

In [22]:
class BinInstaller(TransformerMixin, BaseEstimator):
    '''
    transformer that relabels infrequent values of the 'installer' column as 'Other'

    --parameters:

    min_count: installers that occur fewer than this many times in the data given to
    transform are replaced with 'Other'; defaults to 10, the threshold that used to be
    hard-coded
    '''
    def __init__(self, min_count=10):
        # stored under the same name as the __init__ argument so that
        # BaseEstimator.get_params / set_params can find it
        self.min_count = min_count

    def fit(self, X, y=None):
        '''
        does nothing and returns self; nothing is learned from X
        '''
        return self

    def _to_df(self, X):
        '''
        converts the supported input types into a dataframe that is safe to modify

        --parameters:

        X: a DataFrame (copied), a list or Series (wrapped in a DataFrame), or a dict
        (treated as a single row)

        --returns:

        pandas DataFrame object; raises ValueError for any other input type
        '''
        if isinstance(X, pd.DataFrame):
            # copy so the caller's dataframe is never modified
            return X.copy()
        if isinstance(X, (list, pd.Series)):
            return pd.DataFrame(X)
        if isinstance(X, dict):
            return pd.DataFrame([X])
        raise ValueError('X must be a dataframe, list, series, or dictionary  object.')

    def transform(self, X):
        '''
        replaces rare installers with 'Other'

        --parameters:

        X: data containing an 'installer' column, in any form accepted by _to_df

        --returns:

        a new DataFrame in which every 'installer' value that occurs fewer than
        min_count times in X is replaced by 'Other'; missing values are left as they are
        '''
        data = self._to_df(X)
        # NOTE: frequencies are counted on the data passed to transform, not learned in
        # fit, so the set of rare installers depends on the dataframe being transformed
        counts = data['installer'].value_counts()
        rare = counts.index[counts < self.min_count]
        is_rare = data['installer'].isin(rare)
        data['installer'] = data['installer'].where(~is_rare, 'Other')
        return data

    
class TransformConstructionYear(TransformerMixin, BaseEstimator):
    '''
    transformer that replaces 'construction_year' with cy minus the construction year

    --parameters:

    cy: the reference ("current") year that construction years are subtracted from
    '''
    def __init__(self, cy):
        # must be stored under the same name as the __init__ argument: BaseEstimator
        # get_params / clone look the value up as self.cy
        self.cy = cy

    @property
    def current_year(self):
        '''
        alias of cy, kept so code that reads .current_year keeps working
        '''
        return self.cy

    @current_year.setter
    def current_year(self, value):
        self.cy = value

    def fit(self, X, y=None):
        '''
        does nothing and returns self; nothing is learned from X
        '''
        return self

    def _to_df(self, X):
        '''
        converts the supported input types into a dataframe that is safe to modify

        --parameters:

        X: a DataFrame (copied), a list or Series (wrapped in a DataFrame), or a dict
        (treated as a single row)

        --returns:

        pandas DataFrame object; raises ValueError for any other input type
        '''
        if isinstance(X, pd.DataFrame):
            # copy so the caller's dataframe is never modified
            return X.copy()
        if isinstance(X, (list, pd.Series)):
            return pd.DataFrame(X)
        if isinstance(X, dict):
            return pd.DataFrame([X])
        raise ValueError('X must be a dataframe, list, series, or dictionary  object.')

    def _bin_data(self, x):
        '''
        maps one construction year to cy - x; a year of 0 becomes NaN

        NaN is returned rather than None so the mapped column stays numeric even when
        every row is 0
        '''
        # presumably 0 is the dataset's placeholder for an unknown year -- confirm
        if x == 0:
            return float('nan')
        return self.cy - x

    def transform(self, X):
        '''
        applies _bin_data to every value of the 'construction_year' column

        --parameters:

        X: data containing a 'construction_year' column, in any form accepted by _to_df

        --returns:

        a new DataFrame whose 'construction_year' column holds cy minus the original
        value, with NaN where the original value was 0
        '''
        data = self._to_df(X)
        data['construction_year'] = data['construction_year'].map(self._bin_data)
        return data

In [24]:
# select the construction_year column of the training values dataframe
con_year = x_tr['construction_year']

In [27]:
# uses the BinInstaller imported from custom_transformers (ct), not the class defined in this notebook
# NOTE(review): the recorded output of this cell is "NameError: name 'np' is not defined";
# presumably custom_transformers references numpy without importing it -- confirm and fix there
bi = ct.BinInstaller()
bi.transform(x_tr)

NameError: name 'np' is not defined