# Load Items

In [1]:
import json
import pprint

import numpy as np
import scipy.stats as ss
import pandas as pd
pd.set_option('display.max_columns', 100)

import warnings 
warnings.filterwarnings('ignore')

import condense_csv

In [141]:
def get_data(data_path, label_path, valid_size=0.2):
    """
    Get the data, and split it into train and validation.

    :param data_path: path of the data input
    :type  data_path: str
    :param label_path: path of the data label
    :type  label_path: str
    :param test_size: Proportion to split on for the validation size
    :type  test_size: float
    """

    from sklearn.model_selection import train_test_split

    data = pd.read_csv(data_path)
    labels = pd.read_csv(label_path)
    data = condense_csv.compress_X(data)
    labels = condense_csv.compress_labels(labels)

    X_train, X_valid, y_train, y_valid = train_test_split(
        data,
        labels,
        test_size=valid_size,
        random_state=42
    )

    return X_train, X_valid, y_train, y_valid


train_X, valid_X, train_y, valid_y = get_data(
    '../data/train.csv', '../data/train_labels.csv')

memory used before preprocess:  19.00808

date time size before: 3.9798799999999996
date time size after:  0.47528 

converting funder                         size:  3.86	->	 0.33
converting installer                      size:  3.64	->	 0.34
converting basin                          size:  4.03	->	 0.06
converting subvillage                     size:  3.85	->	 2.03
converting region                         size:  3.78	->	 0.06
converting lga                            size:  3.83	->	 0.07
converting ward                           size:  3.83	->	 0.34
converting public_meeting                 size:  2.1	->	 0.06
converting recorded_by                    size:  4.75	->	 0.06
converting scheme_management              size:  3.55	->	 0.06
converting scheme_name                    size:  3.15	->	 0.4
converting permit                         size:  2.06	->	 0.06
converting extraction_type                size:  3.84	->	 0.06
converting extraction_type_group          size:  3.85	->	 0.06
con

In [4]:
train_X.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
3607,454,50.0,2013-02-27,Dmdd,2092,DMDD,35.42602,-4.227446,Narmo,0,Internal,Bashnet Kati,Manyara,21,1,Babati,Bashinet,160,True,GeoData Consultants Ltd,Water Board,,True,1998,gravity,gravity,gravity,water board,user-group,pay per bucket,per bucket,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
50870,510,0.0,2011-03-17,Cmsr,0,Gove,35.510074,-5.724555,Lukali,0,Internal,Lukali,Dodoma,1,6,Bahi,Lamaiti,0,True,GeoData Consultants Ltd,VWC,,True,0,india mark ii,india mark ii,handpump,vwc,user-group,never pay,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump
20413,14146,0.0,2011-07-10,Kkkt,0,KKKT,32.499866,-9.081222,Mahakama,0,Lake Rukwa,Chawalikozi,Mbeya,12,6,Mbozi,Ndalambo,0,True,GeoData Consultants Ltd,VWC,,False,0,other,other,other,vwc,user-group,never pay,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,other,other
52806,47410,0.0,2011-04-12,,0,,34.060484,-8.830208,Shule Ya Msingi Chosi A,0,Rufiji,Shuleni,Mbeya,12,7,Mbarali,Chimala,0,True,GeoData Consultants Ltd,VWC,,True,0,gravity,gravity,gravity,vwc,user-group,pay monthly,monthly,soft,good,insufficient,insufficient,river,river/lake,surface,communal standpipe,communal standpipe
50091,1288,300.0,2011-04-05,Ki,1023,Ki,37.03269,-6.040787,Kwa Mjowe,0,Wami / Ruvu,Ngholong,Morogoro,5,1,Kilosa,Chakwale,120,True,GeoData Consultants Ltd,VWC,,True,1997,other,other,other,vwc,user-group,pay when scheme fails,on failure,salty,salty,enough,enough,shallow well,shallow well,groundwater,other,other


In [5]:
train_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47520 entries, 3607 to 56422
Data columns (total 40 columns):
id                       47520 non-null int32
amount_tsh               47520 non-null float64
date_recorded            47520 non-null datetime64[ns]
funder                   44644 non-null category
gps_height               47520 non-null int16
installer                44631 non-null category
longitude                47520 non-null float64
latitude                 47520 non-null float64
wpt_name                 47520 non-null object
num_private              47520 non-null int16
basin                    47520 non-null category
subvillage               47224 non-null category
region                   47520 non-null category
region_code              47520 non-null int8
district_code            47520 non-null int8
lga                      47520 non-null category
ward                     47520 non-null category
population               47520 non-null int16
public_meeting           

In [6]:
def convert_num_labels_to_cat (df, categories):
    """
    Some columns have integer values, but do not denote any order, 
    so we need to convert them into categories.
    
    :param df: dataframe
    :type  df: pandas.core.frame.DataFrame
    :param categories: list of column names to convert
    :type  categories: list
    """
    
    for cat in categories:
        df[cat] = df[cat].astype('category')
    
    return df

train_X = convert_num_labels_to_cat(train_X, ['region_code', 'district_code'])

In [9]:
# dictz = {
#     'a': [1, 2, 'b'],
#     'd': [5, 6, 11]
# }

# with open('test.json', 'w') as outfile:
#     json.dump (dictz, outfile)
    
# The initial conditions of the class
file = open('starting_info.json')
starting_info = json.load(file)
file.close()
pd.DataFrame(starting_info)

Unnamed: 0,amount_tsh,basin,construction_year,date_recorded,district_code,extraction_type,extraction_type_class,extraction_type_group,funder,gps_height,id,installer,latitude,lga,longitude,management,management_group,num_private,payment,payment_type,permit,population,public_meeting,quality_group,quantity,quantity_group,recorded_by,region,region_code,scheme_management,scheme_name,source,source_class,source_type,subvillage,ward,water_quality,waterpoint_type,waterpoint_type_group,wpt_name
0,[binning_5],[drop],"[median0, shiftmin]",[finddays],[],[drop],[],[drop],[cbinning_50],[absneg],[drop],[cbinning_100],[drop0],[],[drop0],[],[drop],[drop],[drop],[],[mostcommon],[outlierZ_3],[mostcommon],[],[drop],[],[drop],[],[drop],[mostcommon],[drop],[],[drop],[drop],[drop],[drop],[drop],[drop],[],[drop]
1,Large outliers. Can also try log or box cox,99% corr with region and region has higher cor...,year of 0 does not make sense,Need to convert to days.,,100% corr with extraction_type_group and extra...,,100% corr with extraction_type_group and class...,1900 categories,No way to have negative numbers. Might want to...,All IDs are unique.,2145 categories,Zero latitude is not in tanzania,might want to try dropping it since it has hig...,Zero longitude is not in tanzania,,5% corr with label and 100% corr with management,No description on it and has low correlation w...,100% corr with payment_type,,might want to try dropping because 3% corr wit...,has huge outliers. Might also want to try binn...,Might want to try dropping this since it has 7...,,100% corr with quantity group,,there's only 1 value,,100% corr with region,,"2700 categories, 20000 (2/5 of data) are missi...",,"7% corr with label, and trinary with low freq ...",100% corr with source,way too many categories and is represented by ...,"high corr with lga, has many categories. Try b...",100% corr with quality_group and has lower cor...,100% corr with water_type_group and more noisy,,"Intuitively, there should be no relavancy"


In [10]:
class ColumnOperations:
    """
    A class for representing column operations to be performed for the dataframe
    """
    
    def __init__(self, name, operations, justifications):
        """
        :param name: column name
        :type  name: str
        :param operations: a sequence of operations to be performed on the column
        :type  operations: list
        :param justification: justification for operation
        :type  justification: str
        """
        
        self.name = name
        self.operations = operations
        self.justifications = justifications

In [11]:
def create_col_instances(col_dict):
    """
    Create a list of instances of the column operation class
    
    :param col_dict: dictionary where keys are column names and values are the class attributes
    :type  col_dict: dict
    :returns: list of class instances
    :rtype:   list
    """
    
    col_instances = []
    for col_name, attributes in col_dict.items():
        col_instances.append(ColumnOperations(col_name, attributes[0], attributes[1]))
    return col_instances
        
list_of_classes = create_col_instances(starting_info)
#pprint.pprint (list_of_classes)

In [139]:
def perform_operations(df, col_name, operations):
    """
    Execute operations on a certain column in the dataframe.
                                Operations:      Description:
        Numerics
                                drop             drop the entire column
                                log              perform log transformation on the column
                                box cox          perform box cox transformation on the column
                                drop0            drop all values with zeros in it
                                absneg           absolute value the negatives
                                median0          replace 0 with the median
                                binning_NUM      create NUM amount of bins
                                outlierZ_NUM     remove outliers with z score > NUM
                                shiftmin         subtract the columns by the minimum value

        Datetime
                                finddays         convert datetime to days since the first day

        Categorical/Object
                                cbinning_NUM     create bins where each bin must have occurences of NUM or higher
                                mostcommon       replace nan with most common category

    :param df: dataframe 
    :type  df: pandas.core.frame.DataFrame
    :param col_name: name of column
    :type  col_name: str
    :param operations: list of operations to perform on the certain column
    :type  operations: list
    :returns: transformed dataframe 
    :rtype:   pandas.core.frame.DataFrame
    """

    col = df[col_name]

    for operation in operations:
        if operation == 'drop':
            return df.drop(col_name, axis=1)

        elif str(col.dtype) in {'int8', 'int16', 'int32', 'float64'}:
            if operation == 'log':
                col = np.log(1 + col)  # to make sure no divide by zero
            elif operation == 'box cox':
                col = ss.boxcox(col + 0.001)  # to make sure no divide by zero
            elif operation == "drop0":
                df = df[col != 0]
                col = col[col != 0]
            elif operation == "absneg":
                col = col.abs()
            elif operation == "median0":
                from sklearn.preprocessing import Imputer
                col[col == 0] = np.nan
                imputer = Imputer(strategy="median")
                col = imputer.fit_transform(col.values.reshape(-1, 1))
            elif operation.split('_')[0] == 'binning':
                num = int(operation.split('_')[1])
                quantile_list = [i / (num - 1) for i in range(num)]
                print(quantile_list)
                print(quantile_list[1:])
                
                df["DROP_" + col_name] = pd.qcut(
                    col,
                    q=quantile_list,
                    duplicates='raise',
                )
                col = pd.qcut(
                    col,
                    q=quantile_list,
                    duplicates='raise',
                    labels=quantile_list[1:]
                )
            elif operation.split('_')[0] == 'outlierZ':
                z = np.abs(ss.zscore(col))
                keep_values = z < float(operation.split('_')[1])
                df = df[keep_values]
                col = col[keep_values]
            elif operation == "shiftmin":
                col = col - col.min()
            else:
                raise ValueError('Not an available operation for numerics')

        elif str(col.dtype) in {'datetime64[ns]'}:
            if operation == "finddays":
                col = (col - min(col)).dt.days

        elif str(col.dtype) in {'category', 'object'}:
            if operation.split('_')[0] == "cbinning":
                num = float(operation.split('_')[1])
                value_counts = col.value_counts()
                x = col.replace(value_counts)
                df[col_name][df[col_name] == '0'] = np.nan
                df[col_name] = df[col_name].cat.add_categories(['OTHER'])
                df[col_name] = df[col_name].fillna('OTHER')
                df.loc[x < num, col_name] = 'OTHER'
                return df
            elif operation == "mostcommon":
                most_common = col.value_counts().index[0]
                col = col.fillna(most_common)
            else:
                raise ValueError(
                    'Not an available operation for categoricals or objects')

        else:
            raise ValueError('Not an available data type')

    df[col_name] = col

    return df

In [110]:
def perform_operations_with_classes(df, list_of_classes):
    for col_class in list_of_classes:
        df = perform_operations(df, col_class.name, col_class.operations)
    return df
    
#testing_all = perform_operations_with_classes(train_X, list_of_classes)

In [127]:
train_X.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
3607,454,a,2013-02-27,Dmdd,2092,DMDD,35.42602,-4.227446,Narmo,0,Internal,Bashnet Kati,Manyara,21,1,Babati,Bashinet,160,True,GeoData Consultants Ltd,Water Board,,True,1998,gravity,gravity,gravity,water board,user-group,pay per bucket,per bucket,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
50870,510,a,2011-03-17,Cmsr,0,Gove,35.510074,-5.724555,Lukali,0,Internal,Lukali,Dodoma,1,6,Bahi,Lamaiti,0,True,GeoData Consultants Ltd,VWC,,True,0,india mark ii,india mark ii,handpump,vwc,user-group,never pay,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump
20413,14146,a,2011-07-10,Kkkt,0,KKKT,32.499866,-9.081222,Mahakama,0,Lake Rukwa,Chawalikozi,Mbeya,12,6,Mbozi,Ndalambo,0,True,GeoData Consultants Ltd,VWC,,False,0,other,other,other,vwc,user-group,never pay,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,other,other
52806,47410,a,2011-04-12,,0,,34.060484,-8.830208,Shule Ya Msingi Chosi A,0,Rufiji,Shuleni,Mbeya,12,7,Mbarali,Chimala,0,True,GeoData Consultants Ltd,VWC,,True,0,gravity,gravity,gravity,vwc,user-group,pay monthly,monthly,soft,good,insufficient,insufficient,river,river/lake,surface,communal standpipe,communal standpipe
50091,1288,e,2011-04-05,Ki,1023,Ki,37.03269,-6.040787,Kwa Mjowe,0,Wami / Ruvu,Ngholong,Morogoro,5,1,Kilosa,Chakwale,120,True,GeoData Consultants Ltd,VWC,,True,1997,other,other,other,vwc,user-group,pay when scheme fails,on failure,salty,salty,enough,enough,shallow well,shallow well,groundwater,other,other


In [142]:
#perform_operations(train_X, 'id', ['drop']).head()
#perform_operations(train_X, 'amount_tsh', ['outlierZ_3'])#.head()
#perform_operations(train_X, 'amount_tsh', ['median0'])#.head()
#test = perform_operations(train_X, 'gps_height', ['absneg'])#.head()
#test = perform_operations(train_X, 'population', ['outlierZ'])#.head()
#test = perform_operations(train_X, 'construction_year', ['median0', 'shiftmin'])#.head()
#test = perform_operations(train_X, 'public_meeting', ["mostcommon"])#.head()
#test['public_meeting'].isnull().any()
test = perform_operations(train_X, 'amount_tsh', ['binning_2']).head()
test

[0.0, 1.0]
[1.0]


Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,DROP_amount_tsh
3607,454,1.0,2013-02-27,Dmdd,2092,DMDD,35.42602,-4.227446,Narmo,0,Internal,Bashnet Kati,Manyara,21,1,Babati,Bashinet,160,True,GeoData Consultants Ltd,Water Board,,True,1998,gravity,gravity,gravity,water board,user-group,pay per bucket,per bucket,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe,"(-0.001, 350000.0]"
50870,510,1.0,2011-03-17,Cmsr,0,Gove,35.510074,-5.724555,Lukali,0,Internal,Lukali,Dodoma,1,6,Bahi,Lamaiti,0,True,GeoData Consultants Ltd,VWC,,True,0,india mark ii,india mark ii,handpump,vwc,user-group,never pay,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump,"(-0.001, 350000.0]"
20413,14146,1.0,2011-07-10,Kkkt,0,KKKT,32.499866,-9.081222,Mahakama,0,Lake Rukwa,Chawalikozi,Mbeya,12,6,Mbozi,Ndalambo,0,True,GeoData Consultants Ltd,VWC,,False,0,other,other,other,vwc,user-group,never pay,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,other,other,"(-0.001, 350000.0]"
52806,47410,1.0,2011-04-12,,0,,34.060484,-8.830208,Shule Ya Msingi Chosi A,0,Rufiji,Shuleni,Mbeya,12,7,Mbarali,Chimala,0,True,GeoData Consultants Ltd,VWC,,True,0,gravity,gravity,gravity,vwc,user-group,pay monthly,monthly,soft,good,insufficient,insufficient,river,river/lake,surface,communal standpipe,communal standpipe,"(-0.001, 350000.0]"
50091,1288,1.0,2011-04-05,Ki,1023,Ki,37.03269,-6.040787,Kwa Mjowe,0,Wami / Ruvu,Ngholong,Morogoro,5,1,Kilosa,Chakwale,120,True,GeoData Consultants Ltd,VWC,,True,1997,other,other,other,vwc,user-group,pay when scheme fails,on failure,salty,salty,enough,enough,shallow well,shallow well,groundwater,other,other,"(-0.001, 350000.0]"


In [137]:
def drop_col_with_DROP(df):
    """
    Drop columns that have a name of DROP_XXX.
    
    :param df: dataframe
    :type  df: pandas.core.frame.DataFrame
    :returns: dataframe of dropped columns
    :rtype:   pandas.core.frame.DataFrame
    """
    
    for col in df.columns:
        if col.split('_')[0] == 'DROP':
            df = df.drop([col], axis=1)
    return df

In [138]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataCleaning(BaseEstimator, TransformerMixin):
    """
    Class for performing data cleaning, so the same can be applied to the test case
    """
    
    def __init__(self, class_list=list_of_classes):
        self.class_list = class_list
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X = drop_col_with_DROP(X)
        X_transformed = perform_operations_with_classes(X, self.class_list)
        return X_transformed

In [77]:
trans = DataCleaning()
testing_all = trans.transform(train_X)

[0.0, 0.25, 0.5, 0.75, 1.0]




In [78]:
testing_all.head()

Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,region,district_code,lga,population,public_meeting,scheme_management,permit,construction_year,extraction_type_class,management,payment_type,quality_group,quantity_group,source,waterpoint_type_group
3607,"(20.0, 350000.0]",3339,Dmdd,2092,OTHER,35.42602,-4.227446,Manyara,1,Babati,160,True,Water Board,True,38.0,gravity,water board,per bucket,good,insufficient,spring,communal standpipe
50870,"(-0.001, 20.0]",2626,Cmsr,0,Gove,35.510074,-5.724555,Dodoma,6,Bahi,0,True,VWC,True,40.0,handpump,vwc,never pay,good,enough,shallow well,hand pump
20413,"(-0.001, 20.0]",2741,Kkkt,0,KKKT,32.499866,-9.081222,Mbeya,6,Mbozi,0,True,VWC,False,40.0,other,vwc,never pay,good,enough,shallow well,other
52806,"(-0.001, 20.0]",2652,OTHER,0,OTHER,34.060484,-8.830208,Mbeya,7,Mbarali,0,True,VWC,True,40.0,gravity,vwc,monthly,good,insufficient,river,communal standpipe
50091,"(20.0, 350000.0]",2645,Ki,1023,OTHER,37.03269,-6.040787,Morogoro,1,Kilosa,120,True,VWC,True,37.0,other,vwc,on failure,salty,enough,shallow well,other


In [79]:
testing_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45517 entries, 3607 to 56422
Data columns (total 22 columns):
amount_tsh               45517 non-null category
date_recorded            45517 non-null int64
funder                   45517 non-null category
gps_height               45517 non-null int16
installer                45517 non-null category
longitude                45517 non-null float64
latitude                 45517 non-null float64
region                   45517 non-null category
district_code            45517 non-null category
lga                      45517 non-null category
population               45517 non-null int16
public_meeting           45517 non-null category
scheme_management        45517 non-null category
permit                   45517 non-null category
construction_year        45517 non-null float64
extraction_type_class    45517 non-null category
management               45517 non-null category
payment_type             45517 non-null category
quality_group     

In [80]:
def split_cat_num (df):
    """
    Split the dataframe into two dataframes, 
    where one contains the numeric datatypes 
    and the other contains categorical datatypes.
    
    :param df: dataframe
    :type  df: pandas.core.frame.DataFrame
    :returns: (df with numeric dtypes, df with categorical dtypes)
    :rtype:   (pandas.core.frame.DataFrame, pandas.core.frame.DataFrame)
    """
    
    col_names = set (df.columns)
    types = set ([str(dtype) for dtype in df.dtypes.values])
    
    num_cols = df.select_dtypes(include=['int8', 'int16', 'int32', 'int64', 'float64'])
    cat_cols = df.select_dtypes(include=['category'])
    
    num_col_names = set (num_cols.columns)
    cat_col_names = set (cat_cols.columns)
    
    missing_col_names = col_names.difference(num_col_names).difference(cat_col_names)
    
    if len (missing_col_names) != 0: 
        print ("Columns Missing:", missing_col_names)
    
    return list (num_cols.columns), list (cat_cols.columns)
    
num_cols, cat_cols = split_cat_num(testing_all)

In [81]:
cat_cols

['amount_tsh',
 'funder',
 'installer',
 'region',
 'district_code',
 'lga',
 'public_meeting',
 'scheme_management',
 'permit',
 'extraction_type_class',
 'management',
 'payment_type',
 'quality_group',
 'quantity_group',
 'source',
 'waterpoint_type_group']

In [84]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_cols),
        ("cat", OneHotEncoder(), cat_cols),
    ])

In [85]:
test = full_pipeline.fit_transform(testing_all)
test

TypeError: int() argument must be a string, a bytes-like object or a number, not 'pandas._libs.interval.Interval'

In [60]:
#full_pipeline.fit_transform(train_X)

In [74]:
testing_all

Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,region,district_code,lga,population,public_meeting,scheme_management,permit,construction_year,extraction_type_class,management,payment_type,quality_group,quantity_group,source,waterpoint_type_group
3607,"(20.0, 350000.0]",3339,Dmdd,2092,OTHER,35.426020,-4.227446,Manyara,1,Babati,160,True,Water Board,True,38.0,gravity,water board,per bucket,good,insufficient,spring,communal standpipe
50870,"(-0.001, 20.0]",2626,Cmsr,0,Gove,35.510074,-5.724555,Dodoma,6,Bahi,0,True,VWC,True,40.0,handpump,vwc,never pay,good,enough,shallow well,hand pump
20413,"(-0.001, 20.0]",2741,Kkkt,0,KKKT,32.499866,-9.081222,Mbeya,6,Mbozi,0,True,VWC,False,40.0,other,vwc,never pay,good,enough,shallow well,other
52806,"(-0.001, 20.0]",2652,OTHER,0,OTHER,34.060484,-8.830208,Mbeya,7,Mbarali,0,True,VWC,True,40.0,gravity,vwc,monthly,good,insufficient,river,communal standpipe
50091,"(20.0, 350000.0]",2645,Ki,1023,OTHER,37.032690,-6.040787,Morogoro,1,Kilosa,120,True,VWC,True,37.0,other,vwc,on failure,salty,enough,shallow well,other
16521,"(-0.001, 20.0]",2770,Hesawa,0,DWE,33.509112,-2.648505,Mwanza,2,Magu,0,True,VWC,True,40.0,handpump,vwc,never pay,salty,enough,shallow well,hand pump
52225,"(-0.001, 20.0]",3341,World Vision,0,World vision,33.731347,-3.284633,Shinyanga,2,Maswa,0,True,WUG,False,40.0,handpump,wug,other,good,seasonal,shallow well,hand pump
9440,"(-0.001, 20.0]",2630,OTHER,298,OTHER,36.864072,-7.935517,Morogoro,3,Kilombero,250,True,VWC,True,49.0,handpump,vwc,never pay,good,insufficient,shallow well,hand pump
41885,"(-0.001, 20.0]",2764,Government Of Tanzania,0,Government,33.423658,-2.606991,Mwanza,2,Magu,0,True,VWC,True,40.0,motorpump,vwc,never pay,good,enough,machine dbh,communal standpipe
54042,"(-0.001, 20.0]",3303,Government Of Tanzania,1141,DWE,30.381136,-4.640729,Kigoma,2,Kasulu,1520,True,Water authority,False,49.0,handpump,vwc,never pay,good,enough,shallow well,hand pump


In [86]:
trial = testing_all.drop(['amount_tsh'], axis=1)

In [88]:
tnum_cols, tcat_cols = split_cat_num(trial)
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, tnum_cols),
        ("cat", OneHotEncoder(), tcat_cols),
    ])
ttest = full_pipeline.fit_transform(testing_all)

In [91]:
ttest.toarray().shape

(45517, 428)

In [96]:
"cat".categories_

AttributeError: 'str' object has no attribute 'categories_'

In [108]:
z = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
z[1:]

[0.2, 0.4, 0.6, 0.8, 1.0]