# Load Items

In [1]:
import json
import pandas as pd
import numpy as np
import condense_csv
import pprint
import scipy.stats as ss

In [2]:
train_X = pd.read_csv('../data/train.csv')
train_y = pd.read_csv('../data/train_labels.csv')  
train_X = condense_csv.compress_X(train_X)
train_y = condense_csv.compress_labels(train_y)

memory used before preprocess:  19.00808

date time size before: 3.9798799999999996
date time size after:  0.47528 

converting funder                         size:  3.86	->	 0.33
converting installer                      size:  3.64	->	 0.34
converting basin                          size:  4.03	->	 0.06
converting subvillage                     size:  3.85	->	 2.03
converting region                         size:  3.78	->	 0.06
converting lga                            size:  3.83	->	 0.07
converting ward                           size:  3.83	->	 0.34
converting public_meeting                 size:  2.1	->	 0.06
converting recorded_by                    size:  4.75	->	 0.06
converting scheme_management              size:  3.55	->	 0.06
converting scheme_name                    size:  3.15	->	 0.4
converting permit                         size:  2.06	->	 0.06
converting extraction_type                size:  3.84	->	 0.06
converting extraction_type_group          size:  3.85	->	 0.06
con

In [3]:
train_X.info()#.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 40 columns):
id                       59400 non-null int32
amount_tsh               59400 non-null float64
date_recorded            59400 non-null datetime64[ns]
funder                   55765 non-null category
gps_height               59400 non-null int16
installer                55745 non-null category
longitude                59400 non-null float64
latitude                 59400 non-null float64
wpt_name                 59400 non-null object
num_private              59400 non-null int16
basin                    59400 non-null category
subvillage               59029 non-null category
region                   59400 non-null category
region_code              59400 non-null int8
district_code            59400 non-null int8
lga                      59400 non-null category
ward                     59400 non-null category
population               59400 non-null int16
public_meeting           560

In [4]:
# dictz = {
#     'a': [1, 2, 'b'],
#     'd': [5, 6, 11]
# }

# with open('test.json', 'w') as outfile:
#     json.dump (dictz, outfile)
    
# The initial conditions of the class
file = open('starting_info.json')
starting_info = json.load(file)
file.close()
starting_info
#pd.DataFrame(starting_info)

{'amount_tsh': [['binning_5'], 'Large outliers. Can also try log or box cox'],
 'date_recorded': [['finddays'], 'Need to convert to days.'],
 'funder': [['cbinning_50'], '1900 categories'],
 'id': [['drop'], 'All IDs are unique.']}

In [5]:
class ColumnOperations:
    """
    A class for representing column operations to be performed for the dataframe
    """
    
    def __init__(self, name, operations, justifications):
        """
        :param name: column name
        :type  name: str
        :param operations: a sequence of operations to be performed on the column
        :type  operations: list
        :param justification: justification for operation
        :type  justification: str
        """
        
        self.name = name
        self.operations = operations
        self.justifications = justifications

In [6]:
def create_col_instances(col_dict):
    """
    Create a list of instances of the column operation class
    
    :param col_dict: dictionary where keys are column names and values are the class attributes
    :type  col_dict: dict
    :returns: list of class instances
    :rtype:   list
    """
    
    col_instances = []
    for col_name, attributes in col_dict.items():
        col_instances.append(ColumnOperations(col_name, attributes[0], attributes[1]))
    return col_instances
        
list_of_classes = create_col_instances(starting_info)
pprint.pprint (list_of_classes)

[<__main__.ColumnOperations object at 0x1a1925b5f8>,
 <__main__.ColumnOperations object at 0x1a1925b550>,
 <__main__.ColumnOperations object at 0x1a1925b4e0>,
 <__main__.ColumnOperations object at 0x1a1925b4a8>]


In [7]:
#x[0].justifications

def perform_operations(df, col_name, operations):
    """
    Execute operations on a certain column in the dataframe.
        Operations:      Description:
        drop             drop the entire column
        log              perform log transformation on the column
        box cox          perform box cox transformation on the column
        binning_NUM      create NUM amount of bins
        outlierZ_NUM     remove outliers with z score > NUM
        drop0            drop all values with zeros in it
        median0          replace 0 with the median
        finddays         convert datetime to days since the first day
        cbinning_NUM     create bins where each bin must have occurences of NUM or higher
    
    :param df: dataframe 
    :type  df: pandas.core.frame.DataFrame
    :param col_name: name of column
    :type  col_name: str
    :param operations: list of operations to perform on the certain column
    :type  operations: list
    :returns: transformed dataframe 
    :rtype:   pandas.core.frame.DataFrame
    """
    
    col = df[col_name]
    
    for operation in operations:
        if operation == 'drop':
            return df.drop(col_name, axis=1)
        
        elif str (col.dtype) in {'int8', 'int16', 'int32', 'float64'}:
            if operation == 'log':
                col = np.log(1 + col) # to make sure no divide by zero
            elif operation == 'box cox':
                col = ss.boxcox(col + 0.001) # to make sure no divide by zero
            elif operation == "drop0":
                df = df[col != 0]
                col = col[col != 0]
            elif operation == "median0":
                from sklearn.preprocessing import Imputer
                col[col == 0] = np.nan
                imputer = Imputer(strategy="median")
                col = imputer.fit_transform(col.values.reshape(-1, 1))
                #print (col)
            elif operation.split('_')[0] == 'binning':
                num = int (operation.split('_')[1])
                quantile_list = [i  / (num - 1) for i in range (num )]
                print (quantile_list)
                col = pd.qcut(
                    col, 
                    q=quantile_list,
                    duplicates='drop'
                )
            elif operation.split('_')[0] == 'outlierZ':
                z = np.abs(ss.zscore(col))
                keep_values = z < float (operation.split('_')[1])
                df = df[keep_values]
                col = col[keep_values]
            else:
                raise ValueError('Not an available operation')
        
        elif str(col.dtype) in {'datetime64[ns]'}:
            if operation == "finddays":
                col = (col - min(col)).dt.days
        elif str (col.dtype) in {'category', 'object'}:
            #print ('Did not do this yet', col)
            if operation.split('_')[0] == "cbinning":
                num = float (operation.split('_')[1])
                value_counts = col.value_counts()
                x = col.replace(value_counts)
                df[col_name][df[col_name] == '0'] = np.nan
                df[col_name] = df[col_name].cat.add_categories(['OTHER'])
                df[col_name] = df[col_name].fillna('OTHER')
                df.loc[x < num, col_name] = 'OTHER'
                return df
                
        else:
            raise ValueError('Not an available data type')
    
    df[col_name] = col 
    
    return df

In [8]:
def perform_operations_with_classes(df, list_of_classes):
    for col_class in list_of_classes:
        df = perform_operations(df, col_class.name, col_class.operations)
    return df
    
#perform_operations_with_classes(train_X, list_of_classes)

In [9]:
#perform_operations(train_X, 'id', ['drop']).head()
#perform_operations(train_X, 'amount_tsh', ['binning_6']).head()
#perform_operations(train_X, 'amount_tsh', ['outlierZ_3'])#.head()
#perform_operations(train_X, 'amount_tsh', ['median0'])#.head()
#perform_operations(train_X, 'funder', ['cbinning_50'])#.head()

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataCleaning(BaseEstimator, TransformerMixin):
    """
    Class for performing data cleaning, so the same can be applied to the test case
    """
    
    def __init__(self, class_list=list_of_classes):
        self.class_list = class_list
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = perform_operations_with_classes(X, self.class_list)
        return X_transformed

In [11]:
trans = DataCleaning()
xxx = trans.transform(train_X)

[0.0, 0.25, 0.5, 0.75, 1.0]


In [12]:
xxx#.head()

Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,"(20.0, 350000.0]",3073,Roman,1390,Roman,34.938093,-9.856322e+00,none,0,Lake Nyasa,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,"(-0.001, 20.0]",3796,Grumeti,1399,GRUMETI,34.698766,-2.147466e+00,Zahanati,0,Lake Victoria,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,"(20.0, 350000.0]",3787,OTHER,686,World vision,37.460664,-3.821329e+00,Kwa Mahundi,0,Pangani,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,"(-0.001, 20.0]",3759,Unicef,263,UNICEF,38.486161,-1.115530e+01,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,"(-0.001, 20.0]",3194,OTHER,0,Artisan,31.130847,-1.825359e+00,Shuleni,0,Lake Victoria,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
5,"(-0.001, 20.0]",3072,Mkinga Distric Coun,0,DWE,39.172796,-4.765587e+00,Tajiri,0,Pangani,...,per bucket,salty,salty,enough,enough,other,other,unknown,communal standpipe multiple,communal standpipe
6,"(-0.001, 20.0]",3640,Dwsp,0,DWSP,33.362410,-3.766365e+00,Kwa Ngomho,0,Internal,...,never pay,soft,good,enough,enough,machine dbh,borehole,groundwater,hand pump,hand pump
7,"(-0.001, 20.0]",3648,Rwssp,0,DWE,32.620617,-4.226198e+00,Tushirikiane,0,Lake Tanganyika,...,unknown,milky,milky,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump
8,"(-0.001, 20.0]",3673,Wateraid,0,Water Aid,32.711100,-5.146712e+00,Kwa Ramadhan Musa,0,Lake Tanganyika,...,never pay,salty,salty,seasonal,seasonal,machine dbh,borehole,groundwater,hand pump,hand pump
9,"(-0.001, 20.0]",3215,OTHER,0,Artisan,30.626991,-1.257051e+00,Kwapeto,0,Lake Victoria,...,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump


In [13]:
train_X.columns
#str (train_X['date_recorded'].dtype) # 'datetime64[ns]'

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group'],
      dtype='object')

In [14]:
#train_X