In [1]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.tree import DecisionTreeClassifier
import pandas_profiling

In [2]:
#read in data
local = '../data/tanzania/'

train = pd.merge(pd.read_csv(local+'train_features.csv'),
                 pd.read_csv(local+'train_labels.csv'))
test = pd.read_csv(local+'test_features.csv')
sample_submission = pd.read_csv(local+'sample_submission.csv')
#split train data into train and validation
train, val = train_test_split(train, train_size=0.8, test_size = 0.2,
                              stratify = train['status_group'], random_state=42)
train.shape, val.shape

((47520, 41), (11880, 41))

In [3]:
def Wrangle(X):
    #This function will help us handle strange or missing values
    X = X.copy()
    
    #The latitude column has a very values extremely close to zero, we're going to fix that
    X['latitude'] = X['latitude'].replace(-2e-08, 0)
    
    #When we have 0's and know we shouldn't, it's best to replace them with nans
    #In this case, we'll use the column mean to replace them
    cols_with_zeros = ['construction_year', 'longitude', 'latitude']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
        X[col] = X[col].fillna(X[col].mean())
    
    #Convert date recorded to datetime format
    X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format = True)
    
    #We have some columns that are duplicates, so we'll drop one of them here
    X = X.drop(columns = 'quantity_group')
    
    #Now we replace missing categorical values with the MISSING category
    categoricals = X.select_dtypes(exclude = 'number').columns
    for col in categoricals:
        X[col] = X[col].fillna('MISSING')
        
    return X

train = Wrangle(train)
val = Wrangle(val)
test = Wrangle(test)

In [4]:
train.profile_report()

KeyboardInterrupt: 

In [5]:
#Separate training set into features and target
target = 'status_group'
#dropping id, target, and redundant features. Source_type was chosen because it was identical to source
#they are just grouped differently. Waterpoint_type_group was chosen for the same reason
train_features = train.drop(columns =[target,'id', 'payment_type','source_type', 'waterpoint_type_group'] )
#List of the numeric features
numeric_features = train_features.select_dtypes(include = 'number').columns.tolist()
#Examine cardinality of categorical features to discover encoding candidates
cardinality = train_features.select_dtypes(exclude = 'number').nunique()

recorded_by                  1
public_meeting               3
permit                       3
source_class                 3
management_group             5
quantity                     5
quality_group                6
payment                      7
extraction_type_class        7
waterpoint_type              7
water_quality                8
basin                        9
source                      10
management                  12
scheme_management           13
extraction_type_group       13
extraction_type             18
region                      21
lga                        124
date_recorded              349
funder                    1717
installer                 1930
ward                      2082
scheme_name               2564
subvillage               17232
wpt_name                 30661
dtype: int64

In [6]:
#Get a list of all categories with a cardinality of 21 or less
categorical_features = cardinality[cardinality<=21].index.tolist()

#create our feature list
features = categorical_features + numeric_features

In [7]:
#Arrange into X_features and y_target matrices
X_train = train[features]
X_val = val[features]
X_test = test[features]

y_train = train[target]
y_val = val[target]

#perform encoding and scaling of features
encoder = ce.OneHotEncoder(use_cat_names = True)
X_train_encoded = encoder.fit_transform(X_train)
X_val_encoded = encoder.transform(X_val)
X_test_encoded = encoder.transform(X_test)

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_val_scaled = scaler.transform(X_val_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

In [8]:
#Fit to decision tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)

#Score on validation set
print(f'Validation Accuracy Score: {dt.score(X_val_scaled, y_val)}')

Validation Accuracy Score: 0.7560606060606061


In [42]:
X_train.columns

Index(['basin', 'region', 'public_meeting', 'recorded_by', 'scheme_management',
       'permit', 'extraction_type', 'extraction_type_group',
       'extraction_type_class', 'management', 'management_group', 'payment',
       'water_quality', 'quality_group', 'quantity', 'source', 'source_class',
       'waterpoint_type', 'amount_tsh', 'gps_height', 'longitude', 'latitude',
       'num_private', 'region_code', 'district_code', 'population',
       'construction_year'],
      dtype='object')

In [27]:
#Try ordinal encoding on a few features
or_dicts = [{'col':'quality_group', 'mapping': {'good':5,'fluoride':4, 'salty':3,'colored':2,'milky':1,'unknown':0}},
           {'col':'quantity','mapping': {'enough':4,'seasonal':3,'insufficient':2,'dry':1,'unknown':0}}]

or_encoder = ce.OrdinalEncoder(cols = ['quality_group','quantity'],mapping = or_dicts)
X_train_or = or_encoder.fit_transform(X_train)
X_val_or = or_encoder.transform(X_val)


In [43]:
encoder_2 = ce.OneHotEncoder(use_cat_names = True, cols = ['basin', 'region', 'public_meeting', 'recorded_by', 
                                                           'scheme_management','permit', 'extraction_type',
                                                           'extraction_type_group','extraction_type_class', 
                                                           'management', 'management_group', 'payment','water_quality', 
                                                           'source', 'source_class','waterpoint_type', 'amount_tsh', 
                                                           'gps_height', 'longitude', 'latitude','num_private', 'region_code', 
                                                           'district_code', 'population','construction_year'])

In [45]:
#onehotencode remaining categories and scale data
X_train_or_oh = encoder_2.fit_transform(X_train_or)
X_val_or_oh = encoder_2.transform(X_val_or)
X_train_or_scaled = scaler.fit_transform(X_train_or_oh)
X_val_or_scaled = scaler.transform(X_val_or_oh)

MemoryError: 

In [47]:
X_train_or_oh['quantity']

KeyError: 'quantity'

In [41]:
#Fit to random forest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 50, criterion = 'entropy', max_depth = 50)

rfc.fit(X_train_or_scaled, y_train)

#Score on validation set
print(f'Validation Accuracy Score:{rfc.score(X_val_or_scaled,y_val)}')

Validation Accuracy Score:0.8047138047138047


In [None]:
dt_pred = dt.predict(X_test_scaled)

submission = sample_submission.copy()
submission['status_group'] = dt_pred
submission.to_csv('Submission-01.csv', index = False)