In [1]:
import pandas as pd
import numpy as np
import datetime

# preprocessing imports
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from category_encoders import TargetEncoder
import category_encoders as ce
from sklearn.impute import SimpleImputer

from sklearn.model_selection import cross_val_score, train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score

pd.options.mode.chained_assignment = None

## Helpers

In [2]:
def date_parser(df):
    date_recorder = list(map(lambda x: datetime.datetime.strptime(str(x), '%Y-%m-%d'),
                             df['date_recorded'].values))
    df['year_recorder'] = list(map(lambda x: int(x.strftime('%Y')), date_recorder))
    df['yearly_week_recorder'] = list(map(lambda x: int(x.strftime('%W')), date_recorder))
    df['month_recorder'] = list(map(lambda x: int(x.strftime('%m')), date_recorder))
    df['age'] = df['year_recorder'].values - df['construction_year'].values
    del df['date_recorded']
    return df

def make_mi_scores(X, y):
    mi_scores = mutual_info_classif(X, y)
    mi_scores = pd.Series(mi_scores, name="MI_SCORES", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

def target_encode_multiclass(X_train, y, X_test, target_encoded_cols):
    
    y=y.astype(str)
    
    enc=ce.OneHotEncoder().fit(y)
    y_onehot=enc.transform(y)
    class_names=y_onehot.columns 
    
    X_obj=X_train.select_dtypes('object') 
    X_train=X_train.select_dtypes(exclude='object')
    
    X_test_obj=X_test.select_dtypes('object')
    X_test=X_test.select_dtypes(exclude='object')
    
    for class_ in class_names:
        enc=ce.TargetEncoder()
        enc.fit(X_obj,y_onehot[class_]) 
        
        X_train_temp=enc.transform(X_obj)       
        X_train_temp.columns=[str(x)+'_'+str(class_) for x in X_train_temp.columns]
        
        X_test_temp=enc.transform(X_test_obj)      
        X_test_temp.columns=X_train_temp.columns
        
        target_encoded_cols.extend(list(X_train_temp.columns))
        X_train=pd.concat([X_train, X_train_temp],axis=1)    
        X_test=pd.concat([X_test, X_test_temp],axis=1)    
      
    return X_train, X_test, target_encoded_cols

## Dataset

In [3]:
# training data
training_features_df = pd.read_csv('dataset/training_features.csv', index_col='id')
training_labels_df = pd.read_csv('dataset/training_labels.csv', index_col='id')

# testing data
testing_features_df = pd.read_csv('dataset/test_features.csv', index_col='id')

In [4]:
status_group_mappings = {'functional': 1, 'non functional':2, 'functional needs repair': 3}
training_labels_df['status_group'] = training_labels_df['status_group'].map(status_group_mappings)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(training_features_df, 
                                                    training_labels_df,
                                                    test_size=0.3, random_state=42)
datasets = [X_train, X_test]

In [6]:
training_features_df = training_features_df.join(training_labels_df)


## Pre-Processing

### 1. create new features using `date_recorded` and `construction_year`

In [7]:
for dataset in datasets:
    dataset = date_parser(dataset)

training_features_df = date_parser(training_features_df)
testing_features_df = date_parser(testing_features_df)

In [8]:
numeric_cols = [ 'amount_tsh',
                 'gps_height',
                 'longitude', 
                 'latitude',
                 'region_code',
                 'district_code',
                 'population',
                 'yearly_week_recorder',
                 'month_recorder',
                 'age'
               ]

### 2. numeric features are scaled and imputed with `mean`

In [9]:
# training and testing features
for col in numeric_cols:
    scaler = StandardScaler()
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    
    sclaer = scaler.fit(X_train[[col]])
    for dataset in datasets:
        dataset[col] = scaler.transform(dataset[[col]]).ravel()
                
    imputer= imputer.fit(X_train[[col]])
    for dataset in datasets:
        dataset[col] = imputer.transform(dataset[[col]]).ravel()

# submission features
for col in numeric_cols:
    scaler = StandardScaler()
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    
    sclaer = scaler.fit(training_features_df[[col]])
    training_features_df[col] = scaler.transform(training_features_df[[col]]).ravel()
    testing_features_df[col]  = scaler.transform(testing_features_df[[col]]).ravel()
                
    imputer= imputer.fit(training_features_df[[col]])
    training_features_df[col] = scaler.transform(training_features_df[[col]]).ravel()
    testing_features_df[col]  = scaler.transform(testing_features_df[[col]]).ravel()
    

### 3. create a new feature `distance` using `latitude` and `longitude`

In [10]:
# training and testing features
X_train['distance'] = (X_train.latitude**2 + X_train.longitude**2)**0.5
X_test['distance'] = (X_test.latitude**2 + X_test.longitude**2)**0.5

# submission features 
training_features_df['distance'] = (training_features_df.latitude**2 + training_features_df.longitude**2)**0.5
testing_features_df['distance'] = (testing_features_df.latitude**2 + testing_features_df.longitude**2)**0.5

### 4. create a new feature `angle` using `latitude` and `longitude`

In [11]:
# training and testing features
X_train["angle"] = np.arctan(X_train["latitude"]/X_train["longitude"])
X_test["angle"] = np.arctan(X_test["latitude"]/X_test["longitude"])


training_features_df["angle"] = np.arctan(training_features_df["latitude"]/training_features_df["longitude"])
testing_features_df["angle"] = np.arctan(testing_features_df["latitude"]/testing_features_df["longitude"])

### 5. PCA for `latitude` and `longitude`

In [12]:
# training and testing features
pca = PCA().fit(X_train[['latitude', 'longitude']])
X_train['distance_pca0'] = pca.transform(X_train[['latitude', 'longitude']])[:, 0]
X_train['distance_pca1'] = pca.transform(X_train[['latitude', 'longitude']])[:, 1]
X_test['distance_pca0'] = pca.transform(X_test[['latitude', 'longitude']])[:, 0]
X_test['distance_pca1'] = pca.transform(X_test[['latitude', 'longitude']])[:, 1]

pca1 = PCA().fit(training_features_df[['latitude', 'longitude']])
training_features_df['distance_pca0'] = pca1.transform(training_features_df[['latitude', 'longitude']])[:, 0]
training_features_df['distance_pca1'] = pca1.transform(training_features_df[['latitude', 'longitude']])[:, 1]
testing_features_df['distance_pca0'] = pca1.transform(testing_features_df[['latitude', 'longitude']])[:, 0]
testing_features_df['distance_pca1'] = pca1.transform(testing_features_df[['latitude', 'longitude']])[:, 1]

In [13]:
pca_cols = ['distance', 'distance_pca0', 'distance_pca1', 'angle' ]

In [14]:
cols_to_ohe = ['quantity','management_group','source_class']

In [15]:
ohe_after_cols = []

### 6. categorical features used to ohe are imputed with `mode`

In [16]:
# training and testing features
for col in cols_to_ohe:
    imputer = SimpleImputer(strategy='most_frequent')
            
    imputer= imputer.fit(X_train[[col]])
    for dataset in datasets:
        dataset[col] = imputer.transform(dataset[[col]]).ravel()

# submission features
for col in cols_to_ohe:
    imputer = SimpleImputer(strategy='most_frequent')
            
    imputer= imputer.fit(training_features_df[[col]])
    training_features_df[col] = imputer.transform(training_features_df[[col]]).ravel()
    testing_features_df[col]  = imputer.transform(testing_features_df[[col]]).ravel()

### 7. ohe categorical features

In [17]:
# training and testing features
ohe =OneHotEncoder(handle_unknown='ignore', sparse=False)
        
train_cols = pd.DataFrame(ohe.fit_transform(X_train[cols_to_ohe]))
test_cols = pd.DataFrame(ohe.transform(X_test[cols_to_ohe]))

ohe_after_cols = train_cols.columns.values

train_cols.index = X_train.index
test_cols.index = X_test.index
    
X_train = X_train.drop(cols_to_ohe, axis=1)
X_test = X_test.drop(cols_to_ohe, axis=1)
    
X_train = pd.concat([X_train, train_cols], axis=1)
X_test = pd.concat([X_test, test_cols], axis=1)

# submission features
ohe =OneHotEncoder(handle_unknown='ignore', sparse=False)
        
train_cols = pd.DataFrame(ohe.fit_transform(training_features_df[cols_to_ohe]))
test_cols = pd.DataFrame(ohe.transform(testing_features_df[cols_to_ohe]))

ohe_after_cols = train_cols.columns.values

train_cols.index = training_features_df.index
test_cols.index = testing_features_df.index
    
training_features_df = training_features_df.drop(cols_to_ohe, axis=1)
testing_features_df = testing_features_df.drop(cols_to_ohe, axis=1)
    
training_features_df = pd.concat([training_features_df, train_cols], axis=1)
testing_features_df = pd.concat([testing_features_df, test_cols], axis=1)

In [18]:
ordinal_columns = [ 'basin',  
                   'payment',
                   'payment_type',
                   'permit', 
                   'quantity_group',
                   'water_quality',
                   'quality_group',
                   'region', 
                   'extraction_type_group', 
                   'extraction_type',
                   'source',
                   'source_type',
                   'waterpoint_type',
                   'waterpoint_type_group',
                   'scheme_management',
                   'subvillage',
                   'ward',
                   'wpt_name'
                 ]

### 8. categorical features used to ordinal are imputed with `mode`

In [19]:
# training and testing features
for col in ordinal_columns:
    mode = X_train[col].mode()[0]
    X_test[col] = X_test[col].fillna(mode)
    X_train[col] = X_train[col].fillna(mode)

# submission features
for col in ordinal_columns:
    mode = X_train[col].mode()[0]
    training_features_df[col] = training_features_df[col].fillna(mode)
    testing_features_df[col] = testing_features_df[col].fillna(mode)

### 9. ordinal categorical features

In [20]:
# training and testing features
ordinal_encoder = OrdinalEncoder(handle_unknown='ignore')
X_train[ordinal_columns] = ordinal_encoder.fit_transform(X_train[ordinal_columns])
X_test[ordinal_columns] = ordinal_encoder.transform(X_test[ordinal_columns])

# submission features
final_ordinal_encoder = OrdinalEncoder(handle_unknown='ignore')
training_features_df[ordinal_columns] = final_ordinal_encoder.fit_transform(training_features_df[ordinal_columns])
testing_features_df[ordinal_columns] = final_ordinal_encoder.transform(testing_features_df[ordinal_columns])

In [21]:
final_target_encoded_cols = []

In [22]:
target_encoder_columns = ['lga', 'installer', 'funder','extraction_type_class', 'management']

### 10. categorical features used to target encode are imputed with `constant`

In [23]:
# training and testing features
for col in target_encoder_columns:
    X_test[col] = X_test[col].fillna('missing_value')
    X_train[col] = X_train[col].fillna('missing_value')

# submission features
for col in target_encoder_columns:
    training_features_df[col] = training_features_df[col].fillna('missing_value')
    testing_features_df[col] = testing_features_df[col].fillna('missing_value')

### 11. Target Encode categorical features

In [24]:
# training and testing features
X_tar_train, X_tar_test, final_target_encoded_cols = target_encode_multiclass(X_train[target_encoder_columns], 
                                                   y_train.status_group, 
                                                   X_test[target_encoder_columns],
                                                   final_target_encoded_cols)

X_train = pd.concat([X_train, X_tar_train], axis=1)
X_test = pd.concat([X_test, X_tar_test], axis=1)

X_train = X_train.drop(target_encoder_columns, axis=1)
X_test = X_test.drop(target_encoder_columns, axis=1)

# submission features
full_dataset_encoded_columns = []

X_tar_train, X_tar_test, full_dataset_encoded_columns = target_encode_multiclass(
                                                   training_features_df[target_encoder_columns], 
                                                   training_labels_df.status_group, 
                                                   testing_features_df[target_encoder_columns],
                                                   full_dataset_encoded_columns)

training_features_df = pd.concat([training_features_df, X_tar_train], axis=1)
testing_features_df = pd.concat([testing_features_df, X_tar_test], axis=1)

training_features_df = training_features_df.drop(target_encoder_columns, axis=1)
testing_features_df = testing_features_df.drop(target_encoder_columns, axis=1)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [25]:
# final column list
all_columns = final_target_encoded_cols + numeric_cols + list(ohe_after_cols) + ordinal_columns + pca_cols

## Feature Selection

In [26]:
# generate MI Scores

scores = make_mi_scores(X_train[all_columns], y_train.status_group)

In [27]:
selected_columns = []
for inx in list(scores.index):
#     print(inx, scores.at[inx])
    if(scores.at[inx] > 0.0001):
        selected_columns.append(inx)

## Model Creation

In [28]:
rf_model = RandomForestClassifier(n_estimators=1300,
                                  random_state=42,
                                  max_features='auto', 
                                  max_depth=60, 
                                  min_samples_split=6,
                                  min_samples_leaf=3,
                                  bootstrap=False )

In [29]:
%%time

rf_model.fit(X_train[selected_columns], y_train.status_group)

Wall time: 4min 27s


RandomForestClassifier(bootstrap=False, max_depth=60, min_samples_leaf=3,
                       min_samples_split=6, n_estimators=1300, random_state=42)

In [30]:
# prediction 
predic = rf_model.predict(X_test[selected_columns]).transpose()
accuracy_score(y_test.status_group, predic)

0.8127384960718294

### Submission

In [31]:
mi_scores = make_mi_scores(training_features_df[all_columns], training_features_df.status_group)

final_columns = []
for inx in list(mi_scores.index):
#     print(inx, mi_scores.at[inx])
    if(mi_scores.at[inx] > 0.0006):
        final_columns.append(inx)

In [32]:
# final model training

# rf_model.fit(training_features_df[final_columns], training_features_df['status_group'])

In [33]:
# prediction = rf_model.predict(testing_features_df[final_columns]).transpose()

In [34]:
# status_group_re_mappings = {1: 'functional', 2 : 'non functional', 3: 'functional needs repair'}
# output = pd.DataFrame({'id': testing_features_df.index, 'status_group': prediction })
# output['status_group'] = output.status_group.map(status_group_re_mappings)
# output.to_csv('final_submission.csv', index=False)