In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, r2_score, mean_squared_log_error
from pandas.plotting import scatter_matrix
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.feature_selection import SelectKBest, f_regression, chi2

In [2]:
def feature_selection_transformation(X, y, target, ordinal_feature):
    # check for numeric features train_data
    num_features = []
    cat_features = []
    for feature in X:
        if X[feature].dtypes == np.int or X[feature].dtypes == np.float:
            num_features.append(feature)
        else:
            cat_features.append(feature)
    
    X = X.reset_index()
    # impute using only numerical features
    imp = IterativeImputer(max_iter = 10, random_state = 42)
    imp.fit(X[num_features])
    X[num_features] = imp.transform(X[num_features])
    
    # impute using only categorical features
    imp = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
    X[cat_features] = imp.fit_transform(X[cat_features].astype(str))
     
    # split dataframe into numeric and categorical data
    X_num = X.drop(cat_features, axis = 1)
    X_cat = X.drop(num_features, axis = 1)
    
    # saleprice correlation matrix
    k_num = round(len(X_num.columns) / 2)
    corrmat = X_num.corr()
    X_num_fs = corrmat.nlargest(k_num, target)[target].index
    
    # check for multicollinearity
    # if two features are strongly correlated with each other (>= 0.7) 
    # the feature with the lower correlation with the target variable is dropped
    multicorr = {}
    k = len(corrmat)
    for feature in corrmat:
        i = 1
        if feature != target:
            while i < k - 1:
                if corrmat[feature][i] >= 0.7 and feature != corrmat.index[i]:
                    multicorr[feature] = corrmat.index[i], corrmat[feature][i]
                i = i + 1
        
    # delete duplicates
    corr_scores = []
    for feature in list(multicorr.keys()):
        if multicorr[feature][1] in corr_scores:
            del multicorr[feature]
        else:
            corr_scores.append(multicorr[feature][1])
            
    # remove the feature with the lower correlation coefficient (pearson)
    dropped_features = [] 
    for feature1, feature2 in multicorr.items():
        if corrmat[target][feature1] < corrmat[target][feature2[0]]:
            dropped_features.append(feature1)
        else:
            dropped_features.append(feature2[0])

    # drop the features from X_num dataframe
    for feature in X_num:
        if feature in dropped_features:
            X_num = X_num.drop(feature, axis = 1) 
    X_num.drop(X_num.columns.difference(X_num_fs), 1, inplace = True)
    
    # encode ordinal features (dummy variables)
    ord_features = pd.Series(list(ordinal_feature))
    if ord_features.isin(X_num):
        ord_data = [ordinal_feature]
        X_num = pd.get_dummies(X_num, columns = ord_data, drop_first = True)
    
    # encode categorical features
    enc = OrdinalEncoder()
    enc.fit(X_cat)
    X_cat_enc = enc.transform(X_cat)
    
    # feature selection on categorical data
    k_cat = round(len(X_cat.columns) / 2)
    fs = SelectKBest(f_regression, k_cat)
    fs.fit(X_cat_enc, y) # save!!
    X_cat_fs = fs.transform(X_cat_enc)
    X_cat_enc = pd.DataFrame(X_cat_fs)
    
    # concatenate numerical and categorical features
    df_cat = pd.DataFrame(X_cat_enc, index = list(range(len(X.index))))
    df_num = pd.DataFrame(X_num, index = list(range(len(X.index))))
    X = pd.concat([df_cat, df_num], axis = 1, sort = False)
    X = X.drop([target], axis = 1)
    return X

In [4]:
def predict_randomforestclass(X, y):
    # build the model (RandomForestClassifier)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
    
    rf = RandomForestClassifier(n_estimators = 100, max_depth = None) 
    rf.fit(X_train, y_train)
    
    # quantifying the quality of prediction
    y_predict = rf.predict(X_test)
    acc_score = accuracy_score(y_test, y_predict)
    
    ret_stmt = 'Accuracy Score: ' + str(acc_score)
    return ret_stmt

In [13]:
def predict():
    train_data = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/titanic_train.csv"))
    
    # select features and target variable
    target = 'Survived'
    ordinal_feature = 'Pclass'
    
    features = list(train_data)
    X = train_data[features]
    y = train_data[target]

    feature_engineering = feature_selection_transformation(X, y, target, ordinal_feature)
    prediction = predict_randomforestclass(feature_engineering, y)
    return feature_engineering         

In [14]:
predict()

KeyError: "None of [Index(['Pclass'], dtype='object')] are in the [columns]"