# Lab Unit 7 | Random Forests

## Import libraries

In [146]:
#Standard libraries for data analysis:
    
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import norm, skew, shapiro
from scipy import stats
import statsmodels.api as sm
import re #regex

# sklearn modules for data preprocessing:
import sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

#sklearn modules for Model Selection:
from sklearn import svm, tree, linear_model, neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV


#Standard libraries for data visualization:
import seaborn as sns
from scipy.stats import boxcox 
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib 
%matplotlib inline
color = sns.color_palette()
import matplotlib.ticker as mtick
from pandas.plotting import scatter_matrix
from sklearn.metrics import roc_curve


pd.set_option('display.max_columns', None)

# Functions

## X/Y Split 

In [147]:
def x_y(df):
    X = df.drop(['TARGET_B','TARGET_D'], axis = 1)
    y = df['TARGET_B']
    return X,y

## Train / Test Split

In [148]:
def train_test(X,y,n):
      
    # Split into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n)
    
    return X_train, X_test, y_train, y_test


## Categorical vs Numerical features

In [149]:
def Xnum_Xcat(X_train, X_test):
    
    #split into numerical and categorical
    X_train_cat = X_train.select_dtypes(include = object)
    X_train_num =X_train.select_dtypes(include = np.number)

    X_test_cat = X_test.select_dtypes(include = object)
    X_test_num =X_test.select_dtypes(include = np.number)
    
    display(X_train_cat.shape)
    display(X_train_num.shape)
    display(X_test_cat.shape)
    display(X_test_num.shape)
    
    return X_train_cat, X_train_num, X_test_cat, X_test_num

## Scaling

In [150]:
def scaling_encoding(X_num,X_cat):
    

    # MinMaxScale numerical features to ensure that all variables are on the same scale
    scaler = MinMaxScaler().fit(X_num)

    X_num_scaled = scaler.transform(X_num)
    X_num_scaled = pd.DataFrame(X_num_scaled)
    X_num_scaled.columns = X_num.columns
    X_num_scaled.reset_index(drop = True, inplace = True)
    
    
    
    # Encode to ensure that all variables are on the same scale

    encoder = OneHotEncoder(drop='first').fit(X_cat)
    
    cols = encoder.get_feature_names_out(input_features=X_cat.columns)
    X_cat_encoded = pd.DataFrame(encoder.transform(X_cat).toarray(),columns=cols)
    X_cat_encoded.reset_index(drop = True, inplace = True)


    
    return X_num_scaled, X_cat_encoded

In [151]:
def concat_df(df1,df2):
    
    # Combine two dataframes, one next to other
    X_normalized = pd.concat([df1,df2], axis = 1)
    
    return X_normalized

## Feature Selection

#### Variance Threshold Feature Selection

In [152]:
# to look at VarianceThresholds we need all the variable to be on the same scale
# Variance Threshold Feature Selection only works with numerical data, encoding categorical data is required 

In [153]:

def variance_threshold(var_threshold,df):
    
    # use the estimator with a low threshold (minimum value of variance we want in out dataset)
    selection = VarianceThreshold(threshold=(var_threshold))

    # Fit
    selection = selection.fit(df)
    
    # Subset the DataFrame
    data_variance = selection.transform(df)
    data_variance = pd.DataFrame(data_variance)
    
    # Get list of features removed
    var_list = list(selection.get_support())
    
    drop_columns = [col[0] for col in zip(df.columns, var_list) if col[1] == False]
    
    return drop_columns

In [154]:
def drop_features(df, original_df, drop_columns):
    
    df_cleaned = df.drop(drop_columns, axis = 1)
    
    print('Original set: ', original_df.shape)
    print('New set: ', df_cleaned.shape)
    
    return df_cleaned

#### Kbest

In [155]:
def KBest(X, target, k):
    
    K_best = SelectKBest(chi2, k=k).fit_transform(X, target)
    
    
    # feature extraction
    model = SelectKBest(chi2, k=k).fit(X, target)
    df = pd.DataFrame(data = model.scores_, columns=['score'])
    df['column_name']= X.columns

    # summarize selected features
    display(df.sort_values(by = ['score'],ascending = False).head(k))
    
    # Add columns to drop to a list
    cols = df.sort_values(by = ['score'],ascending = False).head(k)['column_name']
    drop_columns = list(cols)
    
    
    return drop_columns

#### Recursive Feature Elimination

In [182]:
# Recursive Feature Elimination

def RFE_regression(model,n,X,y):
    
    # define the method
    method = model(max_iter = 500)
    rfe = RFE(method, n_features_to_select=n, verbose=False)
    
    # fit the model
    rfe.fit(X, y)
    
    # Summarize features
    df = pd.DataFrame(data = rfe.ranking_, columns=['Rank'])
    df['Column_name'] = pd.DataFrame(X).columns
    df = df[df['Rank']==1]
    
    # Create a list with columns to keep
    columns_lst = list(df['Column_name'].values)
    
    
    return columns_lst
    

In [157]:
def keep_features(X, original_df, columns_lst):
    # Show only most important features
    df2 = X[[*columns_lst]]
    
    
    print('Original set: ',original_df.shape) # to be adjust according to the project
    print('RFE set: ',df2.shape)
    
    return df2

## Build the model

In [158]:
def forest(X_train,y_train,X_test,y_test):
    # Init, fit, score
    forest = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)

    _ = forest.fit(X_train, y_train)


    # Training Score
    print(f"Training Score: {forest.score(X_train, y_train)}")

    print(f"Test Score: {forest.score(X_test, y_test)}")

    
    # Make predictions / confusion_matrix
    y_pred = forest.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    
    
    # Evaluate the model
    print("precision: ",precision_score(y_test,y_pred))
    print("recall: ",recall_score(y_test,y_pred))
    print("f1: ",f1_score(y_test,y_pred))

# LAB

## Load dataset

csv

In [159]:
categorical = pd.read_csv('categorical.csv')
numerical = pd.read_csv('numerical.csv')
target = pd.read_csv('target.csv')

In [160]:
data = pd.concat([numerical, categorical, target], axis = 1)
data.shape

(95412, 339)

## X/Y Split

In [161]:
X,y = x_y(data)


## Train / Test Split

In [162]:
X_train, X_test, y_train, y_test = train_test(X,y,0.25)

## Imbalanced data

In [163]:
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [164]:
# Create full df with training data
train_set = pd.concat([X_train,y_train], axis = 1)


#### Upsampling

In [165]:
no_donate = train_set[train_set['TARGET_B']==0]
yes_donate = train_set[train_set['TARGET_B']==1]

In [166]:
from sklearn.utils import resample

yes_donate_oversampled = resample(yes_donate, 
                                    replace=True,
                                    n_samples = len(no_donate),
                                    random_state=42)


display(no_donate.shape)
display(yes_donate_oversampled.shape)

(67942, 338)

(67942, 338)

In [167]:
# Create a full df again
oversampled_target = pd.concat([no_donate,yes_donate_oversampled], axis = 0)


In [168]:
# scrambled rows to avoid yes/no clusters
oversampled_total = oversampled_target.sample(frac=1)

# X/y split after upsampling
X_train_oversampled = oversampled_total.drop(['TARGET_B'], axis = 1)

y_train_oversampled =oversampled_total['TARGET_B']

## Categorical vs Numerical features

In [169]:
X_train_cat, X_train_num, X_test_cat, X_test_num = Xnum_Xcat(X_train_oversampled,X_test)


(135884, 7)

(135884, 330)

(23853, 7)

(23853, 330)

## Scaling

In [170]:
X_train_num_scaled, X_train_cat_scaled = scaling_encoding(X_train_num, X_train_cat)


In [171]:
X_test_num_scaled, X_test_cat_scaled = scaling_encoding(X_test_num, X_test_cat)

In [172]:
X_train_normalized = concat_df(X_train_num_scaled, X_train_cat_scaled)

In [173]:
X_test_normalized = concat_df(X_test_num_scaled, X_test_cat_scaled)

## Feature Selection

#### Variance Threshold Feature Selection

In [174]:
drop_columns = variance_threshold(0.02,X_train_normalized)
#drop_columns

Remove columns with low variance

In [175]:
X_train_normalized_cleaned = drop_features(X_train_normalized, X_train, drop_columns)

Original set:  (71559, 337)
New set:  (135884, 114)


In [176]:
X_test_normalized_cleaned = drop_features(X_test_normalized, X_test, drop_columns)

Original set:  (23853, 337)
New set:  (23853, 114)


#### KBest

In [178]:
drop_columns = KBest(X_train_normalized_cleaned,y_train_oversampled,25)

Unnamed: 0,score,column_name
77,1294.551818,RFA_2F
106,894.011855,RFA_2A_G
104,607.801459,RFA_2A_E
87,431.772841,LASTDATE_YR
105,382.165575,RFA_2A_F
27,187.213922,HVP1
28,181.808943,HVP2
29,134.741516,HVP3
32,129.231271,HVP6
30,99.709859,HVP4


Drop columns with the k lowest scores.

In [179]:
X_train_normalized_cleaned_2 = drop_features(X_train_normalized_cleaned, X_train, drop_columns)
X_train_normalized_cleaned_2.shape

Original set:  (71559, 337)
New set:  (135884, 89)


(135884, 89)

In [180]:
X_test_normalized_cleaned_2 = drop_features(X_test_normalized_cleaned, X_test, drop_columns)
X_test_normalized_cleaned_2.shape

Original set:  (23853, 337)
New set:  (23853, 89)


(23853, 89)

#### Recursive Feature Elimination

In [183]:
columns_lst = RFE_regression(LogisticRegression,50,X_train_normalized_cleaned_2,y_train_oversampled)


In [184]:
X_train_normalized_cleaned_RFE = keep_features(X_train_normalized_cleaned_2, X_train, columns_lst)


Original set:  (71559, 337)
RFE set:  (135884, 50)


In [185]:
X_test_normalized_cleaned_RFE = keep_features(X_test_normalized_cleaned_2, X_test, columns_lst)

Original set:  (23853, 337)
RFE set:  (23853, 50)


## Build the model

In [190]:
# Full dataset - all features

forest(X_train_normalized,y_train_oversampled,X_test_normalized,y_test) 

Training Score: 0.6240764181213387
Test Score: 0.7137466985284869
[[16490  6137]
 [  691   535]]
precision:  0.08018585131894485
recall:  0.4363784665579119
f1:  0.13547733603443912


In [191]:
# Dataset after Variance Tr

forest(X_train_normalized_cleaned,y_train_oversampled,X_test_normalized_cleaned,y_test) 

Training Score: 0.6267919696211475
Test Score: 0.611663103173605
[[13885  8742]
 [  521   705]]
precision:  0.07462686567164178
recall:  0.5750407830342578
f1:  0.13210906024547925


In [192]:
# Dataset after Kbest

forest(X_train_normalized_cleaned_2,y_train_oversampled,X_test_normalized_cleaned_2,y_test) 

Training Score: 0.6109107768390686
Test Score: 0.6029011025866767
[[13786  8841]
 [  631   595]]
precision:  0.06305637982195846
recall:  0.48531810766721045
f1:  0.11161132995685612


In [193]:
forest(X_train_normalized_cleaned_RFE,y_train_oversampled,X_test_normalized_cleaned_RFE,y_test) 

Training Score: 0.6050381207500515
Test Score: 0.6028591791388924
[[13800  8827]
 [  646   580]]
precision:  0.061656213458063144
recall:  0.4730831973898858
f1:  0.10909432897582996
