# Lab | Random Forests






# Import Dependencies & Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
# Read the three lab tables in one pass; the order of the paths matches the
# order of the names on the left-hand side.
categorical, numerical, target = map(
    pd.read_csv,
    (
        "files_for_lab/categorical.csv",
        "files_for_lab/numerical.csv",
        "files_for_lab/target.csv",
    ),
)

# Data Exploration

In [3]:
# Column names, non-null counts and dtypes of the categorical table
categorical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   STATE         95412 non-null  object
 1   CLUSTER       95412 non-null  int64 
 2   HOMEOWNR      95412 non-null  object
 3   GENDER        95412 non-null  object
 4   DATASRCE      95412 non-null  int64 
 5   RFA_2R        95412 non-null  object
 6   RFA_2A        95412 non-null  object
 7   GEOCODE2      95412 non-null  object
 8   DOMAIN_A      95412 non-null  object
 9   DOMAIN_B      95412 non-null  int64 
 10  ODATEW_YR     95412 non-null  int64 
 11  ODATEW_MM     95412 non-null  int64 
 12  DOB_YR        95412 non-null  int64 
 13  DOB_MM        95412 non-null  int64 
 14  MINRDATE_YR   95412 non-null  int64 
 15  MINRDATE_MM   95412 non-null  int64 
 16  MAXRDATE_YR   95412 non-null  int64 
 17  MAXRDATE_MM   95412 non-null  int64 
 18  LASTDATE_YR   95412 non-null  int64 
 19  LAST

In [4]:
# Column names, non-null counts and dtypes of the target table
target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   TARGET_B  95412 non-null  int64  
 1   TARGET_D  95412 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.5 MB


In [5]:
# Check class balance: relative frequency of each TARGET_B value
target['TARGET_B'].value_counts(normalize=True)

0    0.949241
1    0.050759
Name: TARGET_B, dtype: float64

In [6]:
# About 95% of the rows have TARGET_B == 0, so the target is heavily imbalanced; we'll have to keep this in mind when modeling

In [7]:
# Size, dtype breakdown and memory footprint of the numerical table
numerical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Columns: 315 entries, TCODE to CLUSTER2
dtypes: float64(9), int64(306)
memory usage: 229.3 MB


In [8]:
# Count missing values (NaN) per column of the numerical table
numerical.isna().sum()

TCODE       0
AGE         0
INCOME      0
WEALTH1     0
HIT         0
           ..
AVGGIFT     0
CONTROLN    0
HPHONE_D    0
RFA_2F      0
CLUSTER2    0
Length: 315, dtype: int64

# Data Cleaning & Preprocessing

In [9]:
# TARGET_D goes one level deeper than the question we are asking, so it is
# removed and only TARGET_B remains in `target`.
target = target.drop(columns=['TARGET_D'])

In [10]:
# RFA_2R holds the same value for every data point, so it carries no signal
categorical = categorical.drop(columns=['RFA_2R'])

In [11]:
# Put the three tables side by side (column-wise) in a single frame
df = pd.concat(
    [numerical, categorical, target],
    axis="columns",
)

In [12]:
# Sanity check of the combined frame: row count, column count and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Columns: 337 entries, TCODE to TARGET_B
dtypes: float64(9), int64(322), object(6)
memory usage: 245.3+ MB


In [13]:
# X-y split
# Features: every column except the label.
X = df.drop(columns=['TARGET_B'])
# Label as a 1-D Series. scikit-learn estimators expect y with shape
# (n_samples,); passing a one-column DataFrame makes fit() emit a
# DataConversionWarning.
y = df['TARGET_B']

In [14]:
# One-hot encode the non-numeric columns of X; drop_first removes the first
# level of each encoded column so the dummies are not redundant.
X = pd.get_dummies(X, drop_first = True)

In [15]:
# Train/test split: hold out 20% of the rows, stratified on y so both splits
# keep the same class proportions; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    shuffle=True,
    stratify=y,
)

# Modeling

In [16]:
# Since we have seen above that the data is heavily imbalanced (95%/5%), we'll directly apply an oversampling technique

In [17]:
# Oversample with SMOTE and create the classifier.
# Only the training split is resampled; the test split is left untouched.
# random_state is fixed so the synthetic samples, and therefore the reported
# metrics, are reproducible when the notebook is re-run from a fresh kernel.
smote = SMOTE(k_neighbors = 10, random_state = 42)
X_train_SMOTE,y_train_SMOTE = smote.fit_resample(X_train, y_train)

# Seeded random forest used for both the baseline and the SMOTE experiment.
rfc = RandomForestClassifier(random_state = 10)

In [18]:
# function

def model(classifier, X_train, X_test, y_train, y_test):
    """Fit ``classifier`` on the training data and print a test-set report.

    Parameters
    ----------
    classifier : object with ``fit(X, y)`` and ``predict(X)`` methods.
        The object passed in is fitted directly; no copy is made.
    X_train, y_train : training features and labels handed to ``fit``.
    X_test, y_test : held-out features and labels used for the report.

    Returns
    -------
    None. The classification report for the test split is printed.
    """
    classifier.fit(X_train, y_train)
    # Only the test-set predictions are reported, so the training set is not
    # predicted: that result was never used and is expensive to compute.
    pred_test = classifier.predict(X_test)
    print(classification_report(y_test, pred_test))

In [19]:
# Baseline: fit the random forest on the original (imbalanced) training split
# and report on the test split...

model(rfc, X_train, X_test, y_train, y_test)

  classifier.fit(X_train,y_train)


              precision    recall  f1-score   support

           0       0.95      1.00      0.97     18114
           1       0.00      0.00      0.00       969

    accuracy                           0.95     19083
   macro avg       0.47      0.50      0.49     19083
weighted avg       0.90      0.95      0.92     19083



In [20]:
# ...versus the same classifier fitted on the SMOTE-resampled training split;
# the report is still computed on the original, un-resampled test split
model(rfc, X_train_SMOTE, X_test, y_train_SMOTE, y_test)

  classifier.fit(X_train,y_train)


              precision    recall  f1-score   support

           0       0.95      1.00      0.97     18114
           1       0.04      0.00      0.01       969

    accuracy                           0.95     19083
   macro avg       0.50      0.50      0.49     19083
weighted avg       0.90      0.95      0.92     19083



##### We can see that the model trained on the imbalanced data shows very good accuracy, but that is mainly because 95% of the target values are 0; it actually performs very badly on the 1 values (recall of 0.00).
##### Sadly, oversampling with SMOTE does not solve this problem either.
