# Lab Random Forest

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample

## Importing data and cleaning

In [2]:
# Read the three lab CSV files (numerical features, categorical features and
# targets) and place them side by side in a single frame.
numerical, categorical, targets = (
    pd.read_csv(csv_path)
    for csv_path in (
        './files_for_lab/numerical.csv',
        './files_for_lab/categorical.csv',
        './files_for_lab/target.csv',
    )
)
data = pd.concat([numerical, categorical, targets], axis=1)

# Class balance of the binary target.
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [3]:
# Split the rows by the value of the binary target so that each class can be
# resampled separately.
target_b = data['TARGET_B']
category_0 = data[target_b == 0]
category_1 = data[target_b == 1]

In [4]:
# Oversample the TARGET_B == 1 rows with replacement until there are as many of
# them as TARGET_B == 0 rows. A fixed random_state makes the draw reproducible
# across kernel restarts.
# NOTE(review): this runs before the train/test split further down, so copies
# of the same row can end up in both the training and the test set.
category_1_oversampled = resample(category_1,
                                  replace=True,
                                  n_samples=len(category_0),
                                  random_state=0)

In [5]:
# Stack both classes, shuffle the rows and renumber the index from 0.
# random_state pins the shuffle so that every later split and score is
# reproducible after a kernel restart.
data = (
    pd.concat([category_0, category_1_oversampled], axis=0)
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)
print(data.shape)

(181138, 339)


In [6]:
# Separate the classification target from the features.
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis=1)

# Split the features by dtype. The string 'object' replaces the deprecated
# np.object alias (deprecated in NumPy 1.20 and removed in later releases).
numericalX = X.select_dtypes(include=np.number)
categoricalX = X.select_dtypes(include='object')

# We one-hot encode the categoricals so we can use the same dataset to perform
# a regression later (in the lab).
# It is not needed for a DecisionTree or RandomForest model.
encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
# The resulting frame has a default integer index and integer column labels.
encoded_categorical = pd.DataFrame(encoded_categorical)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categoricalX = X.select_dtypes(np.object)


In [7]:
# Reassemble the feature matrix from the numeric columns and the one-hot
# encoded categoricals, then hold out 20% of the rows for testing.
feature_parts = [numericalX, encoded_categorical]
X = pd.concat(feature_parts, axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [8]:
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)

# TARGET_D is kept aside as the target for the later regression.
y_train_regression, y_test_regression = X_train['TARGET_D'], X_test['TARGET_D']

# With the regression target saved, remove TARGET_D from the set of features.
X_train = X_train.drop(columns=['TARGET_D'])
X_test = X_test.drop(columns=['TARGET_D'])

## Creating the model

In [12]:
# Baseline random forest classifier. random_state fixes the bootstrap samples
# and feature sub-sampling so the scores below are reproducible.
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf=20,
                             random_state=0)
clf.fit(X_train, y_train)
# Accuracy on the training split, then on the test split.
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.6166310123524946
0.6125924699127746


In [13]:
# Same hyperparameters as the baseline, evaluated with 10-fold cross-validation
# on the training split. random_state makes the forest built in each fold
# reproducible.
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf=20,
                             random_state=0)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
# Average score over the 10 folds.
print(np.mean(cross_val_scores))

0.6118142295217721


### False positive vs False negative

In [None]:
## In my opinion, a false positive is worse: I prefer to know the exact number of real positives,
## because knowing that number lets me weigh the expected benefits against the expenses. If there are false negatives, so much the better; if there are none, at least I have the real data.

## Trying to improve the model by deleting some numerical columns

In [21]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the numeric columns with MinMaxScaler, fitted on the same data it
# transforms.
scaler = MinMaxScaler().fit(numericalX)
numerical_scaled = scaler.transform(numericalX)

In [22]:
from sklearn.feature_selection import VarianceThreshold

# Variance cutoff passed to the selector as its threshold.
var_threshold = 0.02
sel = VarianceThreshold(threshold=var_threshold)

In [23]:
# Fit the selector on the scaled columns, then apply it to the same data.
# Wrapping the result in a DataFrame gives the columns integer labels.
sel = sel.fit(numerical_scaled)
temp = pd.DataFrame(sel.transform(numerical_scaled))
temp.shape

(181138, 90)

In [28]:
# Reduced feature matrix: selected numeric columns, the one-hot encoded
# categoricals, and TARGET_D (split off again after the train/test split).
X_impr = pd.concat(
    [
        temp,
        encoded_categorical,
        data['TARGET_D'],
    ],
    axis=1,
)

In [29]:
# Same test_size and random_state as the earlier split of X.
X_train_impr, X_test_impr, y_train_impr, y_test_impr = train_test_split(
    X_impr,
    y,
    test_size=0.2,
    random_state=0,
)

In [30]:
X_train_impr, X_test_impr = pd.DataFrame(X_train_impr), pd.DataFrame(X_test_impr)

# TARGET_D is kept aside as the regression target for the reduced feature set.
y_train_regression_impr = X_train_impr['TARGET_D']
y_test_regression_impr = X_test_impr['TARGET_D']

# With the regression target saved, remove TARGET_D from the set of features.
X_train_impr = X_train_impr.drop(columns=['TARGET_D'])
X_test_impr = X_test_impr.drop(columns=['TARGET_D'])

In [35]:
# Same forest configuration as the baseline, trained on the reduced feature
# set. random_state makes the fitted forest, and therefore the scores below,
# reproducible.
clf_impr = RandomForestClassifier(max_depth=5,
                                  min_samples_split=20,
                                  min_samples_leaf=20,
                                  random_state=0)
clf_impr.fit(X_train_impr, y_train_impr)
# Accuracy on the training split, then on the test split.
print(clf_impr.score(X_train_impr, y_train_impr))
print(clf_impr.score(X_test_impr, y_test_impr))

0.6129942723069491
0.6082864083029701


## Trying to predict the amount of money

In [36]:
# Will hold the index labels of the test rows whose TARGET_B is 1
# (populated by a later cell).
list_1 = []

In [38]:
# Turn the test target into a two-column frame: 'index' keeps the original row
# labels and 'TARGET_B' keeps the class.
reset = y_test_impr.reset_index()

In [39]:
# Display the frame to check the 'index' / 'TARGET_B' layout.
reset

Unnamed: 0,index,TARGET_B
0,63554,1
1,93091,1
2,85771,1
3,45065,0
4,77682,1
...,...,...
36223,52227,0
36224,29914,1
36225,145073,0
36226,161935,1


In [40]:
# Index labels of the test rows whose TARGET_B is 1.
# A boolean mask replaces the row-by-row Python loop, and the list is rebuilt
# from scratch so re-running this cell cannot append the same labels twice.
list_1 = reset.loc[reset['TARGET_B'] == 1, 'index'].tolist()

In [43]:
# Number of test rows with TARGET_B == 1.
len(list_1)

18057

In [42]:
# list_1 holds index *labels* taken from y_test_impr, so select by label with
# .loc. Positional .iloc gives the same rows only while X_impr carries a
# default 0..n-1 index.
newX = X_impr.loc[list_1]
newX

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,TARGET_D
63554,0.463918,0.500000,0.666667,0.121212,0.454545,1.000000,0.000000,0.000000,1.000000,0.898990,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,25.0
93091,0.628866,0.500000,0.888889,0.313131,0.000000,0.666667,1.000000,0.000000,0.000000,0.858586,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0
85771,0.608247,0.333333,0.222222,0.434343,0.575758,0.666667,0.525253,0.000000,0.484848,0.949495,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,25.0
77682,0.402062,0.666667,1.000000,0.353535,0.141414,1.000000,0.000000,0.737374,0.272727,0.858586,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,20.0
108573,0.639175,1.000000,1.000000,0.373737,0.424242,1.000000,0.383838,0.000000,0.626263,0.959596,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28190,0.742268,0.666667,1.000000,0.393939,0.363636,1.000000,1.000000,0.000000,0.000000,0.878788,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
100498,0.624862,0.666667,1.000000,0.000000,0.000000,1.000000,1.000000,0.000000,0.000000,0.888889,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,10.0
29914,0.577320,1.000000,1.000000,0.575758,0.060606,0.888889,1.000000,0.000000,0.000000,0.949495,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,25.0
161935,0.624862,0.333333,0.666667,0.414141,0.373737,0.666667,0.000000,0.000000,1.000000,0.989899,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,10.0


In [49]:
# Regression target: TARGET_D. It is then removed from the feature frame.
newy = newX['TARGET_D']
newX = newX.drop(columns=['TARGET_D'])

In [50]:
# Hold out 20% of the TARGET_B == 1 rows to evaluate the regressor.
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    newX,
    newy,
    test_size=0.2,
    random_state=0,
)

In [53]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor predicting TARGET_D for the rows with TARGET_B == 1.
# random_state makes the fitted forest, and therefore the scores below,
# reproducible.
# NOTE(review): these rows come from a dataset that was oversampled with
# replacement, so identical rows may sit on both sides of this split and
# inflate the test score - confirm before relying on it.
clf_new = RandomForestRegressor(max_depth=5,
                                min_samples_split=20,
                                min_samples_leaf=20,
                                random_state=0)
clf_new.fit(X_train_new, y_train_new)
# R^2 on the training split, then on the test split.
print(clf_new.score(X_train_new, y_train_new))
print(clf_new.score(X_test_new, y_test_new))

0.5503409581933042
0.5165226197360053


In [54]:
# Inspect the held-out TARGET_D values used to score the regressor.
y_test_new

7494      30.0
138221    25.0
174132    15.0
167552    50.0
173463    15.0
          ... 
24612     25.0
102049    10.0
40344     20.0
146439    50.0
156447     5.0
Name: TARGET_D, Length: 3612, dtype: float64

In [55]:
# Inspect the TARGET_D values the regressor was trained on.
y_train_new

115720    14.0
169947    20.0
39781     21.0
136993     3.0
37187     23.0
          ... 
88940     10.0
120068    27.0
116312    10.0
160462    15.0
174043    15.0
Name: TARGET_D, Length: 14445, dtype: float64