In [1]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Reading the data
categorical = pd.read_csv('categorical.csv')
numerical = pd.read_csv('numerical.csv')
target = pd.read_csv('target.csv')

## Apply the Random Forests algorithm but this time only by upscaling the data.

In [3]:
# X-y-Split

X = pd.concat([categorical, numerical], axis=1)
y = target.drop(['TARGET_D'], axis = 1)

In [4]:
numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes(np.object)

# we OneHotEncode the categoricals so we can use the same dataset to perform a regression later (in the lab).
# it is not needed for a DecisionTree or RandomForest model
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out())
X = pd.concat([numericalX, encoded_categorical], axis = 1)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categoricalX = X.select_dtypes(np.object)


In [5]:
# Train-test-Split before upsampling

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [6]:
# Upsampling

# To upsample, we need to temporarily concat X_train and y_train
trainset = pd.concat([X_train, y_train], axis=1)


from sklearn.utils import resample

category_0 = trainset[trainset['TARGET_B'] == 0]
category_1 = trainset[trainset['TARGET_B'] == 1]

category_1_oversampled = resample(category_1, 
                                  replace=True, 
                                  n_samples = len(category_0))

trainset_up = pd.concat([category_0, category_1_oversampled], axis=0)

X_train = pd.DataFrame(trainset_up.drop(['TARGET_B'], axis=1))
y_train = pd.DataFrame(trainset_up['TARGET_B'])

In [7]:
# Applying Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.2,
                             random_state = 42)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

y_pred = clf.predict(X_test)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

  clf.fit(X_train, y_train)


0.6238859917777226
0.6013729497458471


TARGET_B
0           18083
1            1000
dtype: int64

array([[10928,  7155],
       [  452,   548]], dtype=int64)

In [68]:
# # Commenting it out so it doesn't always run for ages

# # For cross validation
# from sklearn.model_selection import cross_val_score
# clf = RandomForestClassifier(max_depth=5,
#                              min_samples_split=20,
#                              min_samples_leaf =20)
# cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
# print(np.mean(cross_val_scores))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


0.6188987845350111


In [69]:
cross_val_scores

array([0.62243068, 0.62077528, 0.61171277, 0.62192178, 0.61647237,
       0.62102504, 0.61495482, 0.62330137, 0.62095606, 0.61543768])

## Use Feature Selections that you have learned in class to decide if you want to use all of the features (PCA, etc)

In [8]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numerical_scaled = scaler.fit_transform(numerical)

scaled_df = pd.DataFrame(numerical_scaled, columns=numerical.columns)

### RFE (Recursive Feature Selection) 

In [9]:
X = numerical_scaled
y = target['TARGET_B']

from sklearn.feature_selection import RFE
from sklearn import linear_model
lm = linear_model.LinearRegression()
rfe = RFE(lm, n_features_to_select=23, verbose=False)
rfe.fit(X, y)

RFE(estimator=LinearRegression(), n_features_to_select=23, verbose=False)

In [10]:
df = pd.DataFrame(data = rfe.ranking_, columns=['Rank'])
df['Column_name'] = pd.DataFrame(numerical).columns
df[df['Rank']==1]

Unnamed: 0,Rank,Column_name
13,1,POP901
15,1,POP903
16,1,POP90C1
17,1,POP90C2
18,1,POP90C3
74,1,DW1
76,1,DW3
77,1,DW4
78,1,DW5
122,1,HUPA1


In [11]:
selected_columns = df[df['Rank']==1]

In [12]:
keep_list = list(selected_columns['Column_name'])
keep_list

['POP901',
 'POP903',
 'POP90C1',
 'POP90C2',
 'POP90C3',
 'DW1',
 'DW3',
 'DW4',
 'DW5',
 'HUPA1',
 'HUPA2',
 'MC1',
 'MC2',
 'TPE3',
 'TPE4',
 'TPE5',
 'TPE6',
 'RAMNTALL',
 'NGIFTALL',
 'MINRAMNT',
 'MAXRAMNT',
 'LASTGIFT',
 'AVGGIFT']

In [19]:
numerical_scaled_rfe = scaled_df[['POP901',  'POP903',  'POP90C1',  'POP90C2',  'POP90C3',  'DW1',  'DW3',  'DW4',  'DW5',  'HUPA1',  'HUPA2',  'MC1',  'MC2',  'TPE3',  'TPE4',  'TPE5',  'TPE6',  'RAMNTALL',  'NGIFTALL',  'MINRAMNT',  'MAXRAMNT',  'LASTGIFT',  'AVGGIFT']]

## Upscaling Dataframe with less features

In [20]:
# X-y-Split

X = pd.concat([categorical, numerical_scaled_rfe], axis=1)
y = target.drop(['TARGET_D'], axis = 1)

In [21]:
numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes(np.object)

# we OneHotEncode the categoricals so we can use the same dataset to perform a regression later (in the lab).
# it is not needed for a DecisionTree or RandomForest model
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out())
X = pd.concat([numericalX, encoded_categorical], axis = 1)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categoricalX = X.select_dtypes(np.object)


In [22]:
# Train-test-Split before upsampling

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [23]:
# Upsampling

# To upsample, we need to temporarily concat X_train and y_train
trainset = pd.concat([X_train, y_train], axis=1)


from sklearn.utils import resample

category_0 = trainset[trainset['TARGET_B'] == 0]
category_1 = trainset[trainset['TARGET_B'] == 1]

category_1_oversampled = resample(category_1, 
                                  replace=True, 
                                  n_samples = len(category_0))

trainset_up = pd.concat([category_0, category_1_oversampled], axis=0)

X_train = pd.DataFrame(trainset_up.drop(['TARGET_B'], axis=1))
y_train = pd.DataFrame(trainset_up['TARGET_B'])

In [24]:
# Applying Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.2,
                             random_state = 42)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

y_pred = clf.predict(X_test)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

  clf.fit(X_train, y_train)


0.5973498330712137
0.6309280511449982


TARGET_B
0           18083
1            1000
dtype: int64

array([[11530,  6553],
       [  490,   510]], dtype=int64)

In [25]:
# Commenting it out so it doesn't always run for ages

# For cross validation
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print(np.mean(cross_val_scores))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


0.5947631137082767


In [26]:
cross_val_scores

array([0.59642709, 0.59497862, 0.59605436, 0.59495068, 0.59019107,
       0.5948817 , 0.59826171, 0.59460578, 0.59281231, 0.59446782])

In [29]:
model1 = DecisionTreeClassifier()
from sklearn.linear_model import LogisticRegression
model2 = LogisticRegression()
model3 = RandomForestClassifier()

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25)

model_pipeline = [model1, model2, model3]
model_names = ['Decision Tree Classifier', 'Logistic Regression', 'Random Forest Classifier']
scores = {}
for model, model_name in zip(model_pipeline, model_names):
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=10))
    scores[model_name] = mean_score
print(scores)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


{'Decision Tree Classifier': 0.9703528825772632, 'Logistic Regression': 0.5749179020137388, 'Random Forest Classifier': 0.9999793060633234}


In [30]:
val_scores = {}

for model, model_name in zip(model_pipeline,model_names):
    model.fit(X_train, y_train)
    val_scores[model_name] = model.score(X_test,y_test)
print(val_scores)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  model.fit(X_train, y_train)


{'Decision Tree Classifier': 0.8970811717235235, 'Logistic Regression': 0.6119058848189488, 'Random Forest Classifier': 0.9475973379447676}
