In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
categorical = pd.read_csv('categorical.csv')
numerical = pd.read_csv('numerical.csv')
target = pd.read_csv('target.csv')

data = pd.concat([categorical,numerical,target],axis=1)


In [22]:
data.shape


(95412, 339)

In [4]:
target_d = data[data['TARGET_D']!=0]


In [5]:
#checking for NaNs
nulls = [column for column in data if data[column].isna().sum() > 0]
nulls       


[]

In [6]:
data['TARGET_B'].value_counts()


0    90569
1     4843
Name: TARGET_B, dtype: int64

In [7]:
#balacning data - upsampling
no_donate = data[data['TARGET_B']==0]
yes_donate = data[data['TARGET_B']==1]

from sklearn.utils import resample
yes_donate_oversampled = resample(yes_donate, 
                                    replace=True,
                                    n_samples = len(no_donate),
                                    random_state=42)


display(no_donate.shape)
display(yes_donate_oversampled.shape)


(90569, 339)

(90569, 339)

In [8]:
oversampled = pd.concat([no_donate,yes_donate_oversampled])


In [9]:
#X-Y Split, train-test split

X = oversampled.drop(['TARGET_B','TARGET_D'],axis=1)
y = oversampled['TARGET_B']


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


In [11]:
#cat/num X-train
#make a split between the continuous and discrete variables as I don't want to scale or encode the discrete ones, I want to keep their current values
X_train_num = X_train.select_dtypes(include = np.number)
X_train_cat = X_train.select_dtypes(object)

#cat/num X-test
X_test_num = X_test.select_dtypes(include = np.number)
X_test_cat = X_test.select_dtypes(object)


from sklearn.preprocessing import StandardScaler
#Scale X-train
transformer = StandardScaler().fit(X_train_num)
X_scaled_train = pd.DataFrame(transformer.transform(X_train_num),columns=X_train_num.columns)

#Scale X-test
X_scaled_test = pd.DataFrame(transformer.transform(X_test_num),columns=X_test_num.columns)
X_scaled_test


Unnamed: 0,CLUSTER,DATASRCE,DOMAIN_B,ODATEW_YR,ODATEW_MM,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,1.182838,0.626022,0.159428,0.531017,-0.029559,-1.320474,-0.311571,1.334569,-0.617088,0.963573,...,-0.329634,0.286884,0.237411,0.623210,-0.444501,0.537244,1.554770,-0.998298,0.829085,-1.159685
1,-0.000362,0.626022,0.159428,1.112893,-0.029559,-1.320474,-0.311571,0.960959,-0.099493,0.412726,...,-0.964663,0.832180,-0.167817,-0.092231,0.148719,0.218165,-0.954379,-0.998298,-0.949825,-0.947620
2,-1.392363,-0.973903,1.535152,-0.050858,-0.029559,0.994498,2.598269,-0.159871,1.194494,0.412726,...,-0.752987,0.286884,0.237411,0.623210,0.593634,0.386909,0.868392,-0.998298,-0.949825,-0.152375
3,0.138838,0.626022,1.535152,1.403831,-0.029559,-1.320474,-0.311571,1.334569,-1.393481,0.963573,...,-0.964663,1.377477,0.034797,0.265490,0.148719,0.678375,1.082045,-0.998298,-0.949825,1.013985
4,-0.000362,-2.573828,0.159428,-0.050858,-0.029559,0.346306,-0.634886,-0.159871,0.676899,0.412726,...,-0.964663,0.286884,-0.127294,-0.020687,-1.186026,-0.057962,-1.699336,-0.998298,-0.949825,-1.053652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54337,1.113238,0.626022,0.159428,-1.505548,-0.029559,-1.320474,-0.311571,1.334569,-1.393481,0.412726,...,2.633833,-0.258413,-0.370430,-0.807672,-0.444501,-0.677155,-1.219564,-0.998298,1.718541,-1.530799
54338,1.391639,0.626022,0.159428,-0.923672,-0.029559,-1.320474,-0.311571,-0.533481,0.935697,-0.138120,...,0.093718,-0.476531,-0.167817,-0.163775,2.224989,-0.426131,0.260573,-0.998298,-0.949825,1.491132
54339,1.391639,-0.973903,0.159428,-0.923672,-0.029559,-1.320474,-0.311571,-0.907091,-0.358291,-2.341506,...,0.305394,-0.694650,0.034797,-0.449952,0.297024,-0.173015,0.771088,-0.998298,-0.060370,1.491132
54340,1.252439,0.626022,0.159428,-1.505548,-0.029559,-0.301886,0.981691,-0.159871,0.935697,0.963573,...,0.728747,-0.258413,-0.137425,-0.038573,-0.444501,-0.266974,1.406762,1.001705,0.829085,1.491132


In [12]:
from sklearn.preprocessing import OneHotEncoder
#Create encoder
encoder = OneHotEncoder(drop='first').fit(X_train_cat)

#encode train
encoded_train = encoder.transform(X_train_cat).toarray()
encoded_train = pd.DataFrame(encoded_train)

#encode test
encoded_test = encoder.transform(X_test_cat).toarray()
encoded_test = pd.DataFrame(encoded_test)


In [13]:
train_scaled = pd.concat([encoded_train,X_scaled_train],axis=1)
test_scaled = pd.concat([encoded_test,X_scaled_test],axis=1)
df = pd.concat([test_scaled,train_scaled],axis=0)


In [21]:
df.shape

(181138, 354)

In [14]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(train_scaled, y_train)
print(clf.score(train_scaled, y_train))
print(clf.score(test_scaled, y_test))




0.6133789709454557




0.6109270913841964


In [15]:
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
cross_val_scores = cross_val_score(clf, train_scaled, y_train, cv=10)
print(np.mean(cross_val_scores))
print(cross_val_scores)




0.6108631619187992
[0.60473186 0.61056782 0.60607256 0.6115142  0.61742902 0.60938486
 0.60864421 0.60998501 0.61448064 0.61582144]


In [16]:
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix


In [17]:
clf = RandomForestClassifier(max_depth=10,
                             min_samples_split=20,
                             min_samples_leaf =20).fit(train_scaled, y_train)

pred_RF = clf.predict(test_scaled)

print('accuracy:', accuracy_score(y_test, pred_RF))
print("precision: ",precision_score(y_test,pred_RF))
print("recall: ",recall_score(y_test,pred_RF))
print("f1: ",f1_score(y_test,pred_RF))




accuracy: 0.8056383644326672
precision:  0.7932486514703323
recall:  0.8314426408900237
f1:  0.8118967052537845


In [18]:
#create a column with the prediction

In [89]:
numerical = data.select_dtypes(include = np.number)
numerical = numerical.drop(['TARGET_B','TARGET_D'], axis = 1)
#cat/num 
categorical = data.select_dtypes(include = object)
display(numerical.shape,categorical.shape)

(95412, 330)

(95412, 7)

In [90]:
from sklearn.preprocessing import StandardScaler

#Scale X-
# transformer = StandardScaler().fit(X_num)
# X_num_scaled = pd.DataFrame(transformer.transform(X_num),columns=X_num.columns)
# X_num_scaled = X_num_scaled.reset_index().drop(['index'], axis=1)
num_scaled = pd.DataFrame(transformer.transform(numerical), columns = numerical.columns, index = numerical.index)

In [92]:
from sklearn.preprocessing import OneHotEncoder

# #Create encoder
# encoder = OneHotEncoder(drop='first').fit(X_cat)

# #encode 
# encoded_X = encoder.transform(X_cat).toarray()
# X_cat_encoded = pd.DataFrame(encoded_X)
# X_cat_encoded = X_cat_encoded.reset_index().drop(['index'], axis=1)
encoded = pd.DataFrame(encoder.transform(categorical).toarray(), index=categorical.index)

In [93]:
X_all = pd.concat([encoded, num_scaled], axis=1)

In [94]:
X_all.shape
# nulls = [column for column in data if data_treated[column].isna().sum() > 0]
# nulls       


(95412, 354)

In [95]:
y_pred = clf.predict(X_all)
X_all['B_predictions'] = y_pred
X_all



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,B_predictions
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.333970,-0.319128,-0.523992,-0.538568,-0.520509,-0.004760,-1.001238,1.948226,0.398135,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.235235,0.199469,0.550771,1.261601,0.215310,0.954282,-1.001238,0.083847,-1.628365,0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.675493,-0.159560,-0.882247,0.490100,-0.544692,-1.459730,0.998764,1.948226,1.518043,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.675493,-0.359021,-0.523992,0.104350,-0.606808,1.388782,0.998764,1.948226,0.504793,1
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.561652,-0.199452,-0.165738,0.747267,-0.601946,-1.603822,0.998764,0.083847,-0.295141,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.942849,0.199469,0.550771,0.104350,1.081920,1.606058,-1.001238,-0.848342,-1.041747,0
95408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.373644,0.000008,0.192517,0.104350,0.617665,0.487079,0.998764,-0.848342,-1.575036,0
95409,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.561652,-0.398913,-0.523992,-0.667151,-0.470019,1.697820,0.998764,1.016037,0.131490,0
95410,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.333970,0.039900,0.049215,-0.538568,-0.111555,-1.647577,0.998764,1.948226,-1.095076,1


In [96]:
X_all.to_csv("Full_data.csv", index=False)
