In [4]:
# importing libraries

import pandas as pd
import numpy as np
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score, confusion_matrix

In [5]:
# datasets

categorical = pd.read_csv('categorical.csv')
numerical = pd.read_csv('numerical.csv')
target = pd.read_csv('target.csv')

data = pd.concat([categorical,numerical,target],axis=1)

In [6]:
print(data.shape)
data.head()

(95412, 339)


Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,TARGET_B,TARGET_D
0,IL,36,H,F,3,L,E,C,T,2,...,12.0,10.0,4,7.741935,95515,0,4,39,0,0.0
1,CA,14,H,M,3,L,G,A,S,1,...,25.0,25.0,18,15.666667,148535,0,2,1,0,0.0
2,NC,43,U,M,3,L,E,C,R,2,...,16.0,5.0,12,7.481481,15078,1,4,60,0,0.0
3,CA,44,U,F,3,L,E,C,R,2,...,11.0,10.0,9,6.8125,172556,1,4,41,0,0.0
4,FL,16,H,F,3,L,F,A,S,2,...,15.0,15.0,14,6.864865,7112,1,2,26,0,0.0


In [7]:
target_d = data[data['TARGET_D']!=0]

In [8]:
# checking for NaNs

cols_with_nans = [col for col in data if data[col].isna().sum() > 0]
print("Columns with NaN values:", cols_with_nans)

Columns with NaN values: []


In [9]:
data['TARGET_B'].value_counts()

TARGET_B
0    90569
1     4843
Name: count, dtype: int64

In [10]:
# balacning data (upsampling)

no_donation = data[data['TARGET_B']==0]
donation = data[data['TARGET_B']==1]

# upsampling minority class
donation_oversampled = resample(donation, replace=True, n_samples = len(no_donation), random_state=42)

display(no_donation.shape)
display(donation_oversampled.shape)

(90569, 339)

(90569, 339)

In [11]:
# concatenating oversampled data

oversampled = pd.concat([no_donation,donation_oversampled])

In [12]:
# X-Y Split, train-test split

X = oversampled.drop(['TARGET_B','TARGET_D'],axis=1)
y = oversampled['TARGET_B']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [13]:
# separating numerical and categorical features 

X_train_num = X_train.select_dtypes(include = np.number)
X_train_cat = X_train.select_dtypes(object)

X_test_num = X_test.select_dtypes(include = np.number)
X_test_cat = X_test.select_dtypes(object)


# standard scaling numerical features

transformer = StandardScaler().fit(X_train_num)
X_scaled_train = pd.DataFrame(transformer.transform(X_train_num),columns=X_train_num.columns)

X_scaled_test = pd.DataFrame(transformer.transform(X_test_num),columns=X_test_num.columns)

In [14]:
X_scaled_test

Unnamed: 0,CLUSTER,DATASRCE,DOMAIN_B,ODATEW_YR,ODATEW_MM,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,1.182838,0.626022,0.159428,0.531017,-0.029559,-1.320474,-0.311571,1.334569,-0.617088,0.963573,...,-0.329634,0.286884,0.237411,0.623210,-0.444501,0.537244,1.554770,-0.998298,0.829085,-1.159685
1,-0.000362,0.626022,0.159428,1.112893,-0.029559,-1.320474,-0.311571,0.960959,-0.099493,0.412726,...,-0.964663,0.832180,-0.167817,-0.092231,0.148719,0.218165,-0.954379,-0.998298,-0.949825,-0.947620
2,-1.392363,-0.973903,1.535152,-0.050858,-0.029559,0.994498,2.598269,-0.159871,1.194494,0.412726,...,-0.752987,0.286884,0.237411,0.623210,0.593634,0.386909,0.868392,-0.998298,-0.949825,-0.152375
3,0.138838,0.626022,1.535152,1.403831,-0.029559,-1.320474,-0.311571,1.334569,-1.393481,0.963573,...,-0.964663,1.377477,0.034797,0.265490,0.148719,0.678375,1.082045,-0.998298,-0.949825,1.013985
4,-0.000362,-2.573828,0.159428,-0.050858,-0.029559,0.346306,-0.634886,-0.159871,0.676899,0.412726,...,-0.964663,0.286884,-0.127294,-0.020687,-1.186026,-0.057962,-1.699336,-0.998298,-0.949825,-1.053652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54337,1.113238,0.626022,0.159428,-1.505548,-0.029559,-1.320474,-0.311571,1.334569,-1.393481,0.412726,...,2.633833,-0.258413,-0.370430,-0.807672,-0.444501,-0.677155,-1.219564,-0.998298,1.718541,-1.530799
54338,1.391639,0.626022,0.159428,-0.923672,-0.029559,-1.320474,-0.311571,-0.533481,0.935697,-0.138120,...,0.093718,-0.476531,-0.167817,-0.163775,2.224989,-0.426131,0.260573,-0.998298,-0.949825,1.491132
54339,1.391639,-0.973903,0.159428,-0.923672,-0.029559,-1.320474,-0.311571,-0.907091,-0.358291,-2.341506,...,0.305394,-0.694650,0.034797,-0.449952,0.297024,-0.173015,0.771088,-0.998298,-0.060370,1.491132
54340,1.252439,0.626022,0.159428,-1.505548,-0.029559,-0.301886,0.981691,-0.159871,0.935697,0.963573,...,0.728747,-0.258413,-0.137425,-0.038573,-0.444501,-0.266974,1.406762,1.001705,0.829085,1.491132


In [17]:
# One-hot encoding categorical features

encoder = OneHotEncoder(drop='first').fit(X_train_cat)

encoded_train = encoder.transform(X_train_cat).toarray()
encoded_train = pd.DataFrame(encoded_train)

encoded_test = encoder.transform(X_test_cat).toarray()
encoded_test = pd.DataFrame(encoded_test)

#encoded_train = encoder.transform(X_train_cat).toarray()
#encoded_test = encoder.transform(X_test_cat).toarray()

In [18]:
# concatenating encoded and scaled features

train_scaled = pd.concat([encoded_train,X_scaled_train],axis=1)

test_scaled = pd.concat([encoded_test,X_scaled_test],axis=1)

In [20]:
train_scaled.columns = train_scaled.columns.astype(str)
test_scaled.columns = test_scaled.columns.astype(str)

In [21]:
# PCA (dimensionality reduction)

n_components = 50
pca = PCA(n_components=n_components)

train_reduced = pca.fit_transform(train_scaled)
test_reduced = pca.transform(test_scaled)

In [22]:
# RandomForestClassifier on reduced features

clf = RandomForestClassifier(max_depth=5, min_samples_split=20, min_samples_leaf=20)

clf.fit(train_reduced, y_train)

In [23]:
print(clf.score(train_reduced, y_train))
print(clf.score(test_reduced, y_test))

0.6297201804473327
0.6154723786389901


In [24]:
# Cross-validation

cross_val_scores = cross_val_score(clf, train_scaled, y_train, cv=10)

print("Cross-validation Accuracy:", np.mean(cross_val_scores))
print("Cross-validation Scores:", cross_val_scores)

Cross-validation Accuracy: 0.6104609649130446
Cross-validation Scores: [0.60433754 0.6079653  0.60946372 0.60772871 0.61632492 0.60843849
 0.6097484  0.6129821  0.61384967 0.6137708 ]


In [25]:
# precision, recall and F1-score

pred_RF = clf.predict(test_scaled)

print('Test Accuracy:', accuracy_score(y_test, pred_RF))
print("Precision:", precision_score(y_test, pred_RF))
print("Recall:", recall_score(y_test, pred_RF))
print("F1 Score:", f1_score(y_test, pred_RF))



ValueError: X has 354 features, but RandomForestClassifier is expecting 50 features as input.

In [None]:
# confusion matrix

cm = confusion_matrix(y_test, pred_RF)

print("\nConfusion Matrix:\n")
print(cm)

- True Negative: 21013 - correctly predicted instances where individuals did not make a donation.

- False Positive: 5914 - incorrectly predicted instances as positive (donation made), but in reality, no donation was made.

- False Negative: 4770 - incorrectly predicted instances as negative (no donation), but in reality, a donation was made.

- True Positive: 22645 - correctly predicted instances where individuals made a donation.