In [2]:
# Importing libraries

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from matplotlib import pyplot as plt 
import matplotlib.pyplot as plt

from sklearn.feature_selection import RFE
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm
from statsmodels.formula.api import ols

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

from sklearn.tree import DecisionTreeClassifier

In [8]:
# Reading the three datasets and concatenating them column-wise

numerical, categorical, targets = (
    pd.read_csv(csv_name)
    for csv_name in ("numerical.csv", "categorical.csv", "target.csv")
)

# axis=1 places the frames side by side.
# Assumes the three files list the same rows in the same order — TODO confirm.
df = pd.concat([numerical, categorical, targets], axis=1)
df

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
0,0,60.000000,5,9,0,0,39,34,18,10,...,92,8,94,2,95,12,89,11,0,0.0
1,1,46.000000,6,9,16,0,15,55,11,6,...,93,10,95,12,95,12,93,10,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,91,11,92,7,95,12,90,1,0,0.0
3,0,70.000000,1,4,2,0,23,14,31,3,...,87,11,94,11,95,12,87,2,0,0.0
4,0,78.000000,3,2,60,1,28,9,53,26,...,93,10,96,1,96,1,79,3,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1,61.611649,5,9,0,14,36,47,11,7,...,96,2,96,2,96,2,96,2,0,0.0
95408,1,48.000000,7,9,1,0,31,43,19,4,...,96,3,96,3,96,3,96,3,0,0.0
95409,1,60.000000,5,9,0,0,18,46,20,7,...,96,3,95,1,96,10,94,10,0,0.0
95410,0,58.000000,7,9,0,0,28,35,20,9,...,90,11,96,8,97,1,86,12,1,18.0


In [9]:
# Cleaning the dataset

def cleaning(data):
    """Normalise the column labels of ``data`` to lower snake case.

    Every label is lower-cased and its spaces are replaced by underscores.
    The labels are rewritten on ``data`` itself (no copy is made) and the
    same object is returned, so the call can double as a cell output.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with its columns renamed.
    """
    data.columns = [label.lower().replace(' ', '_') for label in data.columns]
    return data

# Rename the columns of df in place; the returned frame is shown as the cell output.
cleaning(df)

Unnamed: 0,tcode,age,income,wealth1,hit,malemili,malevet,vietvets,wwiivets,localgov,...,minrdate_yr,minrdate_mm,maxrdate_yr,maxrdate_mm,lastdate_yr,lastdate_mm,firstdate_yr,firstdate_mm,target_b,target_d
0,0,60.000000,5,9,0,0,39,34,18,10,...,92,8,94,2,95,12,89,11,0,0.0
1,1,46.000000,6,9,16,0,15,55,11,6,...,93,10,95,12,95,12,93,10,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,91,11,92,7,95,12,90,1,0,0.0
3,0,70.000000,1,4,2,0,23,14,31,3,...,87,11,94,11,95,12,87,2,0,0.0
4,0,78.000000,3,2,60,1,28,9,53,26,...,93,10,96,1,96,1,79,3,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1,61.611649,5,9,0,14,36,47,11,7,...,96,2,96,2,96,2,96,2,0,0.0
95408,1,48.000000,7,9,1,0,31,43,19,4,...,96,3,96,3,96,3,96,3,0,0.0
95409,1,60.000000,5,9,0,0,18,46,20,7,...,96,3,95,1,96,10,94,10,0,0.0
95410,0,58.000000,7,9,0,0,28,35,20,9,...,90,11,96,8,97,1,86,12,1,18.0


In [10]:
# Dropping duplicate rows and rows with missing values

# drop_duplicates() returns a new frame rather than modifying df, so its result
# has to be assigned; a bare call leaves the duplicates in place.
df = df.drop_duplicates().dropna()

In [11]:
# Preparing the data for the model

# Targets are taken out first; every other column is a candidate feature.
Y_b = df['target_b']
Y_d = df['target_d']
X = df.drop(columns=['target_b', 'target_d'])

# Split the features by dtype. Both halves get a fresh 0..n-1 index so that the
# column-wise concat at the end pairs rows by position.
# Note that Y_b and Y_d keep the index of df.
numericalX = X.select_dtypes(include='number').reset_index(drop=True)
categorcalX = (
    X.select_dtypes(include='object')
    .reset_index(drop=True)
    .drop(columns=['state'])
)

# One-hot encoding of the categorical features

categorcalX = pd.get_dummies(
    categorcalX[['homeownr', 'gender', 'rfa_2r', 'rfa_2a', 'geocode2', 'domain_a']],
    dtype=int,
)

X = pd.concat([numericalX, categorcalX], axis=1)

In [12]:
# Train / test split: the features and both targets are split together (20 % held out)

(X_train, X_test,
 Y_train_b, Y_test_b,
 Y_train_d, Y_test_d) = train_test_split(X, Y_b, Y_d, test_size=0.2, random_state=0)

# Training table: features plus both targets, each part re-indexed from 0 so the
# column-wise concat pairs rows by position.
train = pd.concat(
    [part.reset_index(drop=True) for part in (X_train, Y_train_b, Y_train_d)],
    axis=1,
)

In [13]:
# Optimization: feature selection by statistical significance

'''
Identifying and selecting the independent variables (features)
that have a statistically significant impact on predicting the target variable.

It does this by fitting a logistic regression model and then extracting the variables
with p-values less than 0.05 from the model's summary statistics. 

These selected variables are considered important for making predictions
and can be used in further analysis or modeling.

'''

# Give the training features and labels a fresh 0..n-1 index (the old labels are discarded).
X_train = X_train.reset_index(drop=True)
Y_train_b = Y_train_b.reset_index(drop=True)

def significant_features(X, Y, alpha=0.05):
    """Return the names of the features that are significant in a logistic regression.

    A ``statsmodels`` Logit model of ``Y`` on ``X`` is fitted and every
    feature whose p-value is strictly below ``alpha`` is kept.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, passed to the model as given (no intercept column
        is added here).
    Y : pandas.Series
        Binary target with one entry per row of ``X``.
    alpha : float, default 0.05
        Significance threshold.

    Returns
    -------
    list of str
        Names of the features with p-value < ``alpha``, in the order the
        model lists them.
    """
    # Fit on the arguments rather than on the notebook-level training split,
    # so the function answers for whatever data it is handed.
    fitted = sm.Logit(Y, X).fit()

    # Read the p-values from the fitted result instead of parsing the printed
    # summary table: they are unrounded and do not depend on the table layout.
    p_values = pd.Series(np.asarray(fitted.pvalues), index=fitted.model.exog_names)

    # Positional mask, so repeated feature names cannot confuse the selection.
    # NaN p-values compare False and are therefore left out.
    return p_values.index[(p_values < alpha).to_numpy()].tolist()

# Names of the training features that significant_features() selects for target_b.
# NOTE(review): the label printed below says "P>|t|", but the Logit summary this is
# based on reports the column as "P>|z|".
relevant_features = significant_features(X_train, Y_train_b)
print("Independent variables with P>|t| < 0.05:", relevant_features)

Optimization terminated successfully.
         Current function value: 0.192267
         Iterations 9
Independent variables with P>|t| < 0.05: ['age', 'income', 'wealth1', 'hhn3', 'hu3', 'hu4', 'rhp2', 'dma', 'ic11', 'tpe7', 'lfc2', 'lfc4', 'occ6', 'ec1', 'voc3', 'cardprom', 'numprm12', 'ngiftall', 'cardgift', 'timelag', 'rfa_2f', 'domain_b', 'odatew_mm', 'dob_mm', 'lastdate_yr', 'lastdate_mm', 'firstdate_yr']


In [14]:
# Keep only the selected features and fit a baseline classifier for target_b.
X_train_adj, X_test_adj = X_train[relevant_features], X_test[relevant_features]

model = LogisticRegression().fit(X_train_adj, Y_train_b)

# Score on the training split, then on the held-out split.
# NOTE(review): the recorded training score (0.94965...) equals the share of class 0
# in the training labels (72486 / 76329), which suggests the model predicts 0 for
# every row — confirm with a confusion matrix.
print(model.score(X_train_adj, Y_train_b))
print(model.score(X_test_adj, Y_test_b))

0.9496521636599458
0.9475973379447676


In [15]:
# Rebuild the training table from the selected features plus both targets,
# re-indexing every part from 0 so the rows pair up by position.
train = pd.concat(
    [part.reset_index(drop=True) for part in (X_train_adj, Y_train_b, Y_train_d)],
    axis=1,
)
# Class balance of the binary target in the training table.
print(train['target_b'].value_counts())

0    72486
1     3843
Name: target_b, dtype: int64


In [16]:
# Undersampling: shrink class 0 to the size of class 1

assert train['target_d'].notnull().all(), "NaNs in original target_d"

# Split the training table by the binary target.
category_1 = train.loc[train['target_b'] == 1]
category_0 = train.loc[train['target_b'] == 0]

# Draw, without replacement, as many class-0 rows as there are class-1 rows.
category_0_undersampled = resample(
    category_0,
    replace=False,
    n_samples=category_1.shape[0],
    random_state=0,
)

# Balanced table: the sampled class-0 rows stacked on top of all class-1 rows.
train_undersampled = pd.concat([category_0_undersampled, category_1])

# Guard against NaNs appearing after the concatenation.
assert train_undersampled['target_d'].notnull().all(), "NaNs introduced during resampling"

# Separate the features from the two targets.
X_train_undersampled = train_undersampled.drop(columns=['target_b', 'target_d'])
Y_train_undersampled_b = train_undersampled['target_b']
Y_train_undersampled_d = train_undersampled['target_d']

print(train_undersampled['target_b'].value_counts())

0    3843
1    3843
Name: target_b, dtype: int64


In [17]:
# Refit the classifier on the balanced (undersampled) training data.
# `model` is rebound here, replacing the classifier fitted on the full training split.
model = LogisticRegression().fit(X_train_undersampled, Y_train_undersampled_b)

# Score on the balanced training data, then on the untouched test split.
print(model.score(X_train_undersampled, Y_train_undersampled_b))
print(model.score(X_test_adj, Y_test_b))

0.5882123341139734
0.592097678562071


#####  Predicting the "target_d"

In [18]:
# Making predictions on the train dataset

# Predicted target_b labels for the undersampled training rows.
predictions_target_b_train = model.predict(X_train_undersampled)

In [19]:
# Filtering the data for donors

# Keep the rows the classifier *predicted* as donors (prediction == 1); these are
# not necessarily the rows whose actual target_b is 1.
X_train_d = X_train_undersampled[predictions_target_b_train == 1]
# NOTE(review): this rebinds Y_train_d, which until here held the target_d values of
# the whole training split. Cells that rebuild `train` from Y_train_d will not work
# as intended if they are re-run after this one.
Y_train_d = Y_train_undersampled_d[predictions_target_b_train == 1]

In [20]:
# Training Regression Model for 'target_d' (linear regression)

# OLS of target_d on the selected features, fitted on the predicted-donor rows.
# NOTE(review): X_train_d is passed as-is, without a constant column, and the summary
# reports an *uncentered* R-squared — presumably the model has no intercept; confirm
# this is intended.
model_d = sm.OLS(Y_train_d, X_train_d).fit()
model_d.summary()

0,1,2,3
Dep. Variable:,target_d,R-squared (uncentered):,0.387
Model:,OLS,Adj. R-squared (uncentered):,0.383
Method:,Least Squares,F-statistic:,86.51
Date:,"Sun, 04 Feb 2024",Prob (F-statistic):,0.0
Time:,20:07:44,Log-Likelihood:,-14016.0
No. Observations:,3720,AIC:,28090.0
Df Residuals:,3693,BIC:,28250.0
Df Model:,27,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,0.0040,0.013,0.299,0.765,-0.022,0.030
income,0.3642,0.133,2.731,0.006,0.103,0.626
wealth1,0.1709,0.075,2.291,0.022,0.025,0.317
hhn3,-0.0089,0.020,-0.437,0.662,-0.049,0.031
hu3,-0.0156,0.051,-0.306,0.759,-0.115,0.084
hu4,-0.0195,0.052,-0.376,0.707,-0.121,0.082
rhp2,-0.0166,0.032,-0.516,0.606,-0.080,0.046
dma,0.0030,0.002,1.632,0.103,-0.001,0.007
ic11,0.0168,0.041,0.413,0.679,-0.063,0.096

0,1,2,3
Omnibus:,3376.058,Durbin-Watson:,1.217
Prob(Omnibus):,0.0,Jarque-Bera (JB):,294370.356
Skew:,3.973,Prob(JB):,0.0
Kurtosis:,45.849,Cond. No.,3060.0


In [21]:
# Making predictions on the test set

# Predicted target_b labels for the test rows.
predictions_target_b_test = model.predict(X_test_adj)

# Filtering the data for donors

# Keep the test rows the classifier predicted as donors (prediction == 1).
X_test_d = X_test_adj[predictions_target_b_test == 1]
# NOTE(review): Y_test_d is overwritten with its own filtered subset, so this cell is
# not idempotent: on a second run the mask (one entry per row of X_test_adj) no longer
# matches the already-filtered Y_test_d.
Y_test_d = Y_test_d[predictions_target_b_test == 1]

In [22]:
# Predicted target_d (donation amount) for each test row flagged as a donor.
predictions_target_d = model_d.predict(X_test_d)

In [23]:
# Regression metrics on the test rows that were predicted as donors.
mse = mean_squared_error(Y_test_d, predictions_target_d)
mae = mean_absolute_error(Y_test_d, predictions_target_d)
# NOTE(review): the recorded R2 is negative (-2.27), i.e. the predictions fit worse
# than a constant prediction of the mean of Y_test_d would.
print("R2 value is = ",round(r2_score(Y_test_d, predictions_target_d),2))
print("The mse of the model is = ", round(mse,2))
print("The root mse of the model is = ",round(np.sqrt(mse),2))
print("The mean absolute error of the model is = ",round(mae,2))

R2 value is =  -2.27
The mse of the model is =  83.19
The root mse of the model is =  9.12
The mean absolute error of the model is =  8.1


In [24]:
# Total predicted donation amount over the test rows flagged as donors.
# Summed as a NumPy array (vectorised) rather than element by element with the
# builtin sum(); as with the builtin, a NaN prediction still makes the total NaN.
sum_all = np.asarray(predictions_target_d).sum()
print('The expected sum of the donations are', round(sum_all, 2))

The expected sum of the donations are 64200.69
