# Something to try some models out

In [None]:
def moneycalc(confusion_matrix, mail_cost=0.68, avg_donation=15.62):
    '''
    Estimate the profit of a mailing campaign from a confusion matrix.

    A letter is sent to every predicted donor (false positives and true
    positives), but only the true positives bring in a donation.

    Parameters
    ----------
    confusion_matrix : 2x2 array-like
        Indexed as [actual][predicted], i.e. the layout returned by
        sklearn.metrics.confusion_matrix: [0][1] are the false positives
        and [1][1] the true positives.
    mail_cost : float, default 0.68
        Cost of a single mailing in dollars.
    avg_donation : float, default 15.62
        Average donation of a responding donor in dollars.

    Returns
    -------
    Estimated revenue minus the mailing cost, in dollars.
    '''
    false_positives = confusion_matrix[0][1]
    true_positives = confusion_matrix[1][1]
    # Every positive prediction triggers one mailing.
    cost = mail_cost * (false_positives + true_positives)
    # Only correctly predicted donors actually donate.
    rev = avg_donation * true_positives
    return rev - cost

In [None]:
def model_test(X_train,X_test,y_train,y_test):
    '''
    Fit a fixed set of baseline classifiers and print their test metrics.

    For each model (logistic regression, decision tree, KNN, random forest)
    the function fits on the training data, predicts the test data and prints
    the confusion matrix, the estimated campaign profit (see moneycalc) and
    accuracy, precision, recall and f1.

    Parameters
    ----------
    X_train, y_train : training features and labels
    X_test, y_test : test features and labels

    Returns
    -------
    None; all results are printed.
    '''
    # Each display name is paired with its estimator so the two cannot drift apart.
    candidates = [
        ('Logistic Regression', LogisticRegression(random_state=0, solver='sag')),
        ('DecisionTree', DecisionTreeClassifier(max_depth=2)),
        ('KNN', neighbors.KNeighborsClassifier(n_neighbors=3, weights='distance')),
        ('RandomForest', RandomForestClassifier(max_depth=5,
                                                min_samples_split=20,
                                                min_samples_leaf=20,
                                                max_samples=0.2,
                                                n_jobs=-1)),
    ]
    for name, model in candidates:
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        cm = confusion_matrix(y_test, predictions)
        print(cm)
        # Profit if this model's predictions had been used for the mailing
        print('Profit: '+str(moneycalc(cm))+ ' $')
        # Evaluation metrics; every label carries a space between the model
        # name and the metric name.
        print(name + " score: ", model.score(X_test, y_test))
        print(name + " precision: ", precision_score(y_test, predictions))
        print(name + " recall: ", recall_score(y_test, predictions))
        print(name + " f1: ", f1_score(y_test, predictions))

# Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)

In [None]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import neighbors
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_val_score

# Reading the files

In [None]:
# Load the three lab files (numerical features, categorical features, targets).
numerical, categorical, target = (
    pd.read_csv(f'files_for_lab/{name}.csv')
    for name in ('numerical', 'categorical', 'target')
)

In [None]:
numerical.head(4)

In [None]:
categorical.head(4)

#### Quickly calculating mean donation

In [None]:
# Mean of TARGET_D over the rows flagged as donors (TARGET_B == 1).
target.loc[target['TARGET_B'] == 1, 'TARGET_D'].mean()

# Ordinal categorical data

It seems that some of the values in the numerical df are in fact categorical.
Nevertheless, I will leave them in the numerical dataframe: they are already represented as numbers, so no ordinal encoding is necessary.
In addition, they will get scaled this way and end up in the same range as the rest of the data.

# Changing the datatype of the categoricals

In [None]:
# Many of the categorical columns are still stored as numbers. They are cast
# to strings here, since otherwise the dtype-based num-cat split further down
# would assign them to the wrong side.
# (Debug aid: print(categorical.dtypes) shows the types before the cast.)

# DOMAIN_B is the only exception: its values are already numbers in the right
# order, so instead of ordinal-encoding it later it is moved to the numerical
# dataframe, following the same logic as the other ordinal values.
numerical['DOMAIN_B'] = categorical['DOMAIN_B']
categorical = categorical.drop(columns='DOMAIN_B')

# Stringify every remaining column element by element.
for col in categorical.columns:
    categorical[col] = categorical[col].map(str)

In [None]:
# Now they are all objects

# for i in categorical.columns:
#     print(categorical[i].dtypes)


# X-Y Split

In [None]:
df_all = pd.concat([numerical,categorical], axis = 1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and with it every figure reported further down, can be reproduced when the
# notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(
    df_all, target, test_size=0.2, random_state=42
)

In [None]:
y_test_all = y_test.copy()

In [None]:
y_test = y_test.drop('TARGET_D', axis = 1)

# Num-Cat Split

In [None]:
# Split each partition by dtype: numeric columns on one side, object (string)
# columns on the other.
train_num, test_num = (
    part.select_dtypes(include=np.number) for part in (X_train, X_test)
)
train_cat, test_cat = (
    part.select_dtypes(include=object) for part in (X_train, X_test)
)

# Encoding and scaling

## Scaling the numericals

In [None]:
from sklearn.preprocessing import MinMaxScaler

# The scaler learns its ranges from the training data only ...
transformer = MinMaxScaler()
transformer.fit(train_num)
# ... and is then applied to both partitions, keeping column names and row index.
train_num_scaled = pd.DataFrame(
    data=transformer.transform(train_num),
    index=train_num.index,
    columns=train_num.columns,
)
test_num_scaled = pd.DataFrame(
    data=transformer.transform(test_num),
    index=test_num.index,
    columns=test_num.columns,
)

## OneHot encoding categoricals

In [None]:
from sklearn.preprocessing import OneHotEncoder

# The encoder learns its categories from the training data only; with
# handle_unknown='ignore', categories first seen in the test data do not raise.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(train_cat)
# Names of the generated dummy columns, needed for the later selection.
column_name = encoder.get_feature_names_out(train_cat.columns)
# Encode both partitions into dense frames that keep the original row index.
train_encoded = pd.DataFrame(
    data=encoder.transform(train_cat).toarray(),
    index=train_cat.index,
    columns=column_name,
)
test_encoded = pd.DataFrame(
    data=encoder.transform(test_cat).toarray(),
    index=test_cat.index,
    columns=column_name,
)

# Concatenating prepared data

In [None]:
# The training frame keeps the TARGET_B label next to the features because
# the oversampling below resamples whole rows.
train_all = pd.concat(
    [train_encoded, train_num_scaled, y_train['TARGET_B']],
    axis='columns',
)
# From here on X_test holds the encoded and scaled test features.
X_test = pd.concat([test_encoded, test_num_scaled], axis='columns')

# Oversampling

In [None]:
from sklearn.utils import resample

# Separate the two classes; the donors ('yes') are the minority class.
no = train_all.loc[train_all['TARGET_B'].eq(0)]
yes = train_all.loc[train_all['TARGET_B'].eq(1)]

In [None]:
# oversample minority
# Oversample the minority class: donors are drawn with replacement (there are
# not enough of them otherwise) until both classes have the same size. The
# seed is fixed so the resampled training set is reproducible across runs.
yes_oversampled = resample(
    yes,
    replace=True,
    n_samples=len(no),
    random_state=42,
)

In [None]:
# Stack the untouched majority class on top of the oversampled minority class.
train_oversampled = pd.concat([no, yes_oversampled], axis='index')
train_oversampled.head(2)

In [None]:
# Separate the label from the features of the balanced training set.
y_train_over = train_oversampled['TARGET_B']
X_train_over = train_oversampled.drop(columns='TARGET_B')

# Testing Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [None]:
# Defining the classifier
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.2 )

In [None]:
# Training the classifier
clf.fit(X_train_over, y_train_over)
# Making predictions for outcome
y_pred = clf.predict(X_test)

In [None]:
display(confusion_matrix(y_test, y_pred))
print(clf.score(X_test, y_test))

The result shows that we are able to predict over half of the donors, but to achieve this we send out a lot of letters. We will definitely have to improve that.

# Feature selection

#### RFE

# -----------------------------------------------------------------------------------
### Something seems to have gone wrong when saving the pickle while trying to run this overnight
I can't get it to work again, so I will try something else and revisit this part later.

In the last lab I tried PCA, KBest and variance selection.
This time I will try recursive feature elimination. The only obstacle might be that,
with this number of columns, it will take long.

In [None]:
# from sklearn.feature_selection import RFE

In [None]:
# lm = linear_model.LogisticRegression()
# rfe = RFE(lm, n_features_to_select=30, verbose=False)

In [None]:
# %%time
# import warnings
# warnings.filterwarnings('ignore')
# rfe.fit(X_train_over, y_train_over)

In [None]:
# import pickle

In [None]:
# This takes 1h 24min when opening the notebook we will continue with the pickled version!
# pickle.dump(rfe, open('rec_feat_elim.p', 'wb'))

In [None]:
# We take the pickled transformer
# rfe = pickle.load(open('rec_feat_elim.p','rb'))

In [None]:
# rfe.ranking_

In [None]:
# # A lot of 'cluster' columns are in the resulting dataframe
# df = pd.DataFrame(data = rfe.ranking_, columns=['Rank'])
# df['Column_name'] = pd.DataFrame(X_train_over).columns
# df[df['Rank']==1].head(5)

In [None]:
# # Transforming the data and trying out models:
# X_train_rfe = rfe.transform(X_train_over)
# X_test_rfe = rfe.transform(X_test)

In [None]:
# # We try out the randeom forest with cross validate:
# clf.fit(X_train_rfe, y_train_over)
# results = cross_validate(model,X_test_rfe, y_test, cv = 5)
# results

# -----------------------------------------------------------------------------------

#### Kbest

In [None]:
from sklearn.feature_selection import SelectKBest , chi2
from sklearn.feature_selection import chi2

In [None]:
# Quick first pass: keep the 35 features with the best chi2 score and see
# whether that already gets us anywhere.
model = SelectKBest(score_func=chi2, k=35)
model.fit(X_train_over, y_train_over)
XTr_temp = pd.DataFrame(data=model.transform(X_train_over), index=X_train_over.index)
Xte_temp = pd.DataFrame(data=model.transform(X_test), index=X_test.index)

In [None]:
model_test(XTr_temp,Xte_temp,y_train_over, y_test)

#### Multicollinearity reduction
We will have to eliminate multicollinearity and repeat the process

In [None]:
# We reuse the function from yesterday:

In [None]:
def corr_check(model, df_input, number_of_columns):
    '''
    Plot the correlations among the best-scoring features of a fitted selector.

    Parameters
    ----------
    model : fitted selector exposing ``scores_`` (one score per column of
        ``df_input``, in column order), e.g. the SelectKBest used in this notebook
    df_input : DataFrame the selector was fitted on
    number_of_columns : how many of the top-scoring columns to inspect; if it
        exceeds the number of columns, all columns are used

    Returns
    -------
    List of the selected column names, best score first.
    '''
    # Attach the column names to the scores and rank them, best first.
    scores = pd.Series(model.scores_, index=df_input.columns, name='score')
    ranked = scores.sort_values(ascending=False)
    collist = ranked.head(number_of_columns).index.tolist()
    # Correlation matrix of the selected columns, shown as an annotated heatmap.
    correlations_matrix = df_input[collist].corr()
    plt.figure(figsize = (16,16))
    sns.heatmap(correlations_matrix, annot=True, fmt='.2f')
    plt.show()
    return collist

In [None]:
collist = corr_check(model,X_train_over, 33)

In [None]:
# The dataframe is really long.
# To eliminate multicollinearity, I would look for high correlation, 
# keep the first column(with the highest score) and eliminate those,
# with a too high correlation with it.
# I will try to put this into a function

In [None]:
def multicor_elim(df, threshold=0.9):
    '''
    List the columns to drop in order to remove multicollinearity.

    Going through a large correlation matrix by hand is tedious and not very
    effective. This function walks the upper triangle of a correlation matrix
    and, for every pair whose absolute correlation exceeds the threshold,
    marks the later column for removal. The earlier column is kept; when the
    columns are ordered by score, that is the better-scoring one.

    Parameters
    ----------
    df : square correlation matrix as a DataFrame (e.g. ``features.corr()``),
        with the same labels, in the same order, on rows and columns
    threshold : float, default 0.9
        Absolute correlation above which two columns count as collinear.

    Returns
    -------
    List of column names to drop, in the order they were found.
    '''
    columns = list(df.columns)
    droplist = []
    for row, keeper in enumerate(columns):
        # A column that is already dropped cannot cause further collinearity,
        # and judging others against it would eliminate columns needlessly.
        if keeper in droplist:
            continue
        # Starting right of the diagonal restricts the scan to the upper
        # triangle and skips the trivial self-correlation of 1.
        for col in range(row + 1, len(columns)):
            candidate = columns[col]
            # Do not add a column more than once.
            if candidate in droplist:
                continue
            # Finally check the pair itself (a NaN correlation never matches).
            if abs(df.iloc[row, col]) > threshold:
                droplist.append(candidate)
    return droplist
            
    

In [None]:
print(collist)

In [None]:
# Problematic columns to remove (each column listed once):
droplist = ['FIRSTDATE_YR_96','LASTDATE_YR_96','HVP1','FIRSTDATE_YR_86','HVP3','HVP6','HVP4']

In [None]:
# Remove the collinear columns from the training data and, so both frames
# keep the same features, from the test data as well.
X_train_over, X_test = (
    frame.drop(columns=droplist) for frame in (X_train_over, X_test)
)

In [None]:
# Second attempt: re-run the selection on the reduced feature set, re-test the
# models and inspect the correlations among the 30 best-scoring columns.
model = SelectKBest(score_func=chi2, k=35)
model.fit(X_train_over, y_train_over)
XTr_temp = pd.DataFrame(data=model.transform(X_train_over), index=X_train_over.index)
Xte_temp = pd.DataFrame(data=model.transform(X_test), index=X_test.index)
model_test(XTr_temp, Xte_temp, y_train_over, y_test)
collist = corr_check(model, X_train_over, 30)

In [None]:
# Next batch of collinear columns.
droplist = ['STATE_CA', 'RP1']
# Remove them from the training features ...
X_train_over = X_train_over.drop(columns=droplist)
# ... and from the test features, so both keep the same columns.
X_test = X_test.drop(columns=droplist)

In [None]:
# Third attempt: same selection, model test and correlation check on the
# further reduced feature set.
model = SelectKBest(score_func=chi2, k=35)
model.fit(X_train_over, y_train_over)
XTr_temp, Xte_temp = (
    pd.DataFrame(data=model.transform(frame), index=frame.index)
    for frame in (X_train_over, X_test)
)
model_test(XTr_temp, Xte_temp, y_train_over, y_test)
collist = corr_check(model, X_train_over, 30)

In [None]:
# Last batch of collinear columns.
droplist = ['ODATEW_YR_86', 'ODATEW_YR_88']
# Remove them from both partitions so the feature sets stay aligned.
X_train_over, X_test = (
    frame.drop(columns=droplist) for frame in (X_train_over, X_test)
)

In [None]:
print(collist)

In [None]:
# Final attempt after the last removal. The resulting collist (the 30
# best-scoring columns) is the feature set used from here on.
model = SelectKBest(score_func=chi2, k=35)
model.fit(X_train_over, y_train_over)
XTr_temp = pd.DataFrame(data=model.transform(X_train_over), index=X_train_over.index)
Xte_temp = pd.DataFrame(data=model.transform(X_test), index=X_test.index)
model_test(XTr_temp, Xte_temp, y_train_over, y_test)
collist = corr_check(model, X_train_over, 30)

As we can see, there is no more concerning multicollinearity left, and the amount of money our mailing action would bring in went up quite a bit.

In [None]:
# These features look good: build dataframes with just them and move on.
X__train_selected = X_train_over.loc[:, collist].copy()
X__test_selected = X_test.loc[:, collist].copy()

# Model pipeline
We got the best results

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
# Defining the different models
model1 = LogisticRegression(random_state=0, solver='sag')
model2 = DecisionTreeClassifier(max_depth=2)
model3 = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.2 )
model4 = neighbors.KNeighborsClassifier(n_neighbors=3, weights='distance')

In [None]:
# Run every model through 10-fold cross-validation and collect the mean
# recall per model for comparison.
# NOTE(review): the cross-validation runs on the test split
# (X__test_selected / y_test), not on the training data - confirm this is intended.
import warnings

# Silences all warnings for the rest of the session.
warnings.filterwarnings('ignore')
model_pipeline = [model1, model2, model3, model4]
model_names = ['Logistic Regressor', 'Decision Tree', 'Random Forest', 'KNN Classifier']
scores = {}
for model_name, model in zip(model_names, model_pipeline):
    fold_recalls = cross_val_score(model, X__test_selected, y_test, cv=10, scoring='recall')
    scores[model_name] = np.mean(fold_recalls)
print(scores)

As expected, this still looks bad; hopefully the hyperparameter search will yield better results.

# Hyperparameter search

#### Grid search

In [None]:
from sklearn.model_selection import GridSearchCV

We try to improve our Random Forest model with hyperparameter tuning, since it gave us the best results.

In [None]:
# RandomForestClassifier(
#     n_estimators=100,
#     *,
#     criterion='gini',
#     max_depth=None,
#     min_samples_split=2,
#     min_samples_leaf=1,
#     min_weight_fraction_leaf=0.0,
#     max_features='sqrt',
#     max_leaf_nodes=None,
#     min_impurity_decrease=0.0,
#     bootstrap=True,
#     oob_score=False,
#     n_jobs=None,
#     random_state=None,
#     verbose=0,
#     warm_start=False,
#     class_weight=None,
#     ccp_alpha=0.0,
#     max_samples=None,
# )

These are the possible parameters for the random forest. I will do some research to find useful values to put into the search.

In [None]:
# Parameter values the grid search should try for the random forest.
grid = dict(
    max_depth=[5, 10, None],
    min_samples_split=[2, 5, 100],
    n_jobs=[-1],
    max_features=['sqrt', 30],
    class_weight=[None, 'balanced'],
)

In [None]:
# Choosing random forest
model = RandomForestClassifier()

In [None]:
X__test_selected.shape

In [None]:
# %%time
# grid_search = GridSearchCV(estimator = model, param_grid = grid, cv = 5, n_jobs = -1)
# grid_search.fit(X__test_selected, y_test)

In [None]:
# grid_search.best_params_

The grid search took quite long, so I put the results here:

In [None]:
rf = RandomForestClassifier(
 class_weight= None,
 criterion= 'gini',
 max_depth= 5,
 max_features= 'sqrt',
 min_samples_split= 2,
 n_jobs= -1
)

In [None]:
# Fit the tuned forest on the balanced training data and evaluate it on the
# untouched test split.
rf.fit(X__train_selected, y_train_over)
predictions = rf.predict(X__test_selected)
# The confusion matrix is computed once and reused for the profit estimate.
cm = confusion_matrix(y_test, predictions)
print(cm)
print(moneycalc(cm))
print("Random Forest score: ", rf.score(X__test_selected, y_test))
print("Random Forest precision: ",precision_score(y_test,predictions))
print("Random Forest recall: ",recall_score(y_test,predictions))
print("Random Forest f1: ",f1_score(y_test,predictions))

We get a relatively good amount of money in this case.

# Additional tweaks

In [None]:
# I will try to improve the results by altering the probability threshold:

In [None]:
def probpred(probabilities, threshhold):
    '''
    Turn class probabilities into 0/1 predictions with a custom cut-off.

    Parameters
    ----------
    probabilities : iterable of rows whose element at index 1 is the
        probability of the positive class (e.g. the output of predict_proba)
    threshhold : a row is predicted as 1 when that probability is greater
        than or equal to this value

    Returns
    -------
    List with one 0/1 prediction per row.
    '''
    return [1 if row[1] >= threshhold else 0 for row in probabilities]

In [None]:
predictions = probpred(rf.predict_proba(X__test_selected), .47)

In [None]:
# Evaluate the thresholded predictions.
cm = confusion_matrix(y_test, predictions)
print(cm)
print('Profit: '+str(moneycalc(cm))+ ' $')
# Accuracy is derived from the confusion matrix of the thresholded
# predictions (correct predictions / all predictions). rf.score() would
# re-predict with the default cut-off and disagree with the other metrics.
print("Random Forest score: ", cm.trace() / cm.sum())
print("Random Forest precision: ",precision_score(y_test,predictions))
print("Random Forest recall: ",recall_score(y_test,predictions))
print("Random Forest f1: ",f1_score(y_test,predictions))

I get the highest estimated profit for the mailing action when I lower the probability
threshold to 47%.

# Predictions for the dataset

#### Rebuilding the original dataframe, but already encoded

In [None]:
# Rebuild the encoded dataset: features plus targets, separately for the
# training and the test partition.
df_1 = pd.concat([train_encoded, train_num_scaled, y_train], axis='columns')
df_2 = pd.concat([test_encoded, test_num_scaled, y_test_all], axis='columns')
for part in (df_1, df_2):
    print(part.shape)

In [None]:
df = pd.concat([df_1,df_2], axis = 0)
print(df.shape)

#### Making predictions:

In [None]:
# Predict every row of the rebuilt dataset with the lowered threshold.
# NOTE(review): df also contains the training rows the forest was fitted on,
# so results on this frame are presumably optimistic - confirm.
full_probabilities = rf.predict_proba(df[collist])
predictions = probpred(full_probabilities, .47)
df['Predicted_B'] = predictions

In [None]:
df.shape

#### Saving

In [None]:
# Takes quite some time, therefore commented out
# df.to_csv('encoded_predicted.csv', index=False)

# Conclusion

In [None]:
# Cost of all mailings: one letter at 0.68 $ per predicted donor.
Cost = 0.68 * len(df.loc[df['Predicted_B'].eq(1)])
Cost

In [None]:
# Revenue: 15.62 $ for every predicted donor who really is a donor.
Revenue = 15.62 * len(df.loc[df['Predicted_B'].eq(1) & df['TARGET_B'].eq(1)])
Revenue

In [None]:
Revenue - Cost

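Following the predictions, the profit of our mailing action would be 19099 USD.
This is already a substantial improvement compared to the roughly 11000 USD from before.
Note that this figure is computed on the full dataset, which includes the rows the model was trained on, so it is likely optimistic.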