## Introduction

Unhappy customers don't stick around. What's more, unhappy customers rarely voice their dissatisfaction before leaving.

Santander Bank is asking Kagglers to help them identify dissatisfied customers early in their relationship. Doing so would allow Santander to take proactive steps to improve a customer's happiness before it's too late.

In this competition, you'll work with hundreds of anonymized features to predict if a customer is satisfied or dissatisfied with their banking experience.

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df_train = pd.read_csv(r'data/train.csv')

In [3]:
# Identify columns with missing values
missing_cols = df_train.columns[df_train.isna().any()].tolist()

# Print the columns with missing values
#There is no missing values!
print(missing_cols)

[]


In [4]:
df_train.describe()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,75964.050723,-1523.199277,33.212865,86.208265,72.363067,119.529632,3.55913,6.472698,0.412946,0.567352,...,7.935824,1.365146,12.21558,8.784074,31.505324,1.858575,76.026165,56.614351,117235.8,0.039569
std,43781.947379,39033.462364,12.956486,1614.757313,339.315831,546.266294,93.155749,153.737066,30.604864,36.513513,...,455.887218,113.959637,783.207399,538.439211,2013.125393,147.786584,4040.337842,2852.579397,182664.6,0.194945
min,1.0,-999999.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5163.75,0.0
25%,38104.75,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67870.61,0.0
50%,76043.0,2.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106409.2,0.0
75%,113748.75,2.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118756.3,0.0
max,151838.0,238.0,105.0,210000.0,12888.03,21024.81,8237.82,11073.57,6600.0,6600.0,...,50003.88,20385.72,138831.63,91778.73,438329.22,24650.01,681462.9,397884.3,22034740.0,1.0


In [5]:
#There are no categorical columns
cat_cols = df_train.select_dtypes(include=['object']).columns.tolist()
cat_cols

[]

In [6]:
def correlation_matrix(chunk):
    """
    A function that calculates the correlation matrix for a chunk of data
    Parameters:
    - chunk : pandas dataframe : A chunk of data from a large dataset
    """
    return chunk.corr()

# Initialize an empty DataFrame to store the correlation matrix
corr = pd.DataFrame()

# Open the large dataset in chunks
for chunk in pd.read_csv(r"data/train.csv", chunksize=1000):
    # Calculate the correlation matrix for the current chunk
    corr_chunk = correlation_matrix(chunk)
    # Append the correlation matrix to the final DataFrame
    corr = corr.append(corr_chunk)

In [7]:
# #print the correlation matrix
# print(corr)

In [8]:
# Set the threshold
threshold = 0.22

selected_cols = corr['TARGET'].loc[(corr['TARGET']>threshold)&(corr['TARGET']!=1)].index.tolist()

In [9]:
selected_cols.append('TARGET')

In [10]:
len(selected_cols)

4

In [11]:
df_train = df_train[selected_cols]

In [12]:
# #Triangle Correlation Heatmap:
# plt.figure(figsize=(20, 12))
# # define the mask to set the values in the upper triangle to True
# mask = np.triu(np.ones_like(df_train.corr(), dtype=np.bool))
# heatmap = sns.heatmap(df_train.corr(), mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')
# heatmap.set_title('Triangle Correlation Heatmap', fontdict={'fontsize':18}, pad=16);

In [13]:
len(df_train.columns)

4

In [14]:
def remove_multicollinearity(data, threshold):
    # calculate the correlation matrix
    corr_matrix = data.corr().abs()
    
    # select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    
    # find index of feature columns with correlation greater than threshold
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    
    # drop correlated features
    data = data.drop(data[to_drop], axis=1)
    
    return data

In [15]:
df_train = remove_multicollinearity(df_train, threshold=0.8)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [16]:
len(df_train.columns)

3

In [17]:
# #Triangle Correlation Heatmap:
# plt.figure(figsize=(20, 12))
# # define the mask to set the values in the upper triangle to True
# mask = np.triu(np.ones_like(df_train.corr(), dtype=np.bool))
# heatmap = sns.heatmap(df_train.corr(), mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')
# heatmap.set_title('Triangle Correlation Heatmap', fontdict={'fontsize':18}, pad=16);

In [18]:
df_test = pd.read_csv(r'data/test.csv')

In [19]:
# Identify columns with missing values
missing_cols = df_test.columns[df_test.isna().any()].tolist()

# Print the columns with missing values
#There is no missing values!
print(missing_cols)

[]


In [20]:
from sklearn.metrics import classification_report
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import numpy as np

def handle_unbalanced_target(X, y):
    # Separate majority and minority classes
    majority_class_indices = y[y==0].index
    minority_class_indices = y[y==1].index

    # Upsample minority class
    minority_class_X = X[minority_class_indices]
    minority_class_y = y[minority_class_indices]
    minority_class_X_upsampled, minority_class_y_upsampled = resample(minority_class_X, minority_class_y, 
                                                                    replace=True, n_samples=len(majority_class_indices), 
                                                                    random_state=123)

    # Combine majority class with upsampled minority class
    X_upsampled = np.concatenate([X[majority_class_indices], minority_class_X_upsampled])
    y_upsampled = pd.concat([y.loc[majority_class_indices], minority_class_y_upsampled])
    
    return X_upsampled, y_upsampled

In [21]:
X, y = handle_unbalanced_target(np.array(df_train.drop(['TARGET'], axis=1)), df_train['TARGET'])

In [22]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
def train_model(X_train_data, X_test_data, y_train_data, y_test_data):
    # Train model on upsampled data
    # Define the parameter grid
#     param_grid = {'n_estimators': [10, 50, 100, 200],
#                   'max_depth': [None, 5, 10, 20],
#                   'min_samples_split': [2, 5, 10],
#                   'min_samples_leaf': [1, 2, 4]}
    
    # Create the random forest classifier
    rf = RandomForestClassifier()
    
    # Create the grid search
#     grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
    
    # Fit the grid search to the data
#     grid_search.fit(X_train_data, y_train_data)
    rf.fit(X_train_data, y_train_data)
    
    # Get the best estimator
#     best_estimator = grid_search.best_estimator_
    
    # Predict the test set labels
#     y_pred = best_estimator.predict(X_test_data)
    y_pred = rf.predict(X_test_data)
    
    # Calculate the accuracy
    accuracy = accuracy_score(y_test_data, y_pred)

    # Print classification report
    report = classification_report(y_test_data, y_pred)
    
    return rf, accuracy, report

In [24]:
best_estimator, accuracy, report = train_model(X_train, X_test, y_train, y_test)

In [25]:
best_estimator

RandomForestClassifier()

In [26]:
accuracy

0.6877931860982709

In [27]:
report

'              precision    recall  f1-score   support\n\n           0       0.77      0.54      0.63     14628\n           1       0.64      0.84      0.73     14577\n\n    accuracy                           0.69     29205\n   macro avg       0.71      0.69      0.68     29205\nweighted avg       0.71      0.69      0.68     29205\n'

In [28]:
df_test = df_test[df_train.columns[:-1]]

In [29]:
# best_estimator, accuracy = handle_unbalanced_target(np.array(df_train.drop(['TARGET'], axis=1)), df_train['TARGET'])

In [30]:
predictions = best_estimator.predict(df_test)

In [31]:
data = predictions.tolist()

In [32]:
sample_submission = pd.read_csv(r'data/sample_submission.csv')
sample_submission

Unnamed: 0,ID,TARGET
0,2,0
1,5,0
2,6,0
3,7,0
4,9,0
...,...,...
75813,151831,0
75814,151832,0
75815,151833,0
75816,151834,0


In [33]:
sample_submission['TARGET']=data
sample_submission

Unnamed: 0,ID,TARGET
0,2,1
1,5,1
2,6,0
3,7,0
4,9,0
...,...,...
75813,151831,0
75814,151832,0
75815,151833,0
75816,151834,1


In [34]:
sample_submission.to_csv("data/submission.csv", index=False)

In [35]:
df_submission = pd.read_csv(r'data/submission.csv')
df_submission

Unnamed: 0,ID,TARGET
0,2,1
1,5,1
2,6,0
3,7,0
4,9,0
...,...,...
75813,151831,0
75814,151832,0
75815,151833,0
75816,151834,1
