# Loading the Data

In [None]:
import pandas as pd
import numpy as np
# Read the competition's train and test CSVs from the Kaggle input directory.
X_train  = pd.read_csv("/kaggle/input/predict-the-success-of-bank-telemarketing/train.csv")
X_test = pd.read_csv("/kaggle/input/predict-the-success-of-bank-telemarketing/test.csv")
# Separate the label from the features; pop() removes "target" from X_train in place.
y_train = X_train.pop("target")

# Data Preprocessing

## Getting Some Information about the data

In [None]:
# Column names, non-null counts and dtypes of the training features.
X_train.info()

In [None]:
# Preview the first rows of the training features.
X_train.head()

From the output above we can see that the columns ('job','marital','education','default','housing','loan','contact','poutcome') are categorical, while the others ('last contact date','age','balance','duration','campaign','pdays','previous') are mostly numeric.

## Imputing Missing Values

First let's check for the missing values

In [None]:
# Number of missing values per column in the training features.
X_train.isna().sum()

In [None]:
# Number of missing values per column in the test features.
X_test.isna().sum()

Both `X_train` and `X_test` have missing values in the columns ('job','education','contact','poutcome'). Since all of these features are categorical, we will use `SimpleImputer` with `strategy='constant'` to fill the gaps with a placeholder category.

In [None]:
# Frequency of each job category in the training set
# (missing values are left out because value_counts drops NaN by default).
X_train['job'].value_counts()

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Categorical columns that are imputed in both splits.
cols_with_gaps = ['job', 'education', 'contact', 'poutcome']

# Every missing entry is replaced by the constant placeholder category "other".
si = SimpleImputer(strategy="constant", fill_value="other")
ct = ColumnTransformer(transformers=[('impute', si, cols_with_gaps)])

# Fit on the training split only, then apply the same transformer to the test split.
# The transformer returns just the listed columns (other columns are dropped by default).
trans_cols_train = ct.fit_transform(X_train)
trans_cols_test = ct.transform(X_test)

# Write the imputed columns back into the original frames.
X_train[cols_with_gaps] = trans_cols_train
X_test[cols_with_gaps] = trans_cols_test

In [None]:
# Total number of missing cells left in the training features after imputation.
X_train.isna().sum().sum()

In [None]:
# Total number of missing cells left in the test features after imputation.
X_test.isna().sum().sum()

As we can see, the `SimpleImputer` filled in all the missing values successfully.

# Exploratory Data Analysis

## Descriptive Statistical Analysis

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
X_train.describe()

In [None]:
# Class distribution of the target; used to judge how imbalanced the labels are.
y_train.value_counts()

## Data Visualization

### Visualizing Correlation Between The Features

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

categorical_cols = ['job','marital','education','default','housing','loan','contact','poutcome']

# Correlate only the numeric features. The categorical columns are removed by
# name and any remaining non-numeric column (the contact date) is removed by
# dtype, so the cell no longer depends on that column being the first one.
numeric_corr = (
    X_train
    .drop(columns=categorical_cols)
    .select_dtypes(include="number")
    .corr()
)

# Draw on the explicitly created axes so the requested figure size is the one used.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(numeric_corr, annot=True, fmt=".2f", ax=ax)
ax.set_title("Correlation between numeric features");

In [None]:
# One 100-bin histogram per numeric column; the trailing ';' hides the text repr of the returned axes.
X_train.hist(bins=100,figsize=(12,12));

## Encoding Categorical Features 
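We will use the `OrdinalEncoder` to turn the categorical variables into integer codes. Note that these features are nominal, so the order of the codes carries no meaning; a `OneHotEncoder` would avoid imposing an artificial order, which matters most for the linear models below.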

In [None]:
from sklearn.preprocessing import OrdinalEncoder

categorical_cols = ['job','marital','education','default','housing','loan','contact','poutcome']

# A category that occurs only in the test split is encoded as -1 instead of
# making ct.transform(X_test) raise (the encoder's default is to error out).
oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
ct = ColumnTransformer(transformers=[
    ('oe', oe, categorical_cols)
])

# Learn the category -> code mapping on the training split only.
trans_cols_train = ct.fit_transform(X_train)
trans_cols_test = ct.transform(X_test)

# Plain column assignment replaces the string columns with float columns.
# (Assigning through .loc[:, cols] writes into the existing object-dtype
# columns and, depending on the pandas version, can leave them as object.)
X_train[categorical_cols] = trans_cols_train
X_test[categorical_cols] = trans_cols_test
new_cols = ct.named_transformers_['oe'].get_feature_names_out(categorical_cols)

In [None]:
# Encode the label as 1 ('yes') / 0 ('no').
# Series.map turns every value that is not a key of the dict into NaN, so
# running this cell a second time (on the already-encoded labels) yields all-NaN.
y_train = y_train.map({'yes': 1, 'no': 0})

## Dropping Unnecessary Features

In [None]:
# Remove the raw contact-date column from both splits; it is not used as a model feature.
X_train = X_train.drop(columns=["last contact date"])
X_test = X_test.drop(columns=["last contact date"])

## Scaling the data

In [None]:
from sklearn.preprocessing import StandardScaler

# Remember the column labels: the scaler returns bare arrays.
feature_names = X_train.columns

# Standardise every column using statistics learned from the training split only,
# then apply the same shift/scale to the test split.
stdsc = StandardScaler()
stdsc.fit(X_train)
X_train = pd.DataFrame(stdsc.transform(X_train), columns=feature_names)
X_test = pd.DataFrame(stdsc.transform(X_test), columns=feature_names)

# Handling Imbalance in the Data

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample with SMOTE using a fixed seed so the resampled set is reproducible.
sm = SMOTE(random_state=21)
# NOTE(review): the whole training set is resampled here, so any hold-out or
# CV fold that is later carved out of X_res/y_res will contain synthetic rows
# and its score will tend to be optimistic.
X_res, y_res = sm.fit_resample(X_train, y_train)
# Shape of the feature matrix after resampling.
X_res.shape

# Model - 1 Logistic Regression & Logistic Regression with CV

In [None]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [None]:
# Hold out half of the ORIGINAL rows first (stratified, so both halves keep the
# class ratio), then oversample only the training half. Splitting the SMOTE
# output instead would put synthetic points, interpolated from training rows,
# into the validation set and inflate every validation score.
X_train_sub, X_train_val, y_train_sub, y_train_val = train_test_split(
    X_train, y_train, train_size=0.5, random_state=21, stratify=y_train
)
X_train_sub, y_train_sub = SMOTE(random_state=21).fit_resample(X_train_sub, y_train_sub)

In [None]:
# Elastic-net logistic regression; C and l1_ratio are selected by 6-fold
# cross-validation over the grid below.
Cs = np.logspace(-3, 3, 21)          # 21 inverse regularisation strengths from 1e-3 to 1e3
l1_ratios = np.linspace(0, 1, 12)    # 0 = pure L2 penalty, 1 = pure L1 penalty
log_reg_cv = LogisticRegressionCV(
    penalty='elasticnet',
    solver='saga',                   # the solver that supports the elastic-net penalty
    max_iter=10000,
    Cs=Cs,
    l1_ratios=l1_ratios,
    cv=6,
    random_state=21,                 # saga shuffles the data; seed it so the fit is reproducible
    n_jobs=-1)
log_reg_cv.fit(X_train_sub, y_train_sub);

In [None]:
# Regularisation strength and elastic-net mixing ratio selected by the cross-validation above.
log_reg_cv.C_, log_reg_cv.l1_ratio_

In [None]:
# Score the logistic regression on the hold-out half.
# Macro-averaged F1 is used here because every other model in this notebook is
# scored with average="macro"; the default (binary F1 of the positive class)
# would not be comparable with those numbers.
y_pred = log_reg_cv.predict(X_train_val)
f1_score(y_train_val, y_pred, average="macro")

# Model 2 - Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes with default settings, fitted on the full resampled training set.
gb = GaussianNB()
gb.fit(X_res,y_res)

In [None]:
from sklearn.model_selection import cross_val_predict

# Score on out-of-fold predictions instead of predicting the very rows the
# model was fitted on, which only measures how well it memorised them.
# cross_val_predict works on a clone of `gb`, so the fitted model is untouched.
# NOTE(review): X_res already contains SMOTE rows, so this is still likely to
# be somewhat optimistic compared with a score on untouched data.
y_pred = cross_val_predict(gb, X_res, y_res, cv=5)
f1_score(y_res,y_pred,average="macro")

# Model 3 - Stochastic Gradient Descent Classifier 

We will be using the `SGDClassifier` with Hyper-Parameter tuning using `GridSearchCV`

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, cross_val_predict

sgd = SGDClassifier(random_state=21)
# Exhaustive search over loss function, iteration cap and intercept usage.
param_grid = {
    'loss': ['hinge', 'log_loss', 'perceptron'],
    'max_iter': [1000, 2000, 3000],
    'fit_intercept': [True, False]
}
gcv = GridSearchCV(param_grid=param_grid, estimator=sgd)
gcv.fit(X_res,y_res)

# Out-of-fold predictions for every row of X_res, made with the best
# configuration. Predicting X_res with the refitted model would score the model
# on its own training rows. y_pred keeps the same length as y_res, so the
# scoring cells that follow work unchanged.
# NOTE(review): the hyper-parameters were chosen on the same rows and X_res
# contains SMOTE rows, so this estimate is still likely to be optimistic.
y_pred = cross_val_predict(gcv.best_estimator_, X_res, y_res, cv=5)

In [None]:
# Hyper-parameter combination with the best mean cross-validation score.
gcv.best_params_

In [None]:
# Macro-averaged F1 of `y_pred` from the SGD cell against the resampled labels `y_res`.
f1_score(y_res,y_pred,average="macro")

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Rows are the true classes of the resampled set, columns the SGD predictions.
cm = confusion_matrix(y_true=y_res, y_pred=y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

# Model 4 - `RandomForestClassifier`

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'n_estimators': [400,500,600],
    'max_depth': [40,50,60,70]
}

# Seed both the forest and the search so that Restart & Run All gives the same
# model and the same submission: the search samples 10 of the 12 combinations
# by default, and each forest draws random bootstraps and feature subsets.
# n_jobs=-1 only parallelises the fits; it does not change the result.
rf_grid_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=21),
    param_grid,
    random_state=21,
    n_jobs=-1,
)
rf_grid_search.fit(X_train_sub, y_train_sub)

In [None]:
# Sampled hyper-parameter combination with the best mean cross-validation score.
rf_grid_search.best_params_

In [None]:
# Predict the hold-out half with the refitted best forest and report macro-averaged F1.
y_pred = rf_grid_search.predict(X_train_val)
f1_score(y_true=y_train_val, y_pred=y_pred, average="macro")

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# y_pred holds the forest's predictions for X_train_val, so it must be compared
# with y_train_val. Comparing it with y_train_sub pairs unrelated rows and
# either raises on a length mismatch or produces a meaningless matrix.
cm = confusion_matrix(y_train_val, y_pred)
disp = ConfusionMatrixDisplay(cm)
disp.plot()

In [None]:
# Map the numeric class codes back to the label strings used in the training target.
label_names = {0: 'no', 1: 'yes'}

# Predict the test split with the tuned random forest.
y_pred_test = rf_grid_search.predict(X_test)

# One row per test sample; the row position in X_test serves as the id.
final_ans = pd.DataFrame({"id": X_test.index, "target": y_pred_test})
final_ans = final_ans.assign(target=final_ans['target'].map(label_names))
final_ans.to_csv('submission.csv', index=False)