## Import Modules

In [176]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

## Loading the dataset

In [209]:
df = pd.read_csv('bank-additional.csv', sep=';')
df.head(20)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no
5,32,services,single,university.degree,no,no,no,cellular,sep,thu,...,3,999,2,failure,-1.1,94.199,-37.5,0.884,4963.6,no
6,32,admin.,single,university.degree,no,yes,no,cellular,sep,mon,...,4,999,0,nonexistent,-1.1,94.199,-37.5,0.879,4963.6,no
7,41,entrepreneur,married,university.degree,unknown,yes,no,cellular,nov,mon,...,2,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no
8,31,services,divorced,professional.course,no,no,no,cellular,nov,tue,...,1,999,1,failure,-0.1,93.2,-42.0,4.153,5195.8,no
9,35,blue-collar,married,basic.9y,unknown,no,no,telephone,may,thu,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no


## Data Cleaning

In [178]:
# Check the structure of the dataset
print("Dataset shape:", df.shape)
print("Columns:", df.columns)

# Check for missing values
print("Missing values:")
print(df.isnull().sum())

# Drop any rows with missing values
df = df.dropna()

# Check for duplicate rows
print("Duplicate rows:", df.duplicated().sum())

# Remove duplicate rows
df = df.drop_duplicates()

# Check for inconsistent data types
print("Data types:")
print(df.dtypes)

# Convert categorical variables to appropriate data types
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'y']
df[categorical_columns] = df[categorical_columns].astype('category')

# Convert binary variables to 0s and 1s
df['default'] = df['default'].map({'no': 0, 'yes': 1, 'unknown': -2})
df['housing'] = df['housing'].map({'no': 0, 'yes': 1, 'unknown': -2})
df['loan'] = df['loan'].map({'no': 0, 'yes': 1, 'unknown': -2})
columns_to_fill = ['emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
df[columns_to_fill] = df[columns_to_fill].fillna(0.0)

# Check the cleaned dataset
print("Cleaned dataset shape:", df.shape)
print("Cleaned dataset columns:", df.columns)
print("Cleaned dataset data types:")
print(df.dtypes)


Dataset shape: (4119, 21)
Columns: Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')
Missing values:
age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64
Duplicate rows: 0
Data types:
age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact  

## Data Preprocessing

In [180]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split


# Separate features and target variable
X = df.drop('y', axis=1)
y = df['y']

# Encode categorical variables
categorical_columns = ['job', 'marital', 'education', 'contact', 'month', 'day_of_week', 'poutcome']
encoder = LabelEncoder()
X[categorical_columns] = X[categorical_columns].apply(encoder.fit_transform)

# Scale numerical variables if necessary
numerical_columns = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx',
                     'cons.conf.idx', 'euribor3m', 'nr.employed']
scaler = StandardScaler()
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the processed dataset
print("Processed dataset:")
print(X_train.head())

# Save the processed dataset to new files
X_train.to_csv("X_train.csv", index=False)
X_test.to_csv("X_test.csv", index=False)
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)


Processed dataset:
           age  job  marital  education default housing loan  contact  month  \
324  -0.689832    1        1          3       0       1    0        1      6   
2350 -0.689832    6        2          6       0       0    0        1      6   
1391  1.152662    0        1          3       0       0    0        1      6   
468  -0.786805    7        2          3       0      -2   -2        0      6   
561  -0.107991   10        1          6       0       1    0        1      6   

      day_of_week  duration  campaign     pdays  previous  poutcome  \
324             4 -0.093406 -0.598660  0.201031 -0.351356         1   
2350            0 -0.647058 -0.598660  0.201031 -0.351356         1   
1391            4  1.516502 -0.209228  0.201031 -0.351356         1   
468             4  0.417053 -0.598660  0.201031  1.494607         0   
561             2 -0.348636 -0.209228  0.201031 -0.351356         1   

      emp.var.rate  cons.price.idx  cons.conf.idx  euribor3m  nr.employed

## Feature Engineering

In [182]:
import pandas as pd

# Read the processed dataset
X_train = pd.read_csv("X_train.csv")
X_test = pd.read_csv("X_test.csv")

# Feature Engineering
# Example: Create a new feature 'contact_month' by combining 'contact' and 'month' columns
X_train['contact_month'] = X_train['contact'].astype(str) + '_' + X_train['month'].astype(str)
X_test['contact_month'] = X_test['contact'].astype(str) + '_' + X_test['month'].astype(str)

# Print the updated datasets
print("Updated training dataset:")
print(X_train.head())
print("\nUpdated testing dataset:")
print(X_test.head())

# Save the updated datasets to new files
X_train.to_csv("X_train_updated.csv", index=False)
X_test.to_csv("X_test_updated.csv", index=False)


Updated training dataset:
        age  job  marital  education  default  housing  loan  contact  month  \
0 -0.689832    1        1          3        0        1     0        1      6   
1 -0.689832    6        2          6        0        0     0        1      6   
2  1.152662    0        1          3        0        0     0        1      6   
3 -0.786805    7        2          3        0       -2    -2        0      6   
4 -0.107991   10        1          6        0        1     0        1      6   

   day_of_week  ...  campaign     pdays  previous  poutcome  emp.var.rate  \
0            4  ... -0.598660  0.201031 -0.351356         1      0.649441   
1            0  ... -0.598660  0.201031 -0.351356         1      0.649441   
2            4  ... -0.209228  0.201031 -0.351356         1      0.649441   
3            4  ... -0.598660  0.201031  1.494607         0     -1.206054   
4            2  ... -0.209228  0.201031 -0.351356         1      0.649441   

   cons.price.idx  cons.conf.i

## Model Selection

In [184]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the updated datasets
X_train = pd.read_csv("X_train_updated.csv")
y_train = pd.read_csv("y_train.csv")
X_test = pd.read_csv("X_test_updated.csv")
y_test = pd.read_csv("y_test.csv")

# Train and evaluate models
models = { "Logistic Regression": LogisticRegression(),"Random Forest": RandomForestClassifier()}

# Evaluate models using cross-validation
for model_name, model in models.items():
    scores = cross_val_score(model, X_train, y_train.values.ravel(), cv=5, scoring="accuracy")
    print(f"{model_name} Cross-Validation Accuracy: {scores.mean():.4f}")

# Tune hyperparameters of the selected model using GridSearchCV
param_grid = {
    "C": [0.1, 1.0, 10.0],
    "solver": ["liblinear", "saga"]
}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring="accuracy")
grid_search.fit(X_train, y_train.values.ravel())

print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the selected model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='yes')
recall = recall_score(y_test, y_pred, pos_label='yes')
f1 = f1_score(y_test, y_pred, pos_label='yes')

print("Test Set Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Logistic Regression Cross-Validation Accuracy: 0.9162
Random Forest Cross-Validation Accuracy: 0.9126
Best Hyperparameters: {'C': 0.1, 'solver': 'saga'}
Best Accuracy: 0.9171471927162367
Test Set Performance:
Accuracy: 0.9053
Precision: 0.6296
Recall: 0.3696
F1 Score: 0.4658


 ## Creating Pipeline

In [185]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the Bank Marketing dataset
data = pd.read_csv("bank-additional.csv", delimiter=";")

# Separate features and target variable
X = data.drop('y', axis=1)
y = data['y']

# Define categorical and numerical column names
categorical_columns = ['job', 'marital', 'education', 'contact', 'month', 'day_of_week', 'poutcome']
numerical_columns = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx',
                     'cons.conf.idx', 'euribor3m', 'nr.employed']

# Define preprocessing steps for categorical and numerical features
preprocessing = ColumnTransformer([
    ('cat', OneHotEncoder(), categorical_columns),
    ('num', StandardScaler(), numerical_columns)
])

# Define the pipeline
pipeline = Pipeline([
    ('preprocess', preprocessing),
    ('model', RandomForestClassifier())
])

# Define hyperparameter grid for tuning
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 5, 10]
}

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

# Print the best hyperparameters and corresponding accuracy
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the best model on the entire dataset
y_pred = grid_search.predict(X)

accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred, pos_label='yes')
recall = recall_score(y, y_pred, pos_label='yes')
f1 = f1_score(y, y_pred, pos_label='yes')

print("Overall Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Best Hyperparameters: {'model__max_depth': None, 'model__n_estimators': 100}
Best Accuracy: nan
Overall Performance:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000


In [206]:
y_train['y'] = y_train['y'].map({'no': 0, 'yes': 1})

In [197]:
model = RandomForestRegressor()
model.fit(X_train,y_train)


RandomForestRegressor()

In [198]:
testing_data_prediction = model.predict(X_test)

In [207]:
y_test['y'] = y_test['y'].map({'no': 0, 'yes': 1})

In [204]:
score = metrics.r2_score(y_test,testing_data_prediction)
score

0.3575384176764078

In [205]:
import pickle

filename = 'finalpro.sav'
pickle.dump(model, open(filename,'wb')) 