In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB , GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, cross_validate 
import joblib

In [2]:
# Lift pandas' display truncation for both rows and columns.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [3]:
# Load the raw data; the relative path means Dataset7.csv must be in the
# kernel's working directory.
# NOTE(review): the saved run of this cell ended in FileNotFoundError, so none
# of the cells below were executed in the stored notebook.
df = pd.read_csv("Dataset7.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'Dataset7.csv'

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# List the available column names.
df.columns

In [None]:
# Remove five columns that the rest of the notebook does not use.
unused_columns = ["emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Cast the numeric features to float in a single astype call.
numeric_columns = ["age", "duration", "campaign", "pdays", "previous"]
df = df.astype(dict.fromkeys(numeric_columns, float))

In [None]:
# Inspect column dtypes and non-null counts after the float cast.
df.info()

In [None]:
# Print the frequency table of every object-dtype column, each followed by a
# separator line, to spot placeholder values such as 'unknown'.
for col in df.select_dtypes(include="object"):
    print(df[col].value_counts(), end="\n-------------------------------\n")

In [None]:
# Treat the literal 'unknown' as a missing value in these four columns.
columns_with_unknown = ['job', 'education', 'housing', 'loan']
for col in columns_with_unknown:
    df[col] = df[col].mask(df[col] == 'unknown', np.nan)

In [None]:
# Discard rows whose marital status is 'unknown', then renumber the index.
unknown_marital_rows = df.index[df['marital'] == 'unknown']
df.drop(index=unknown_marital_rows, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Remove the 'default' column, then drop rows that are exact duplicates once
# that column is gone (the index is not renumbered here).
df.drop(columns="default", inplace=True)
df.drop_duplicates(inplace=True)

In [None]:
# Summary statistics of the frame after the first cleaning steps.
df.describe()

In [None]:
# Distribution of 'age' with a box plot in the margin.
px.histogram(df, x='age', marginal='box')

In [None]:
# Rescale 'duration' by 1/60 (vectorised division instead of a per-row lambda).
# NOTE(review): presumably seconds -> minutes; re-running this cell divides again.
df['duration'] = df["duration"] / 60

In [None]:
# Distribution of the rescaled 'duration', before outlier removal.
px.histogram(df, x='duration', marginal='box')

In [None]:
# (rows, columns) of the subset with duration above 25 - the rows about to be dropped.
df[df["duration"] > 25].shape

In [None]:
# Remove rows with duration above 25 and renumber the index.
long_call_rows = df.index[df["duration"] > 25]
df.drop(index=long_call_rows, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Distribution of 'duration' again, now without the rows above 25.
px.histogram(df, x='duration', marginal='box')

In [None]:
# Distribution of 'campaign', coloured by the target 'y', before outlier removal.
px.histogram(df, x='campaign', marginal='box', color='y')

In [None]:
# (rows, columns) of the subset with campaign above 15 - the rows about to be dropped.
df[df["campaign"] > 15].shape

In [None]:
# Remove rows with campaign above 15 and renumber the index.
high_campaign_rows = df.index[df["campaign"] > 15]
df.drop(index=high_campaign_rows, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Distribution of 'campaign' by target after removing the rows above 15.
px.histogram(df, x='campaign', marginal='box', color='y')

In [None]:
# Distribution of 'pdays'; the column is dropped in the next cell.
px.histogram(df, x='pdays', marginal='box')

In [None]:
# Remove 'pdays' and drop the rows that become exact duplicates without it
# (the index is not renumbered here).
df.drop(columns='pdays', inplace=True)
df.drop_duplicates(inplace=True)

In [None]:
# Distribution of the raw 'previous' values, coloured by the target 'y'.
px.histogram(df, x='previous', marginal='box', color='y')

In [None]:
# (rows, columns) of the subset where 'previous' equals 0.
df[df["previous"] == 0].shape

In [None]:
# Collapse 'previous' into a 0/1 flag: 1 when the value is positive, else 0.
df['previous'] = df['previous'].gt(0).astype('int64')

In [None]:
# Distribution of the binarised 'previous' flag by target.
px.histogram(df, x='previous', marginal='box', color='y')

In [None]:
# Counts of each 'poutcome' category, coloured by the target 'y'.
px.histogram(df, x='poutcome', marginal='box', color='y')

In [None]:
# Print the value counts of 'previous' and 'poutcome' one after the other so
# the two columns can be compared before 'previous' is dropped in the next cell.
print(df["previous"].value_counts(), end="\n-----------\n")
print(df["poutcome"].value_counts())

In [None]:
# Remove 'previous' and drop the rows that become exact duplicates without it
# (the index is not renumbered here).
df.drop(columns='previous', inplace=True)
df.drop_duplicates(inplace=True)

In [None]:
# Re-check dtypes and non-null counts after the column drops.
df.info()

In [None]:
# Missing values per column before any rows are dropped for NaNs.
df.isnull().sum()

In [None]:
# Drop rows that lack a value in 'housing' or 'job', then renumber the index
# so that it runs 0..n-1 again.
required_columns = ["housing", "job"]
df.dropna(subset=required_columns, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Missing values per column after dropping rows without 'housing' or 'job'.
df.isnull().sum()

In [None]:
# Frequency of each education level before imputation (NaN is not counted).
df.education.value_counts()

In [None]:
# Counts of each education level, coloured by the target 'y'.
px.histogram(df, x='education', color="y")

In [None]:
# Ordinal codes for 'education', listed from lowest to highest level.
# The ranking follows the category order given to the OrdinalEncoder for
# 'education' (illiterate < basic.4y < basic.6y < basic.9y < ...). The codes are
# averaged by the KNN imputer and rounded afterwards, so 'basic.4y' has to sit
# below 'basic.6y' for those averages to be meaningful.
education_mapping = {
    'illiterate': 0,
    'basic.4y': 1,
    'basic.6y': 2,
    'basic.9y': 3,
    'high.school': 4,
    'professional.course': 5,
    'university.degree': 6,
}
# Inverse lookup (code -> label) used to decode the rounded imputed codes.
reversed_education_mapping = {code: label for label, code in education_mapping.items()}

In [None]:
# KNN imputer that fills a missing value from the 5 nearest rows.
imputer = KNNImputer(n_neighbors=5)

In [None]:
# Replace education labels by their numeric codes so the imputer can use them;
# NaN stays NaN. Not idempotent: running this cell twice maps the codes to NaN.
df['education'] = df['education'].map(education_mapping)

In [None]:
# Impute missing education codes with KNN over 'age' and 'education'.
# The imputer returns a bare array, so the result is wrapped with df's own index:
# the later column assignment matches rows by index label, and reusing df.index
# keeps each imputed row on the row it came from even if df's index is not 0..n-1.
features_for_imputation = ['age', 'education']
df_imputed_subset = pd.DataFrame(
    imputer.fit_transform(df[features_for_imputation]),
    columns=features_for_imputation,
    index=df.index,
)

In [None]:
# Write the imputed columns back into df. Values are matched by index label,
# so df_imputed_subset and df need to share the same index.
df[features_for_imputation] = df_imputed_subset

In [None]:
# Round each (possibly fractional) imputed code to the nearest integer, then
# translate the integer codes back to their education labels.
rounded_education_codes = df['education'].apply(round)
df['education'] = rounded_education_codes.map(reversed_education_mapping)

In [None]:
# Frequency of each education level after imputation.
df['education'].value_counts()

In [None]:
# Pairwise correlation between the non-object columns.
df.select_dtypes(exclude='object').corr()

In [None]:
# Preview the frame after cleaning and imputation.
df.head()

In [None]:
# Remaining missing values per column before the final dropna.
df.isna().sum()

In [None]:
# Drop every row that still has a missing value in any column, then renumber
# the index.
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
#oh_encoder = OneHotEncoder(sparse_output=False, drop='first')
#encoded_array = oh_encoder.fit_transform(df[['job', 'marital', 'contact', 'poutcome', 'month', 'day_of_week']])
#encoded_df = pd.DataFrame(encoded_array, columns=oh_encoder.get_feature_names_out(['job', 'marital', 'contact', 'poutcome', 'month', 'day_of_week']))
#df = df.drop(['job', 'marital', 'contact', 'poutcome', 'month', 'day_of_week'], axis=1).join(encoded_df)

In [None]:
# Binary target: 1 where y is "yes", 0 for anything else.
# Not idempotent: a second run compares the 0/1 values with "yes" and yields all zeros.
df['y'] = df['y'].eq("yes").astype('int64')

In [None]:
# Preprocessing shared by every model:
#   - one-hot encode the nominal columns, dropping the first level of each
#   - ordinal-encode 'education' in ascending order, and the two no/yes flags
#   - pass all remaining columns through untouched
nominal_columns = ['job', 'marital', 'contact', 'poutcome', 'month', 'day_of_week']
education_levels = ['illiterate', 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'professional.course', 'university.degree']
no_yes = ['no', 'yes']

column_transformer = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(sparse_output=False, drop='first'), nominal_columns),
        ("oe_education", OrdinalEncoder(categories=[education_levels]), ['education']),
        ("oe_housing", OrdinalEncoder(categories=[no_yes]), ["housing"]),
        ("oe_loan", OrdinalEncoder(categories=[no_yes]), ["loan"]),
    ],
    remainder='passthrough',
)

In [None]:
# Hyper-parameter grids keyed by model short name. Every parameter carries the
# 'model__' prefix so it addresses the 'model' step of a Pipeline.
param_grids = {
    "TREE": {
        'model__max_depth': range(1, 30),
        'model__splitter': ['best', 'random'],
        'model__criterion': ['gini', 'entropy']
    },
    "SVM": {
        'model__C': [0.1, 1, 10],
        'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'model__gamma': ['scale', 'auto']
    },
    "MNB": {
        'model__alpha': [0.1, 1, 10],
        'model__fit_prior': [True, False]
    },
    "GNB": {
        'model__var_smoothing': [1e-9, 1e-8, 1e-7]
    },
    "LR": {
        'model__C': [0.1, 1, 10, 100],
        'model__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'model__max_iter': [100, 200, 300]
    },
    "KNN": {
        'model__n_neighbors': [5, 10, 15, 20],
        'model__weights': ['uniform', 'distance'],
        'model__metric': ['euclidean', 'manhattan', 'minkowski']
    },
    "XG": {
        'model__n_estimators': [20, 50, 100],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7]
    },
    "RF": {
        'model__n_estimators': [20, 50, 100],
        'model__criterion': ['gini', 'entropy'],
        # 'auto' is no longer accepted by recent scikit-learn releases; for a
        # classifier it meant the same as 'sqrt', so no candidate is lost.
        'model__max_features': ['sqrt', 'log2']
    },
    "Voting": {
        'model__voting': ['hard', 'soft'],
        'model__weights': [[1, 1, 1], [2, 1, 1], [1, 2, 1], [1, 1, 2]]
    }
}

In [None]:
# Separate the target column from the feature matrix.
y = df['y']
X = df.drop(columns='y')

In [None]:
# Candidate classifiers with fixed hyper-parameters, as (short name, estimator)
# pairs. Estimators that draw random numbers (tree splitting, forest
# bootstrapping, boosting) receive a fixed random_state so that repeated runs
# of the notebook report the same scores.
RANDOM_STATE = 42

models = [
    ("TREE", DecisionTreeClassifier(criterion='entropy', max_depth=2, splitter='random', random_state=RANDOM_STATE)),
    ("SVM", SVC(C=0.1, gamma='auto', kernel='rbf')),
    ("LR", LogisticRegression(C=0.1, max_iter=100, solver='newton-cg')),
    ("KNN", KNeighborsClassifier(metric='manhattan', n_neighbors=20, weights='uniform')),
    ("XG", XGBClassifier(learning_rate=0.01, max_depth=3, n_estimators=20, random_state=RANDOM_STATE)),
    ("RF", RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=20, random_state=RANDOM_STATE)),
    ("Voting", VotingClassifier(
        estimators=[
            ("TREE", DecisionTreeClassifier(random_state=RANDOM_STATE)),
            ("LR", LogisticRegression()),
            ("KNN", KNeighborsClassifier()),
        ],
        voting='hard',
        weights=[1, 2, 1],
    )),
]

In [None]:
# Save the feature matrix and the target to CSV files, without the index column.
for frame, csv_path in ((X, 'X.csv'), (y, 'y.csv')):
    frame.to_csv(csv_path, index=False)

In [None]:
# 5-fold cross-validation of every candidate behind the same preprocessing
# steps; prints the mean train and test accuracy for each model.
for model_name, model in models:
    steps = [
        ('column_transformer', column_transformer),
        ('scaler', MinMaxScaler()),
        ('model', model),
    ]
    pipeline = Pipeline(steps)

    scores = cross_validate(
        pipeline,
        X,
        y,
        cv=5,
        scoring="accuracy",
        return_train_score=True,
    )
    mean_train_accuracy = scores['train_score'].mean()
    mean_test_accuracy = scores['test_score'].mean()

    print(model_name)
    print("Train accuracy is ", mean_train_accuracy)
    print("Test accuracy is ", mean_test_accuracy)
    print("*" * 50)

In [None]:
# cross_validate fits clones of the pipeline, so at this point the
# `column_transformer` object itself has not been fitted. Fit it on the features
# first; otherwise the pickle holds an unfitted transformer that cannot
# transform new data after loading.
column_transformer.fit(X)
joblib.dump(column_transformer, 'column_transformer.pkl')

In [None]:
# Fit the three selected models on the full data set and persist each complete
# pipeline (preprocessing + scaler + model) to its own file. One loop replaces
# three copy-pasted stanzas; the output file names are unchanged.
final_models = {
    'logistic_regression_pipeline.pkl': LogisticRegression(C=0.1, max_iter=100, solver='newton-cg'),
    # random_state pins the random splitter so the saved tree can be reproduced.
    'decision_tree_pipeline.pkl': DecisionTreeClassifier(criterion='entropy', max_depth=2, splitter='random', random_state=42),
    'knn_pipeline.pkl': KNeighborsClassifier(metric='manhattan', n_neighbors=20, weights='uniform'),
}

for pickle_path, final_model in final_models.items():
    pipeline = Pipeline([
        ('column_transformer', column_transformer),
        ('scaler', MinMaxScaler()),
        ('model', final_model),
    ])
    pipeline.fit(X, y)
    joblib.dump(pipeline, pickle_path)
