In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv(r"C:\Users\hillarik\Desktop\MLzoomcamp\Classification\bank-full.csv", sep=';')


In [3]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [4]:
columns_to_keep = [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 
    'contact', 'day', 'month', 'duration', 'campaign', 
    'pdays', 'previous', 'poutcome', 'y']
df_selected = df[columns_to_keep]

Checking Missing values

In [5]:
df_selected.isnull().sum()


age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

Question one

In [6]:
education_mode = df_selected['education'].mode()[0]
print(f"Most frequent education: {education_mode}")




Most frequent education: secondary


Question two-Correlation Matrix for Numerical Features

In [7]:
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
correlation_matrix = df[numerical_columns].corr()
print(correlation_matrix)
highest_corr = correlation_matrix.unstack().sort_values(ascending=False)
highest_corr = highest_corr[highest_corr != 1]
print(highest_corr.head(1))


               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000
previous  pdays    0.45482
dtype: float64


Target Encoding

In [8]:
df_selected.loc[:, 'y'] = df_selected['y'].map({'yes': 1, 'no': 0})


Split

In [24]:
from sklearn.model_selection import train_test_split


In [25]:
X = df_selected.drop(columns=['y'])
y = df_selected['y']


In [40]:
features = ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 
            'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']
target = 'y'
X = df[features]   # Features (without 'y')
y = df[target] 
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

len(df_train), len(df_val), len(df_test)



(27126, 9042, 9043)

In [41]:
len(df_train), len(df_val), len(df_test)


(27126, 9042, 9043)

In [42]:
df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [43]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif

categorical_columns = ['contact', 'education', 'housing', 'poutcome']

label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    label_encoders[col] = le  
mi_scores = mutual_info_classif(X_train[categorical_columns], y_train, discrete_features=True)

mi_scores_rounded = [round(score, 2) for score in mi_scores]
mi_scores_rounded_df = pd.DataFrame({'Feature': categorical_columns, 'MI Score': mi_scores_rounded})

mi_scores_rounded_df_sorted = mi_scores_rounded_df.sort_values(by='MI Score', ascending=False)
print(mi_scores_rounded_df_sorted)
most_influential_feature = mi_scores_rounded_df_sorted.iloc[0]['Feature']
print(f'The feature with the highest MI score is: {most_influential_feature}')


     Feature  MI Score
3   poutcome      0.03
0    contact      0.01
2    housing      0.01
1  education      0.00
The feature with the highest MI score is: poutcome


QUESTION 4

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

In [45]:
categorical_columns = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']


In [60]:
for column in categorical_columns:
    print(f"{column} - Training Unique Categories: {X_train[column].unique()}")
    print(f"{column} - Validation Unique Categories: {X_val[column].unique()}\n")
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough')
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))])
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
accuracy_rounded = round(accuracy, 2)
print(f"Validation Accuracy: {accuracy_rounded}")


job - Training Unique Categories: ['blue-collar' 'technician' 'admin.' 'management' 'services' 'unemployed'
 'housemaid' 'retired' 'entrepreneur' 'unknown' 'student' 'self-employed']
job - Validation Unique Categories: ['blue-collar' 'technician' 'services' 'admin.' 'management'
 'self-employed' 'retired' 'unknown' 'unemployed' 'student' 'entrepreneur'
 'housemaid']

marital - Training Unique Categories: ['married' 'single' 'divorced']
marital - Validation Unique Categories: ['married' 'single' 'divorced']

education - Training Unique Categories: [0 1 2 3]
education - Validation Unique Categories: ['secondary' 'primary' 'tertiary' 'unknown']

housing - Training Unique Categories: [1 0]
housing - Validation Unique Categories: ['yes' 'no']

contact - Training Unique Categories: [2 0 1]
contact - Validation Unique Categories: ['cellular' 'unknown' 'telephone']

month - Training Unique Categories: ['may' 'jul' 'jun' 'apr' 'aug' 'nov' 'sep' 'feb' 'oct' 'jan' 'dec' 'mar']
month - Validation 



QUESTION 5

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer

# Step 1: Define the features and prepare the data
features = ['age', 'balance', 'marital', 'previous']
train_dict = df_train[features].to_dict(orient='records')
val_dict = df_val[features].to_dict(orient='records')

# Step 2: Vectorize the features
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

# Assuming y_train and y_val are defined and consistent with the data
# Step 3: Train the initial model with all features
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
original_accuracy = accuracy_score(y_val, y_pred)
print('Original Accuracy:', original_accuracy)

ValueError: Found input variables with inconsistent numbers of samples: [27126, 36168]

    QUESTION 6

In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load your dataset
# Make sure to adjust the path to where your dataset is located
df = pd.read_csv(r"C:\Users\hillarik\Desktop\MLzoomcamp\Classification\bank-full.csv", sep=';')

# Preprocess your data
# Map target variable to binary
df['y'] = df['y'].map({'yes': 1, 'no': 0})

# Define features and target
features = df.columns.drop('y')  # Use all columns except the target
X = df[features]
y = df['y']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert categorical features to a suitable format
dv = DictVectorizer(sparse=False)
X_train_dict = X_train.to_dict(orient='records')
X_val_dict = X_val.to_dict(orient='records')
X_train_vectorized = dv.fit_transform(X_train_dict)
X_val_vectorized = dv.transform(X_val_dict)

# Regularization parameter values
C_values = [0.01, 0.1, 1, 10, 100]
best_accuracy = 0
best_C = None

# Iterate over C values to train the model and evaluate accuracy
for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_vectorized, y_train)
    
    # Predict probabilities on the validation set
    y_pred_proba = model.predict_proba(X_val_vectorized)[:, 1]
    
    # Convert probabilities to binary predictions
    y_pred = y_pred_proba >= 0.5
    
    # Calculate accuracy
    accuracy = round(accuracy_score(y_val, y_pred), 3)
    print(f'Accuracy for C={C}: {accuracy:.3f}')
    
    # Keep track of the best accuracy and corresponding C value
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_C = C

print(f'Best C: {best_C} with accuracy: {best_accuracy:.3f}')


Accuracy for C=0.01: 0.897
Accuracy for C=0.1: 0.899
Accuracy for C=1: 0.899
Accuracy for C=10: 0.899
Accuracy for C=100: 0.899
Best C: 0.1 with accuracy: 0.899
