In [None]:
import pandas as pd
import numpy as np
import zipfile
import requests
import io
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Download the dataset
url = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"
# Use a timeout so a stalled connection cannot hang the script, and fail
# loudly on HTTP errors instead of trying to unzip an error page.
r = requests.get(url, timeout=60)
r.raise_for_status()

# The context manager closes the in-memory archive once extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()
    # The downloaded bundle may ship the CSV inside a nested archive rather
    # than at the top level; if so, pull 'bank-full.csv' out of the first
    # inner .zip that contains it.
    if 'bank-full.csv' not in z.namelist():
        for member in z.namelist():
            if not member.lower().endswith('.zip'):
                continue
            with zipfile.ZipFile(io.BytesIO(z.read(member))) as inner:
                if 'bank-full.csv' in inner.namelist():
                    inner.extract('bank-full.csv')
                    break

# Step 2: Load the dataset into a pandas DataFrame
# The file is semicolon-delimited.
df = pd.read_csv('bank-full.csv', sep=';')

# Inspect the dataset
print(df.head())
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None" line).
df.info()

# Q1: Mode for the column `education`
# Series.mode() returns a Series of the most frequent value(s); take the entry
# labelled 0 as the answer.
education_modes = df['education'].mode()
education_mode = education_modes[0]
print(f"Mode of the 'education' column: {education_mode}")

# Q2: Two features with the biggest correlation
# Correlation is only defined for numeric data; calling DataFrame.corr() on a
# frame that still contains string columns raises on recent pandas versions,
# so restrict the matrix to the numeric columns first.
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()

# Calculate correlation between the given pairs
# Absolute values are compared so that strong negative correlations count too.
# The values are read from the matrix computed above instead of being
# recomputed pair by pair.
candidate_pairs = [
    ('age', 'balance'),
    ('day', 'campaign'),
    ('day', 'pdays'),
    ('pdays', 'previous'),
]
correlation_values = {
    f"{first}_{second}": abs(correlation_matrix.loc[first, second])
    for first, second in candidate_pairs
}

biggest_correlation_pair = max(correlation_values, key=correlation_values.get)
print(f"Two features with the biggest correlation: {biggest_correlation_pair}")

# Q3: Variable with the biggest mutual information score
X = df.drop(columns=['y'])  # Independent variables
y = LabelEncoder().fit_transform(df['y'])  # Dependent variable (converted to binary)

# Convert categorical variables to numeric using Label Encoding.
# Only non-numeric columns are encoded; numeric columns keep their original
# values (encoding them as well would replace real magnitudes with rank codes).
categorical_columns = X.select_dtypes(exclude='number').columns
X_encoded = X.copy()
for column in categorical_columns:
    # A fresh encoder per column so each column gets its own code mapping.
    X_encoded[column] = LabelEncoder().fit_transform(X[column])

# Calculate mutual information.
# The label-encoded columns are flagged as discrete so they are not treated as
# continuous measurements, and the seed makes the estimate reproducible.
discrete_mask = X_encoded.columns.isin(categorical_columns)
mi_scores = mutual_info_classif(
    X_encoded,
    y,
    discrete_features=discrete_mask,
    random_state=42,
)

# Get the variable names
mi_series = pd.Series(mi_scores, index=X.columns)
mi_series_sorted = mi_series.sort_values(ascending=False)
print(f"Variable with the biggest mutual information score: {mi_series_sorted.idxmax()}")

# Q4: Accuracy on the validation dataset
# Split data into training and validation sets: 30% of the rows are held out,
# and the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

# Train a Logistic Regression model.
# max_iter is raised from the library default of 100 to the value used by the
# C sweep in Q6, so both models are fitted under the same settings and the
# solver has room to converge on unscaled features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy (fraction of validation rows predicted correctly)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy on the validation dataset: {accuracy:.2f}")

# Q5: Feature with the smallest difference (in mean)
# For each candidate feature, compare its mean among rows with y == 'yes'
# against its mean among rows with y == 'no'.
is_yes = df['y'] == 'yes'
is_no = df['y'] == 'no'

differences = {}
for col in ['age', 'balance', 'marital', 'previous']:
    values = df[col]
    if not pd.api.types.is_numeric_dtype(values):
        # A mean is undefined for string categories (calling .mean() on them
        # raises), so map them to integer codes first.
        # NOTE(review): a mean over label codes of a nominal category is only
        # a rough proxy, and the differences are on each feature's own scale.
        # If the intended metric is the accuracy change from dropping each
        # feature, this section needs a different computation -- confirm.
        values = pd.Series(LabelEncoder().fit_transform(values), index=df.index)
    diff = abs(values[is_yes].mean() - values[is_no].mean())
    differences[col] = diff

smallest_difference_feature = min(differences, key=differences.get)
print(f"Feature with the smallest difference: {smallest_difference_feature}")

# Q6: Smallest `C` that leads to the best accuracy on the validation set
C_values = [0.01, 0.1, 1, 10, 100]

# First pass: fit one model per regularization strength and record its
# validation accuracy.
accuracy_by_C = {}
for C in C_values:
    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    accuracy_by_C[C] = accuracy
    print(f"C={C}, Accuracy={accuracy:.2f}")

# Second pass: pick the winner. The comparison is strict, so when several
# values tie, the one tried first (the smallest C) is kept.
best_accuracy, best_C = 0, None
for candidate_C, candidate_accuracy in accuracy_by_C.items():
    if candidate_accuracy > best_accuracy:
        best_accuracy, best_C = candidate_accuracy, candidate_C

print(f"Smallest C that leads to the best accuracy: {best_C}")
