In [4]:
import pandas as pd

# Load the dataset from the CSV file
file_path = 'bank-full.csv'
df = pd.read_csv(file_path, sep=';')

# Select the specified columns
columns_to_use = [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 
    'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y'
]
df_selected = df[columns_to_use]

# Check for missing values
missing_values = df_selected.isnull().sum()

# Display the missing values
print("Missing values in each feature:")
print(missing_values)


Missing values in each feature:
age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


**Question 1:** 
What is the most frequent observation (mode) for the column education?

In [6]:
# Find the mode of the 'education' column
education_mode = df_selected['education'].mode()[0]

# Display the result
print(f"The most frequent observation (mode) for the education column is: {education_mode}")


The most frequent observation (mode) for the education column is: secondary


**Question 2:** What are the two features that have the biggest correlation?

In [7]:
# Select numerical features from the dataset
numerical_features = df_selected[['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']]

# Compute the correlation matrix
correlation_matrix = numerical_features.corr()

# Display the correlation matrix
print("Correlation matrix:")
print(correlation_matrix)

# Find the pair of features with the highest correlation
correlation_pairs = correlation_matrix.unstack().sort_values(kind="quicksort", ascending=False)

# Exclude the diagonal (correlation of features with themselves)
correlation_pairs = correlation_pairs[correlation_pairs < 1]

# Display the two features with the highest correlation
highest_correlation = correlation_pairs.idxmax()
highest_correlation_value = correlation_pairs.max()

print(f"The two features with the biggest correlation are: {highest_correlation} with a correlation of {highest_correlation_value:.2f}")


Correlation matrix:
               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000
The two features with the biggest correlation are: ('pdays', 'previous') with a correlation of 0.45


**Target encoding**

In [10]:
# Replace 'yes' with 1 and 'no' with 0 in the 'y' column
df_selected.loc[:, 'y'] = df_selected['y'].apply(lambda x: 1 if x == 'yes' else 0)

# Display the first few rows to confirm the encoding
df_selected[['y']].head()



Unnamed: 0,y
0,0
1,0
2,0
3,0
4,0


In [17]:
# Check the unique values in the 'y' column before encoding
df_selected_original = pd.read_csv(file_path, sep=';')
df_selected_original['y'].value_counts()


y
no     39922
yes     5289
Name: count, dtype: int64

In [18]:
# Check unique values in the 'y' column
unique_values = df_selected['y'].unique()

# Display the unique values
print("Unique values in the 'y' column:", unique_values)


Unique values in the 'y' column: [0 1]


**Split the data**

In [19]:
from sklearn.model_selection import train_test_split

# Separate the features (X) from the target (y)
X = df_selected.drop(columns=['y'])
y = df_selected['y']

# First, split into 60% training and 40% remaining data
X_train, X_rem, y_train, y_rem = train_test_split(X, y, test_size=0.4, random_state=42)

# Then, split the remaining 40% into 20% validation and 20% test data
X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)

# Display the shape of each split to confirm
print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)
print("Test set shape:", X_test.shape, y_test.shape)


Training set shape: (27126, 16) (27126,)
Validation set shape: (9042, 16) (9042,)
Test set shape: (9043, 16) (9043,)


**Question 3:** Which variable has the biggest mutual information score?

In [21]:
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

# Select categorical features from the training set
categorical_features = ['contact', 'education', 'housing', 'poutcome']
X_train_categorical = X_train[categorical_features]

# One-hot encode categorical variables
X_train_encoded = pd.get_dummies(X_train_categorical, drop_first=True)

# Calculate the mutual information scores
mi_scores = mutual_info_classif(X_train_encoded, y_train, random_state=42)

# Create a DataFrame to display the scores alongside the features
mi_scores_df = pd.DataFrame({
    'Feature': X_train_encoded.columns,
    'Mutual Information Score': mi_scores
})

# Extract the original categorical variable names by splitting on the first underscore
mi_scores_df['Variable'] = mi_scores_df['Feature'].str.split('_').str[0]

# Group by the original categorical variables and sum their MI scores
mi_scores_summed = mi_scores_df.groupby('Variable')['Mutual Information Score'].sum()

# Round the scores to 2 decimals
mi_scores_summed = mi_scores_summed.round(2)

# Sort the scores in descending order to find the variable with the highest MI score
mi_scores_summed_sorted = mi_scores_summed.sort_values(ascending=False)

# Display the mutual information scores
print("Mutual Information Scores:")
print(mi_scores_summed_sorted)

# Identify the variable with the highest mutual information score
highest_mi_variable = mi_scores_summed_sorted.idxmax()
highest_mi_score = mi_scores_summed_sorted.max()

print(f"\nThe variable with the highest mutual information score is '{highest_mi_variable}' with a score of {highest_mi_score}.")

Mutual Information Scores:
Variable
poutcome     0.04
housing      0.02
education    0.01
contact      0.01
Name: Mutual Information Score, dtype: float64

The variable with the highest mutual information score is 'poutcome' with a score of 0.04.


**Question 4:** What accuracy did you get?

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd

# Step 1: One-hot encode categorical variables in the training and validation sets
categorical_features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

# One-hot encode categorical variables
X_train_encoded = pd.get_dummies(X_train, columns=categorical_features, drop_first=True)
X_val_encoded = pd.get_dummies(X_val, columns=categorical_features, drop_first=True)

# Ensure the same columns are present in both the training and validation sets (in case any categories are missing)
X_val_encoded = X_val_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Convert 'default' and 'loan' columns from 'yes'/'no' to 1/0
X_train_encoded['default'] = X_train_encoded['default'].apply(lambda x: 1 if x == 'yes' else 0)
X_train_encoded['loan'] = X_train_encoded['loan'].apply(lambda x: 1 if x == 'yes' else 0)

X_val_encoded['default'] = X_val_encoded['default'].apply(lambda x: 1 if x == 'yes' else 0)
X_val_encoded['loan'] = X_val_encoded['loan'].apply(lambda x: 1 if x == 'yes' else 0)

# Check the data types after conversion
print("Data types in training set after conversion:")
print(X_train_encoded.dtypes)

# Step 2: Train the logistic regression model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

# Step 3: Predict and calculate the accuracy on the validation set
y_val_pred = model.predict(X_val_encoded)

# Calculate the accuracy
accuracy = accuracy_score(y_val, y_val_pred)

# Round the accuracy to 2 decimal places
accuracy_rounded = round(accuracy, 2)

# Display the result
print(f"The accuracy on the validation dataset is: {accuracy_rounded}")


Data types in training set after conversion:
age                    int64
default                int64
balance                int64
loan                   int64
day                    int64
duration               int64
campaign               int64
pdays                  int64
previous               int64
job_blue-collar         bool
job_entrepreneur        bool
job_housemaid           bool
job_management          bool
job_retired             bool
job_self-employed       bool
job_services            bool
job_student             bool
job_technician          bool
job_unemployed          bool
job_unknown             bool
marital_married         bool
marital_single          bool
education_secondary     bool
education_tertiary      bool
education_unknown       bool
housing_yes             bool
contact_telephone       bool
contact_unknown         bool
month_aug               bool
month_dec               bool
month_feb               bool
month_jan               bool
month_jul               boo

**Question 5:** Which feature has the smallest difference?

In [34]:
import re

# Function to drop all columns related to a specific feature (for one-hot encoded variables)
def drop_feature_columns(feature, X):
    # Use regular expressions to find all columns related to the feature
    columns_to_drop = [col for col in X.columns if re.match(f"^{feature}_", col) or col == feature]
    return X.drop(columns=columns_to_drop)

# Step 1: Train a baseline model with all features
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

# Predict and calculate the baseline accuracy on the validation set
y_val_pred = model.predict(X_val_encoded)
baseline_accuracy = accuracy_score(y_val, y_val_pred)

# Initialize a dictionary to store the accuracies without each feature
accuracy_diffs = {}

# List of features to test (age, balance, marital, previous)
features_to_test = ['age', 'balance', 'marital', 'previous']

# Step 2: Iterate over each feature, exclude it, and train a model without it
for feature in features_to_test:
    # Drop the feature and all associated one-hot encoded columns
    X_train_without_feature = drop_feature_columns(feature, X_train_encoded)
    X_val_without_feature = drop_feature_columns(feature, X_val_encoded)
    
    # Train a new model without the selected feature
    model_without_feature = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_without_feature.fit(X_train_without_feature, y_train)
    
    # Predict and calculate the accuracy on the validation set
    y_val_pred_without_feature = model_without_feature.predict(X_val_without_feature)
    accuracy_without_feature = accuracy_score(y_val, y_val_pred_without_feature)
    
    # Calculate the difference between the baseline accuracy and the accuracy without the feature
    accuracy_diff = baseline_accuracy - accuracy_without_feature
    
    # Store the difference in the dictionary
    accuracy_diffs[feature] = accuracy_diff

# Step 3: Display the differences in accuracy for each feature
print("Accuracy differences for each feature:")
for feature, diff in accuracy_diffs.items():
    print(f"{feature}: {round(diff, 4)}")

# Step 4: Find the feature with the smallest difference
least_useful_feature = min(accuracy_diffs, key=accuracy_diffs.get)
smallest_difference = accuracy_diffs[least_useful_feature]

print(f"\nThe feature with the smallest difference in accuracy is '{least_useful_feature}' with a difference of {round(smallest_difference, 4)}.")


Accuracy differences for each feature:
age: 0.0
balance: -0.0003
marital: -0.0012
previous: -0.0012

The feature with the smallest difference in accuracy is 'marital' with a difference of -0.0012.


**Question 6:** Which of these C leads to the best accuracy on the validation set?

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd

# List of C values to test
C_values = [0.01, 0.1, 1, 10, 100]

# Dictionary to store the accuracies for each C value
accuracies = {}

# Iterate over each value of C
for C in C_values:
    # Train the logistic regression model with the current value of C
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_encoded, y_train)
    
    # Predict and calculate the accuracy on the validation set
    y_val_pred = model.predict(X_val_encoded)
    accuracy = accuracy_score(y_val, y_val_pred)
    
    # Round the accuracy to 3 decimal places
    accuracy_rounded = round(accuracy, 3)
    
    # Store the accuracy in the dictionary
    accuracies[C] = accuracy_rounded

# Display the accuracies for each value of C
print("Validation accuracy for each C value:")
for C, accuracy in accuracies.items():
    print(f"C = {C}: Accuracy = {accuracy}")

# Find the value of C with the highest accuracy
best_C = max(accuracies, key=accuracies.get)
best_accuracy = accuracies[best_C]

print(f"\nThe value of C that leads to the best accuracy on the validation set is {best_C} with an accuracy of {best_accuracy}.")


Validation accuracy for each C value:
C = 0.01: Accuracy = 0.897
C = 0.1: Accuracy = 0.899
C = 1: Accuracy = 0.9
C = 10: Accuracy = 0.901
C = 100: Accuracy = 0.901

The value of C that leads to the best accuracy on the validation set is 10 with an accuracy of 0.901.


In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score
import pandas as pd

# List of C values to test
C_values = [0.01, 0.1, 1, 10, 100]

# Initialize a list to store the accuracies for each C value
accuracies = []

# Step 1: Convert training and validation sets to dictionary format
categorical_features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
numerical_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Convert the selected features to dictionaries for DictVectorizer
train_dicts = X_train[categorical_features + numerical_features].to_dict(orient='records')
val_dicts = X_val[categorical_features + numerical_features].to_dict(orient='records')

# Step 2: Use DictVectorizer to convert dictionaries to numeric arrays
dv = DictVectorizer(sparse=False)

# Fit on the training data and transform both training and validation sets
X_train_transformed = dv.fit_transform(train_dicts)
X_val_transformed = dv.transform(val_dicts)

# Step 3: Iterate over each value of C and train a logistic regression model
for C in C_values:
    # Train the logistic regression model with the current value of C
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_transformed, y_train)
    
    # Predict probabilities on the validation set and classify using a 0.5 threshold
    y_val_pred_prob = model.predict_proba(X_val_transformed)[:, 1]
    y_val_pred = (y_val_pred_prob >= 0.5).astype(int)

    # Calculate accuracy on the validation set
    accuracy = (y_val == y_val_pred).mean()
    
    # Append the rounded accuracy to the list
    accuracies.append(round(accuracy, 3))

    # Print the accuracy for the current value of C
    print(f"The accuracy on validation set using C = {C} is {accuracy:.3f}.")

# Step 4: Find the best value of C based on accuracy
best_C_index = accuracies.index(max(accuracies))
best_C = C_values[best_C_index]
best_accuracy = accuracies[best_C_index]

print(f"\nThe value of C that leads to the best accuracy on the validation set is {best_C} with an accuracy of {best_accuracy:.3f}.")


The accuracy on validation set using C = 0.01 is 0.899.
The accuracy on validation set using C = 0.1 is 0.900.
The accuracy on validation set using C = 1 is 0.901.
The accuracy on validation set using C = 10 is 0.901.
The accuracy on validation set using C = 100 is 0.901.

The value of C that leads to the best accuracy on the validation set is 1 with an accuracy of 0.901.
