In [25]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

In [6]:
df = pd.read_csv('bank-full.csv', sep=';')

In [8]:
select_columns = [
    "age", "job", "marital", "education", "balance", "housing", 
    "contact", "day", "month", "duration", "campaign", 
    "pdays", "previous", "poutcome", "y"
]

In [11]:
df = df[select_columns]

In [13]:
df.head().T

Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5
month,may,may,may,may,may
duration,261,151,76,92,198


In [14]:
df.education.mode()

0    secondary
Name: education, dtype: object

In [17]:
numeric_col = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']

In [18]:
# Replacing 'yes'/'no' in the 'y' column with 1/0
df['y'] = df['y'].replace({'yes': 1, 'no': 0})

# Computing the correlation matrix for numerical features
correlation_matrix = df[numeric_col].corr()

# Finding the two features with the highest correlation
biggest_correlation = correlation_matrix.unstack().sort_values(ascending=False).drop_duplicates()
highest_corr_pair = biggest_correlation[biggest_correlation < 1].idxmax()

highest_corr_pair, biggest_correlation[highest_corr_pair]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['y'] = df['y'].replace({'yes': 1, 'no': 0})


(('previous', 'pdays'), 0.4548196354805043)

In [19]:
correlation_matrix

Unnamed: 0,age,balance,day,campaign,pdays,previous
age,1.0,0.097783,-0.00912,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,0.16249,-0.093044,-0.05171
campaign,0.00476,-0.014578,0.16249,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,-0.032855,0.45482,1.0


In [20]:
from sklearn.model_selection import train_test_split

# Separating the features and the target variable
X = df.drop(columns=['y'])
y = df['y']

# Splitting the data into train (60%), val (20%), and test (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Showing the shapes of the resulting splits
(X_train.shape, X_val.shape, X_test.shape), (y_train.shape, y_val.shape, y_test.shape)

(((27126, 14), (9042, 14), (9043, 14)), ((27126,), (9042,), (9043,)))

In [21]:
categorical_cols = df.select_dtypes(include=['object']).columns


In [24]:
from sklearn.preprocessing import LabelEncoder

# Applying label encoding to the categorical columns in the training set
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    label_encoders[col] = le  # Saving the encoder for future use

# Computing the mutual information score again after encoding
mi_scores = mutual_info_classif(X_train[categorical_cols], y_train)

# Creating a dictionary of feature names and their mutual information scores
mi_scores_dict = {col: round(score, 2) for col, score in zip(categorical_cols, mi_scores)}

mi_scores_dict

{'job': 0.0,
 'marital': 0.01,
 'education': 0.0,
 'housing': 0.01,
 'contact': 0.01,
 'month': 0.02,
 'poutcome': 0.03}

In [29]:
# Modifying the encoder to handle unknown categories
encoder = OneHotEncoder(sparse=False, drop='first', handle_unknown='ignore')

# One-hot encoding the categorical columns for the training set
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])

# Combining encoded categorical features with the numerical features for the training set
X_train_combined = pd.concat([pd.DataFrame(X_train_encoded), X_train.drop(columns=categorical_cols).reset_index(drop=True)], axis=1)

# Applying the same transformation to the validation set
X_val_encoded = encoder.transform(X_val[categorical_cols])
X_val_combined = pd.concat([pd.DataFrame(X_val_encoded), X_val.drop(columns=categorical_cols).reset_index(drop=True)], axis=1)

# Defining the Logistic Regression model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Converting all column names to strings for compatibility with the model
X_train_combined.columns = X_train_combined.columns.astype(str)
X_val_combined.columns = X_val_combined.columns.astype(str)

# Fitting the model on the training data again
model.fit(X_train_combined, y_train)

# Predicting on the validation data
y_val_pred = model.predict(X_val_combined)

# Calculating the accuracy score on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)

# Rounding the accuracy to 2 decimal places
val_accuracy_rounded = round(val_accuracy, 2)

val_accuracy_rounded





0.87

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24',
       '25', '26', '27', '28', '29', '30', '31', '32', 'age', 'balance', 'day',
       'duration', 'campaign', 'pdays', 'previous'],
      dtype='object')

In [42]:
# Function to calculate accuracy without a specific feature
def calculate_accuracy_without_specific_feature(feature, X_train_combined, X_val_combined, y_train, y_val):
    # Dropping the specific feature from both train and validation sets
    X_train_reduced = X_train_combined.drop(columns=[feature])
    X_val_reduced = X_val_combined.drop(columns=[feature])
    
    # Training the model without the feature
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_reduced, y_train)
    
    # Predicting on the validation set
    y_val_pred = model.predict(X_val_reduced)
    
    # Calculating the accuracy on the validation set
    return accuracy_score(y_val, y_val_pred)

# Features to test for elimination
features_to_test = X_train_combined.columns

# Calculating accuracy difference for each feature
feature_differences_test = {}
for feature in features_to_test:
    accuracy_without_feature = calculate_accuracy_without_specific_feature(feature, X_train_combined, X_val_combined, y_train, y_val)
    difference = 1 - accuracy_without_feature
    feature_differences_test[feature] = round(difference, 4)

# Finding the feature with the smallest difference
least_useful_feature_test = min(feature_differences_test, key=feature_differences_test.get)

least_useful_feature_test, feature_differences_test[least_useful_feature_test]


('27', 0.1146)

In [31]:
# Получаем новые имена столбцов после кодирования
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)

# Выводим новые имена столбцов
print(encoded_feature_names)


['job_1' 'job_2' 'job_3' 'job_4' 'job_5' 'job_6' 'job_7' 'job_8' 'job_9'
 'job_10' 'job_11' 'marital_1' 'marital_2' 'education_1' 'education_2'
 'education_3' 'housing_1' 'contact_1' 'contact_2' 'month_1' 'month_2'
 'month_3' 'month_4' 'month_5' 'month_6' 'month_7' 'month_8' 'month_9'
 'month_10' 'month_11' 'poutcome_1' 'poutcome_2' 'poutcome_3']


In [32]:
# List of regularization parameter values to try
C_values = [0.01, 0.1, 1, 10, 100]

# Dictionary to store accuracy for each C value
accuracy_for_C = {}

# Loop over each value of C and train the model
for C in C_values:
    # Defining the Logistic Regression model with the current value of C
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    
    # Fitting the model on the training data
    model.fit(X_train_combined, y_train)
    
    # Predicting on the validation data
    y_val_pred = model.predict(X_val_combined)
    
    # Calculating the accuracy score on the validation set
    val_accuracy = accuracy_score(y_val, y_val_pred)
    
    # Storing the accuracy rounded to 3 decimal digits
    accuracy_for_C[C] = round(val_accuracy, 3)

# Finding the best C value (the one with the highest accuracy)
best_C = max(accuracy_for_C, key=accuracy_for_C.get)

best_C, accuracy_for_C[best_C]


(1, 0.875)