In [None]:
import os
import pandas as pd
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.model_selection import train_test_split
from google.colab import drive

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Directory where the processed files will be stored
directory = '/content/drive/My Drive/data/processed'

# Create the directory if it does not exist. exist_ok=True makes the call
# idempotent, so no separate os.path.exists() check is needed.
os.makedirs(directory, exist_ok = True)

# Load the raw dataset; the file is semicolon-delimited.
total_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/logistic-regression-project-tutorial/main/bank-marketing-campaign-data.csv", sep = ";")

# List the columns that were loaded.
print(total_data.columns)

Mounted at /content/drive
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')


In [None]:
# Remove fully duplicated rows and renumber the index from 0 in a single call.
total_data = total_data.drop_duplicates(ignore_index = True)
total_data.head()

In [None]:
# Count the missing values in each column (isna is the same check as isnull).
total_data.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [None]:
# Integer-encode the text columns, then Min-Max scale every model column

from sklearn.preprocessing import MinMaxScaler

# New encoded column -> source text column. The insertion order is the order
# in which the new columns are appended to the end of the df.
encoded_columns = {
    "job_n": "job",
    "marital_n": "marital",
    "education_n": "education",
    "default_n": "default",
    "housing_n": "housing",
    "loan_n": "loan",
    "contact_n": "contact",
    "month_n": "month",
    "day_of_week_n": "day_of_week",
    "poutcome_n": "poutcome",
    "y_n": "y",
}

# pd.factorize returns (codes, uniques); only the integer codes are kept.
for encoded_name, source_name in encoded_columns.items():
    total_data[encoded_name] = pd.factorize(total_data[source_name])[0]

# Check that the encoded columns were added
print(total_data.columns)

# Columns passed to the scaler: the encoded categoricals, the numeric
# columns, and the encoded target ("y_n") last.
num_variables = [
    "job_n", "marital_n", "education_n", "default_n", "housing_n", "loan_n",
    "contact_n", "month_n", "day_of_week_n", "poutcome_n",
    "age", "duration", "campaign", "pdays", "previous", "emp.var.rate",
    "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed", "y_n",
]

scaler = MinMaxScaler()

# Scaled values for the selected columns.
# NOTE(review): the scaler is fitted on every row, before any train/test
# split, so rows that later end up in the test set influence the scaling range.
scal_features = scaler.fit_transform(total_data[num_variables])

# Wrap the scaled values in a DataFrame that reuses total_data's row index and
# the num_variables labels; the original text columns are not carried over.
total_data_scal = pd.DataFrame(scal_features, index = total_data.index, columns = num_variables)
total_data_scal.head()

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y', 'job_n', 'marital_n',
       'education_n', 'default_n', 'housing_n', 'loan_n', 'contact_n',
       'month_n', 'day_of_week_n', 'poutcome_n', 'y_n'],
      dtype='object')


Unnamed: 0,job_n,marital_n,education_n,default_n,housing_n,loan_n,contact_n,month_n,day_of_week_n,poutcome_n,...,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y_n
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.05307,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
1,0.090909,0.0,0.142857,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.030297,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
2,0.090909,0.0,0.142857,0.0,0.5,0.0,0.0,0.0,0.0,0.0,...,0.045954,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
3,0.181818,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.030704,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
4,0.090909,0.0,0.142857,0.0,0.0,0.5,0.0,0.0,0.0,0.0,...,0.062424,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0


In [None]:
# Feature selection

# Separate the predictors from the target column "y_n".
X = total_data_scal.drop(columns = "y_n")
y = total_data_scal["y_n"]

# Hold out 10% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

# Keep the 5 predictors with the highest chi2 score, fitted on the training split only.
selection_model = SelectKBest(chi2, k = 5)
selection_model.fit(X_train, y_train)

# Boolean mask over the predictor columns marking the selected ones.
ix = selection_model.get_support()
selected_names = X_train.columns.values[ix]

# Rebuild labelled DataFrames from the transformed values. No index is passed,
# so both frames get a fresh 0..n-1 index instead of the original row labels.
X_train_sel = pd.DataFrame(selection_model.transform(X_train), columns = selected_names)
X_test_sel = pd.DataFrame(selection_model.transform(X_test), columns = selected_names)

X_train_sel.head()


Unnamed: 0,poutcome_n,previous,emp.var.rate,euribor3m,nr.employed
0,0.0,0.0,1.0,0.98073,1.0
1,0.0,0.0,0.333333,0.138291,0.512287
2,0.0,0.0,0.9375,0.956926,0.859735
3,0.0,0.0,0.9375,0.957379,0.859735
4,0.0,0.0,0.333333,0.175924,0.512287


In [None]:
# Preview the first five rows of the reduced test set.
X_test_sel.head(5)

Unnamed: 0,poutcome_n,previous,emp.var.rate,euribor3m,nr.employed
0,0.0,0.0,0.104167,0.143278,0.425709
1,0.5,0.142857,0.333333,0.150759,0.512287
2,0.0,0.0,0.479167,0.073679,0.0
3,0.5,0.142857,0.333333,0.150759,0.512287
4,0.0,0.0,1.0,0.98141,1.0


In [None]:
# Re-attach the target to each reduced feature frame. The values are assigned
# as plain lists, i.e. by position: the feature frames carry a fresh 0..n-1
# index, so index-based alignment with y_train / y_test is deliberately avoided.
for features, target in ((X_train_sel, y_train), (X_test_sel, y_test)):
    features["y_n"] = target.tolist()

# Save to Google Drive
X_train_sel.to_csv("/content/drive/My Drive/data/processed/clean_train.csv", index = False)
X_test_sel.to_csv("/content/drive/My Drive/data/processed/clean_test.csv", index = False)

In [None]:
# Reload the processed splits from Google Drive.
train_data = pd.read_csv("/content/drive/My Drive/data/processed/clean_train.csv")
# The held-out rows are stored in clean_test.csv. Reading clean_train.csv here
# would turn every "test" metric computed later into a training metric.
test_data = pd.read_csv("/content/drive/My Drive/data/processed/clean_test.csv")

train_data.head()

Unnamed: 0,poutcome_n,previous,emp.var.rate,euribor3m,nr.employed,y_n
0,0.0,0.0,1.0,0.98073,1.0,0.0
1,0.0,0.0,0.333333,0.138291,0.512287,0.0
2,0.0,0.0,0.9375,0.956926,0.859735,0.0
3,0.0,0.0,0.9375,0.957379,0.859735,0.0
4,0.0,0.0,0.333333,0.175924,0.512287,0.0


In [None]:
# Split each reloaded frame into predictors (X_*) and target (y_*).
target_column = "y_n"

X_train = train_data.drop(columns = target_column)
y_train = train_data[target_column]

X_test = test_data.drop(columns = target_column)
y_test = test_data[target_column]

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with the library's default hyperparameters.
model = LogisticRegression()
model.fit(X = X_train, y = y_train)

In [None]:
# Predicted class for every row of X_test.
y_pred = model.predict(X = X_test)
y_pred

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of rows whose predicted class equals the true class.
accuracy_score(y_true = y_test, y_pred = y_pred)

0.8951373522586216

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the inverse regularization strength C.
regularization_strengths = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# The search space is a list of sub-grids so that only solver/penalty pairs
# LogisticRegression accepts are fitted. A single cartesian grid over all
# penalties and solvers includes pairs that raise on every fold (for example
# "l1" with "lbfgs"), which wastes fits and floods the output with warnings.
hyperparams = [
    {"penalty": ["l1"], "solver": ["liblinear", "saga"], "C": regularization_strengths},
    {"penalty": ["l2"], "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"], "C": regularization_strengths},
    # C has no effect when no penalty is applied, so it is not searched here.
    {"penalty": [None], "solver": ["newton-cg", "lbfgs", "sag", "saga"]},
    # "elasticnet" is left out: it is only supported by "saga" and needs an
    # l1_ratio value. To search it, add a sub-grid such as
    # {"penalty": ["elasticnet"], "solver": ["saga"], "l1_ratio": [...], "C": [...]}.
]

# 10-fold cross-validated search scored on accuracy.
grid = GridSearchCV(model, hyperparams, scoring = "accuracy", cv = 10)
grid

In [None]:
import warnings

# Silence warnings only while the search runs. catch_warnings() restores the
# previous filters on exit, so warnings raised by later cells stay visible
# (overwriting warnings.warn would mute them for the whole kernel session).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid.fit(X_train, y_train)

print(f"Best hyperparameters: {grid.best_params_}")

Best hyperparameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}


In [None]:
# Refit using the hyperparameters reported by the grid search.
best_params = {"C": 0.1, "penalty": "l2", "solver": "liblinear"}
model = LogisticRegression(**best_params)
model.fit(X = X_train, y = y_train)

In [None]:
# Predictions of the tuned model for every row of X_test.
y_pred = model.predict(X = X_test)
y_pred

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
# Accuracy of the tuned model's predictions against y_test.
accuracy_score(y_true = y_test, y_pred = y_pred)

0.8969453289438178

In [None]:

from pickle import dump

# Serialize the tuned model to Google Drive. The context manager guarantees
# the file handle is flushed and closed even if pickling raises.
with open("/content/drive/My Drive/data/processed/logistic_regression_C-0.1_penalty-l2_solver-liblinear_42.sav", "wb") as model_file:
    dump(model, model_file)