### 1. Importar les llibreries

In [75]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hashlib
import pickle
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, make_scorer

### 2. Carregar el dataset

In [46]:
df = pd.read_csv('/Users/nicolakorff/Desktop/ML/IT_Academy/bank_dataset_wins.CSV')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59.0,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56.0,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41.0,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55.0,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54.0,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


### 3. Divisió del dataset

In [47]:
# para modelos de clasificación, antes de dividir el conjunto, hay que analizar el balance entre las clases
# Dividir el dataset en conjuntos de entrenamiento y prueba
train, test = train_test_split(df, test_size=0.2, random_state=22)
print("Tamaño del conjunto de entrenamiento (train):", train.shape)
print("Tamaño del conjunto de prueba (test):", test.shape)

Tamaño del conjunto de entrenamiento (train): (8929, 17)
Tamaño del conjunto de prueba (test): (2233, 17)


In [48]:
# Guardar los conjuntos en archivos CSV
train.to_csv('bank_dataset_train.csv', index=False)
test.to_csv('bank_dataset_test.csv', index=False)
# Chequear las primeras filas de ambos conjuntos
train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
843,44.0,technician,married,secondary,no,267,no,no,cellular,22,aug,520,2,-1,0,unknown,yes
1899,47.0,admin.,married,secondary,no,663,yes,yes,cellular,12,may,409,1,274,6,other,yes
7459,56.0,retired,married,tertiary,no,5769,no,no,unknown,20,jun,15,3,-1,0,unknown,no
2346,54.0,management,married,tertiary,no,1464,no,no,cellular,30,jun,157,1,-1,0,unknown,yes
5673,40.0,admin.,single,unknown,no,355,yes,no,cellular,7,apr,345,2,138,6,other,no


In [49]:
test.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
1904,37.0,admin.,single,primary,no,912,yes,no,cellular,12,may,637,3,328,22,other,yes
3113,52.0,services,married,secondary,no,659,no,no,cellular,28,jan,390,4,93,1,success,yes
1281,71.0,retired,married,tertiary,no,653,no,no,telephone,26,feb,367,1,-1,0,unknown,yes
7856,29.0,blue-collar,married,primary,no,59,yes,yes,cellular,29,jul,665,4,-1,0,unknown,no
6757,28.0,blue-collar,married,primary,no,278,yes,yes,telephone,15,may,31,9,373,4,other,no


### 4. Transformar variables categóriques

In [50]:
# Label Encoding para 'education', 'month', 'poutcome' 
# Ver los valores únicos en la columna 'month'
unique_month = train['month'].unique()
print(unique_month)

['aug' 'may' 'jun' 'apr' 'jul' 'nov' 'jan' 'feb' 'mar' 'oct' 'sep' 'dec']


In [51]:
# Uso de OrdinalEncoder
train2=train
# Orden específico
month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
# Configuramos el OrdinalEncoder con el orden deseado
ordinal_encoder = OrdinalEncoder(categories=[month_order])
# Ajustamos y transformamos la columna
train['month_encoded'] = ordinal_encoder.fit_transform(train[['month']])
train2.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit,month_encoded
843,44.0,technician,married,secondary,no,267,no,no,cellular,22,aug,520,2,-1,0,unknown,yes,7.0
1899,47.0,admin.,married,secondary,no,663,yes,yes,cellular,12,may,409,1,274,6,other,yes,4.0
7459,56.0,retired,married,tertiary,no,5769,no,no,unknown,20,jun,15,3,-1,0,unknown,no,5.0
2346,54.0,management,married,tertiary,no,1464,no,no,cellular,30,jun,157,1,-1,0,unknown,yes,5.0
5673,40.0,admin.,single,unknown,no,355,yes,no,cellular,7,apr,345,2,138,6,other,no,3.0


In [52]:
# Ver los valores únicos en la columna 'education'
unique_education = train2['education'].unique()
print(unique_education)

['secondary' 'tertiary' 'unknown' 'primary' nan]


In [53]:
# Uso de mapeo manual
# Especificamos el orden que queremos con un diccionario
education_map = {'primary': 1, 'secondary': 2, 'tertiary': 3, 'unknown': 0}

# Aplicamos el mapeo a la columna
train2['education_encoded'] = train2['education'].map(education_map)
train2.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit,month_encoded,education_encoded
843,44.0,technician,married,secondary,no,267,no,no,cellular,22,aug,520,2,-1,0,unknown,yes,7.0,2.0
1899,47.0,admin.,married,secondary,no,663,yes,yes,cellular,12,may,409,1,274,6,other,yes,4.0,2.0
7459,56.0,retired,married,tertiary,no,5769,no,no,unknown,20,jun,15,3,-1,0,unknown,no,5.0,3.0
2346,54.0,management,married,tertiary,no,1464,no,no,cellular,30,jun,157,1,-1,0,unknown,yes,5.0,3.0
5673,40.0,admin.,single,unknown,no,355,yes,no,cellular,7,apr,345,2,138,6,other,no,3.0,0.0


In [54]:
# Ver los valores únicos en la columna 'poutcome'
unique_poutcome = train2['poutcome'].unique()
print(unique_poutcome)

['unknown' 'other' 'success' 'failure']


In [55]:
# Uso de mapeo manual
# Especificamos el orden que queremos con un diccionario
poutcome_map = {'success': 3, 'failure': 2, 'other': 1, 'unknown': 0}

# Aplicamos el mapeo a la columna
train2['poutcome_encoded'] = train2['poutcome'].map(education_map)
train2.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit,month_encoded,education_encoded,poutcome_encoded
843,44.0,technician,married,secondary,no,267,no,no,cellular,22,aug,520,2,-1,0,unknown,yes,7.0,2.0,0.0
1899,47.0,admin.,married,secondary,no,663,yes,yes,cellular,12,may,409,1,274,6,other,yes,4.0,2.0,
7459,56.0,retired,married,tertiary,no,5769,no,no,unknown,20,jun,15,3,-1,0,unknown,no,5.0,3.0,0.0
2346,54.0,management,married,tertiary,no,1464,no,no,cellular,30,jun,157,1,-1,0,unknown,yes,5.0,3.0,0.0
5673,40.0,admin.,single,unknown,no,355,yes,no,cellular,7,apr,345,2,138,6,other,no,3.0,0.0,


In [56]:
# One-Hot Encoding para 'loan', 'housing', 'deafult', 'deposit', 'marital', 'job'
# Nuevo DataFrame
train3 = train2

In [57]:
# Utilitzar get_dummies per a 'loan', 'housing', 'deafult', 'deposit'
train3 = pd.get_dummies(train3, columns=['loan'], prefix='loan')
train3 = pd.get_dummies(train3, columns=['housing'], prefix='housing')
train3 = pd.get_dummies(train3, columns=['default'], prefix='default')
train3 = pd.get_dummies(train3, columns=['deposit'], prefix='deposit')
train3.head()

Unnamed: 0,age,job,marital,education,balance,contact,day,month,duration,campaign,...,education_encoded,poutcome_encoded,loan_no,loan_yes,housing_no,housing_yes,default_no,default_yes,deposit_no,deposit_yes
843,44.0,technician,married,secondary,267,cellular,22,aug,520,2,...,2.0,0.0,True,False,True,False,True,False,False,True
1899,47.0,admin.,married,secondary,663,cellular,12,may,409,1,...,2.0,,False,True,False,True,True,False,False,True
7459,56.0,retired,married,tertiary,5769,unknown,20,jun,15,3,...,3.0,0.0,True,False,True,False,True,False,True,False
2346,54.0,management,married,tertiary,1464,cellular,30,jun,157,1,...,3.0,0.0,True,False,True,False,True,False,False,True
5673,40.0,admin.,single,unknown,355,cellular,7,apr,345,2,...,0.0,,True,False,False,True,True,False,True,False


In [71]:
train4=train3
# Inicializamos el OneHotEncoder
encoder = OneHotEncoder(drop='first')  # 'drop="first"' elimina una categoría para evitar multicolinealidad

# Aplicamos el encoder a la columna 'marital' y lo convertimos en un array
marital_encoded = encoder.fit_transform(train4[['marital']]).toarray()

# Creamos un DataFrame con las columnas codificadas y las unimos al DataFrame original
marital_encoded_train4 = pd.DataFrame(marital_encoded, columns=encoder.get_feature_names_out(['marital']))
train4 = pd.concat([train4, marital_encoded_train4], axis=1)

In [72]:
# Inicializamos el OneHotEncoder
encoder = OneHotEncoder(drop='first')  # 'drop="first"' elimina una categoría para evitar multicolinealidad

# Aplicamos el encoder a la columna 'marital' y lo convertimos en un array
job_encoded = encoder.fit_transform(train4[['job']]).toarray()

# Creamos un DataFrame con las columnas codificadas y las unimos al DataFrame original
job_encoded_train4 = pd.DataFrame(job_encoded, columns=encoder.get_feature_names_out(['job']))
train4 = pd.concat([train4, job_encoded_train4], axis=1)

In [74]:
train4.head()

Unnamed: 0,age,job,marital,education,balance,contact,day,month,duration,campaign,...,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,job_nan
843,44.0,technician,married,secondary,267.0,cellular,22.0,aug,520.0,2.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1899,47.0,admin.,married,secondary,663.0,cellular,12.0,may,409.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7459,56.0,retired,married,tertiary,5769.0,unknown,20.0,jun,15.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2346,54.0,management,married,tertiary,1464.0,cellular,30.0,jun,157.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5673,40.0,admin.,single,unknown,355.0,cellular,7.0,apr,345.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### 5. Estandarització o Normalització de les dades

In [None]:
import pandas
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
# Inicializamos el StandardScaler
scale = StandardScaler()
# Seleccionamos las columnas
X = df[['Weight', 'Volume']]

scaledX = scale.fit_transform(X)

print(scaledX)

### 6. Reduir la dimensionalitat