## Índice
1. [Importación de librerías](#1-importación-de-librerías)
2. [Carga del dataset](#2-carga-del-dataset)
3. [Dividir el conjunto de datos](#3-dividir-el-conjunto-de-datos)
4. [Codificar variables categóricas](#4-codificar-variables-categóricas)
5. [Estandarizar las características](#5-estandarizar-las-características)
6. [Reducir la dimensionalidad del conjunto de datos](#6-reducir-la-dimensionalidad-del-conjunto-de-datos)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Librerias

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#Dataset

In [None]:
bank = pd.read_csv('/content/drive/MyDrive/bank_dataset.csv')

In [None]:
bank.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'deposit'],
      dtype='object')

In [None]:
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59.0,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56.0,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41.0,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55.0,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54.0,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [None]:
# Ver los valores únicos de cada columna categórica
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome', 'deposit']

for column in categorical_columns:
    unique_values = bank[column].unique()
    print(f"Valores únicos en '{column}': {unique_values}\n")

Valores únicos en 'job': ['admin.' 'technician' 'services' 'management' 'retired' 'blue-collar'
 'unemployed' 'entrepreneur' 'housemaid' 'unknown' 'self-employed'
 'student']

Valores únicos en 'marital': ['married' 'single' 'divorced' nan]

Valores únicos en 'education': ['secondary' 'tertiary' 'primary' 'unknown' nan]

Valores únicos en 'default': ['no' 'yes']

Valores únicos en 'housing': ['yes' 'no']

Valores únicos en 'loan': ['no' 'yes']

Valores únicos en 'contact': ['unknown' 'cellular' 'telephone']

Valores únicos en 'poutcome': ['unknown' 'other' 'failure' 'success']

Valores únicos en 'deposit': ['yes' 'no']



In [78]:
age_mean = bank.groupby(['job', 'marital', 'education'])['age'].mean().reset_index(name='mean_age')
print(age_mean)
# Calcular la moda (la categoría más frecuente) de 'marital' para cada combinación
marital_mode = bank.groupby(['job', 'age', 'education'])['marital'].agg(lambda x: x.mode()[0] if not x.mode().empty else None).reset_index(name='mode_marital')
print(marital_mode)
# Calcular la moda de 'education' para cada combinación
education_mode = bank.groupby(['job', 'age', 'marital'])['education'].agg(lambda x: x.mode()[0] if not x.mode().empty else None).reset_index(name='mode_education')
print(education_mode)

         job   marital  education  mean_age
0     admin.  divorced    primary  0.106142
1     admin.  divorced  secondary  0.164997
2     admin.  divorced   tertiary -0.355484
3     admin.  divorced    unknown  0.735632
4     admin.   married    primary  0.707655
..       ...       ...        ...       ...
127  unknown   married   tertiary  0.685273
128  unknown   married    unknown  1.077356
129  unknown    single  secondary -0.621269
130  unknown    single   tertiary -0.250569
131  unknown    single    unknown -0.308856

[132 rows x 4 columns]
          job       age  education mode_marital
0      admin. -1.782329  secondary       single
1      admin. -1.698397  secondary       single
2      admin. -1.614465  secondary       single
3      admin. -1.614465   tertiary       single
4      admin. -1.530533  secondary       single
...       ...       ...        ...          ...
1370  unknown  1.574953    unknown      married
1371  unknown  1.994613    unknown      married
1372  unknown  2

In [79]:
# Imputar la media para 'age'
for index, row in bank[bank['age'].isnull()].iterrows():
    mean_age = age_mean.loc[
        (age_mean['job'] == row['job']) &
        (age_mean['marital'] == row['marital']) &
        (age_mean['education'] == row['education']) , 'mean_age'
    ]

    if not mean_age.empty:
        bank.at[index, 'age'] = mean_age.values[0]

# Imputar la moda para 'education'
for index, row in bank[bank['education'].isnull()].iterrows():
    mode_education = education_mode.loc[
        (education_mode['job'] == row['job']) &
        (education_mode['age'] == row['age']) &
        (education_mode['marital'] == row['marital']), 'mode_education'
    ]

    if not mode_education.empty:
        bank.at[index, 'education'] = mode_education.values[0]

# Imputar la moda para 'marital'
for index, row in bank[bank['marital'].isnull()].iterrows():
    mode_marital = marital_mode.loc[
        (marital_mode['job'] == row['job']) &
        (marital_mode['age'] == row['age']) &
        (marital_mode['education'] == row['education']), 'mode_marital'
    ]

    if not mode_marital.empty:
        bank.at[index, 'marital'] = mode_marital.values[0]


#Division del conjunto de datos

In [81]:
# Definir X e Y
X = bank.drop('deposit', axis=1)  # Todas las columnas excepto 'deposit'
Y = bank['deposit']  # La variable objetivo 'deposit'

# Dividir los datos en conjuntos de entrenamiento y prueba (80% train, 20% test)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Verificar el tamaño de cada conjunto
print(f"Tamaño del conjunto de entrenamiento (X): {X_train.shape}")
print(f"Tamaño del conjunto de prueba (X): {X_test.shape}")
print(f"Tamaño del conjunto de entrenamiento (Y): {Y_train.shape}")
print(f"Tamaño del conjunto de prueba (Y): {Y_test.shape}")

Tamaño del conjunto de entrenamiento (X): (8929, 16)
Tamaño del conjunto de prueba (X): (2233, 16)
Tamaño del conjunto de entrenamiento (Y): (8929,)
Tamaño del conjunto de prueba (Y): (2233,)


#Codificación variables categoricas

In [82]:
#Columnas Binarias
binary_columns = ['deposit', 'default', 'housing', 'loan']

# Aplicar Label Encoding a las columnas binarias
label_encoder = LabelEncoder()

for column in binary_columns:
    bank[column] = label_encoder.fit_transform(bank[column])
    print(f"Valores únicos en '{column}' después de Label Encoding: {bank[column].unique()}\n")

# Ver el dataframe codificado
print(bank.head())

Valores únicos en 'deposit' después de Label Encoding: [1 0]

Valores únicos en 'default' después de Label Encoding: [0 1]

Valores únicos en 'housing' después de Label Encoding: [1 0]

Valores únicos en 'loan' después de Label Encoding: [0 1]

        age         job  marital  education  default   balance  housing  loan  \
0  1.491021      admin.  married  secondary        0  0.252525        1     0   
1  1.239224      admin.  married  secondary        0 -0.459974        0     0   
2 -0.019756  technician  married  secondary        0 -0.080160        1     0   
3  1.155292    services  married  secondary        0  0.293762        1     0   
4  1.071360      admin.  married   tertiary        0 -0.416876        0     0   

   contact  day month  duration  campaign     pdays  previous poutcome  \
0  unknown    5   may  1.930226 -0.554168 -0.481184         0  unknown   
1  unknown    5   may  3.154612 -0.554168 -0.481184         0  unknown   
2  unknown    5   may  2.929901 -0.554168 -0.4

In [83]:
#cat_features = ["job", "marital", "default", "housing", "loan"]

#data = pd.get_dummies(bank, columns = cat_features)
#data

# Initializing the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Fitting the encoder and transforming the data
one_hot_encoded_array = encoder.fit_transform(bank[["poutcome", "marital", "education", "contact"]])

# The transformed data is an array, so we need to convert it back to a DataFrame
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_array, columns=encoder.get_feature_names_out(["poutcome", "marital", "education", "contact"]))

# Concatenating the one-hot encoded columns to the original DataFrame
final_df = pd.concat([bank, one_hot_encoded_df], axis=1).drop(["poutcome", "marital", "education", "contact"], axis=1)

print("\nOne-hot encoded data:\n", final_df)


One-hot encoded data:
             age          job  default   balance  housing  loan  day month  \
0      1.491021       admin.        0  0.252525        1     0    5   may   
1      1.239224       admin.        0 -0.459974        0     0    5   may   
2     -0.019756   technician        0 -0.080160        1     0    5   may   
3      1.155292     services        0  0.293762        1     0    5   may   
4      1.071360       admin.        0 -0.416876        0     0    5   may   
...         ...          ...      ...       ...      ...   ...  ...   ...   
11157 -0.691213  blue-collar        0 -0.473616        1     0   20   apr   
11158 -0.187620     services        0 -0.246658        0     0   16   jun   
11159 -0.775145   technician        0 -0.464934        0     0   19   aug   
11160  0.148108   technician        0 -0.473926        0     1    8   may   
11161 -0.607281   technician        0 -0.473926        0     0    9   jul   

       duration  campaign  ...  marital_married  ma

In [84]:
final_df.columns

Index(['age', 'job', 'default', 'balance', 'housing', 'loan', 'day', 'month',
       'duration', 'campaign', 'pdays', 'previous', 'deposit',
       'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown', 'marital_divorced', 'marital_married',
       'marital_single', 'marital_None', 'education_primary',
       'education_secondary', 'education_tertiary', 'education_unknown',
       'contact_cellular', 'contact_telephone', 'contact_unknown'],
      dtype='object')

#Estandarizacion de datos

In [85]:
from sklearn.preprocessing import StandardScaler

# Seleccionar las columnas numéricas
numerical_columns = ['age', 'balance', 'duration', 'pdays', 'campaign']

# Aplicar StandardScaler solo a esas columnas
scaler = StandardScaler()
final_df[numerical_columns] = scaler.fit_transform(final_df[numerical_columns])

# Ver el resultado
print(final_df.head())


        age         job  default   balance  housing  loan  day month  \
0  1.491692      admin.        0  0.252525        1     0    5   may   
1  1.239803      admin.        0 -0.459974        0     0    5   may   
2 -0.019646  technician        0 -0.080160        1     0    5   may   
3  1.155839    services        0  0.293762        1     0    5   may   
4  1.071876      admin.        0 -0.416876        0     0    5   may   

   duration  campaign  ...  marital_married  marital_single  marital_None  \
0  1.930226 -0.554168  ...              1.0             0.0           0.0   
1  3.154612 -0.554168  ...              1.0             0.0           0.0   
2  2.929901 -0.554168  ...              1.0             0.0           0.0   
3  0.596366 -0.554168  ...              1.0             0.0           0.0   
4  0.867171 -0.186785  ...              1.0             0.0           0.0   

   education_primary  education_secondary  education_tertiary  \
0                0.0                  1

In [86]:
final_df.columns

Index(['age', 'job', 'default', 'balance', 'housing', 'loan', 'day', 'month',
       'duration', 'campaign', 'pdays', 'previous', 'deposit',
       'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown', 'marital_divorced', 'marital_married',
       'marital_single', 'marital_None', 'education_primary',
       'education_secondary', 'education_tertiary', 'education_unknown',
       'contact_cellular', 'contact_telephone', 'contact_unknown'],
      dtype='object')

#Reducción de dimensionalidad

In [91]:
from sklearn.decomposition import PCA

# Aplicar PCA para reducir a 2 componentes principales
pca = PCA(n_components=0.9)
pca_data = pca.fit_transform(final_df[numerical_columns])
pca_data.shape


(11162, 5)