In [58]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [59]:
df_bank = pd.read_csv('bank-full.csv', sep=';')

In [60]:
df_bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [61]:
df_bank.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [62]:
df_bank.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [63]:
df_bank.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

## Transformación de variables

### Categóricas

#### Pocas:

➡ damies ➡️ One hot coder

#### Muchas:

➡ Transformación en frecuencia (en vez del nombre de la categoría sale la frecuencia)
➡ Reagrupar las clases minoritarias residuales.
➡ Ordinal encoding ➡️ LabelEncoder() 

#### Veo cuántas variables tiene la columna Job

In [64]:
df_bank.job.value_counts()

blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: job, dtype: int64

#### Le aplico Label Encoder

In [65]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le?

Es un objeto Label Encoder, hemos instanciado un objeto al lado del encoder. Podemos entrenar a este objeto.

Si le pongo un punto al lado y le doy al tab, me salen las cosas que puedo hacer con el objeto 

le.fit()

Si me meto en el paréntesis y le doy a shif + tab me dice que lo que va a tomar como valor es un array(una columna, una serie)
Queremos transformar la columna de Job, tengo que enseñarle lo es cada profesión de la columna. Veo cuántas categorías tiene la columna:

In [66]:
df_bank.job.unique()

array(['management', 'technician', 'entrepreneur', 'blue-collar',
       'unknown', 'retired', 'admin.', 'services', 'self-employed',
       'unemployed', 'housemaid', 'student'], dtype=object)

Le meto estos valores a la función Label Encoder

In [67]:
le.fit(df_bank.job.unique())

Ya hemos entrenado al objeto, sabe lo que hay dentro de la categoría job. Ahora le meto la variable entera para que me la ordene y le de un valor a cada. 
Estable valores desde 0 a n-1, los ordena alfabéticamente.

In [68]:
le.classes_

array(['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management',
       'retired', 'self-employed', 'services', 'student', 'technician',
       'unemployed', 'unknown'], dtype=object)

Los ha ordenado alfabéticamente

In [69]:
le.transform(df_bank.job)

'''Nos podríamos haber ahorrado separar el fit y el transform'''
#le.fit_transform(df_bank.job)

'Nos podríamos haber ahorrado separar el fit y el transform'

Ahora convierte la variable en números

df_bank.job = le.transform(df_bank.job)
df_bank.job.head()

No lo ejecuto para hacer el One Hot encoder

#### Le aplico One Hot Encoder a Marital

In [70]:
df_bank['marital']

0         married
1          single
2         married
3         married
4          single
           ...   
45206     married
45207    divorced
45208     married
45209     married
45210     married
Name: marital, Length: 45211, dtype: object

In [71]:
from sklearn.preprocessing import OneHotEncoder
#Instancio el objeto
enc = OneHotEncoder(handle_unknown='ignore')
prueba = enc.fit_transform(df_bank['marital'].to_numpy().reshape(-1,1))
enc.categories_
prueba.toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

#### Cuántas categorías hay en cada variable

In [72]:
for i in df_bank.columns:
    print(i)
    print(df_bank[i].value_counts())

age
32    2085
31    1996
33    1972
34    1930
35    1894
      ... 
93       2
90       2
95       2
88       2
94       1
Name: age, Length: 77, dtype: int64
job
blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: job, dtype: int64
marital
married     27214
single      12790
divorced     5207
Name: marital, dtype: int64
education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: education, dtype: int64
default
no     44396
yes      815
Name: default, dtype: int64
balance
 0        3514
 1         195
 2         156
 4         139
 3         134
          ... 
-381         1
 4617        1
 20584       1
 4358        1
 16353       1
Name: balance, Length: 7168, dtype: int64
housing
yes    25130
no     20081
Name: housing, dtype: int64
lo

No entiendo por qué lo hacer así de raro pudiendo hacerlo como aquí abajo

In [73]:
df_bank.nunique()

age            77
job            12
marital         3
education       4
default         2
balance      7168
housing         2
loan            2
contact         3
day            31
month          12
duration     1573
campaign       48
pdays         559
previous       41
poutcome        4
y               2
dtype: int64

#### Convertir las variables categóricas de forma masiva

In [75]:
lista_categorica = ['marital', 'education', 'contact', 'poutcome','month', 'job']

In [76]:
lista_binario = ['y', 'default', 'loan', 'housing']

En vez de hacerlo así raro a mano, voy a hacer un bucle. Esto está mal porque no todas las categóricas son realmente categóricas

lista_categorica = []
lista_binario = []
for i in df_bank.columns:
    if df_bank[i].nunique() <3:
        lista_binario.append(i)
    else:
        lista_categorica.append(i)
        
print(lista_categorica)
print(lista_binaºbrio)
        

In [77]:
for i in lista_categorica:
    df_dummie = pd.get_dummies(df_bank[i], prefix=i)
    df_bank = pd.concat([df_bank, df_dummie], axis=1)


Como Education y Contact tienen una categoría se llaman igual: 'unknown', lo solucionamos con prefix, el prefijo es el nombre de la variable

In [78]:
df_bank

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,...,0,0,1,0,0,0,0,0,0,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,...,0,0,0,0,0,0,0,1,0,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,...,1,0,0,0,0,0,0,0,0,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,...,0,0,0,0,0,0,0,0,0,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,...,0,0,0,0,0,0,0,1,0,0
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,...,0,0,0,1,0,0,0,0,0,0
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,...,0,0,0,1,0,0,0,0,0,0
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,...,0,0,0,0,0,0,0,0,0,0


Seguimos teniendo las categorías originales, las transformamos a numérico

In [79]:
for i in lista_categorica:
    le = LabelEncoder()
    df_bank[i]= le.fit_transform(df_bank[i])
df_bank

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown
0,58,4,1,2,no,2143,yes,no,2,5,...,0,0,1,0,0,0,0,0,0,0
1,44,9,2,1,no,29,yes,no,2,5,...,0,0,0,0,0,0,0,1,0,0
2,33,2,1,1,no,2,yes,yes,2,5,...,1,0,0,0,0,0,0,0,0,0
3,47,1,1,3,no,1506,yes,no,2,5,...,0,0,0,0,0,0,0,0,0,0
4,33,11,2,3,no,1,no,no,2,5,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,no,825,no,no,0,17,...,0,0,0,0,0,0,0,1,0,0
45207,71,5,0,0,no,1729,no,no,0,17,...,0,0,0,1,0,0,0,0,0,0
45208,72,5,1,1,no,5715,no,no,0,17,...,0,0,0,1,0,0,0,0,0,0
45209,57,1,1,1,no,668,no,no,1,17,...,0,0,0,0,0,0,0,0,0,0


Transformamos las variables binarias

In [80]:
for i in lista_binario:
    df_bank[i]= df_bank[i].replace({'no':0, 'yes':1})
df_bank.dtypes

age                    int64
job                    int32
marital                int32
education              int32
default                int64
balance                int64
housing                int64
loan                   int64
contact                int32
day                    int64
month                  int32
duration               int64
campaign               int64
pdays                  int64
previous               int64
poutcome               int32
y                      int64
marital_divorced       uint8
marital_married        uint8
marital_single         uint8
education_primary      uint8
education_secondary    uint8
education_tertiary     uint8
education_unknown      uint8
contact_cellular       uint8
contact_telephone      uint8
contact_unknown        uint8
poutcome_failure       uint8
poutcome_other         uint8
poutcome_success       uint8
poutcome_unknown       uint8
month_apr              uint8
month_aug              uint8
month_dec              uint8
month_feb     

# Partición en Training y Test

In [81]:
X_train, X_test, y_train, y_test = train_test_split(
    df_bank.drop('y', axis=1), df_bank['y'], test_size=0.2, random_state=42, stratify= df_bank['y'])

In [86]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(36168, 54)
(36168,)
(9043, 54)
(9043,)


# Entrenamos

In [87]:
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)

# Comprobación

In [88]:
y_pred = clf.predict(X_test)

Esta es nuestra y de predicción

In [89]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

Compraramos la predicción con la y real

In [90]:
y_test

1392     0
7518     0
12007    0
5536     0
29816    0
        ..
12636    0
13364    0
16102    0
42097    0
31033    0
Name: y, Length: 9043, dtype: int64

Probabilidad de predicción, el umbral de corte es un 0.5 por defecto

In [91]:
clf.predict_proba(X_test)

array([[1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       ...,
       [0.96, 0.04],
       [0.87, 0.13],
       [0.99, 0.01]])

Nos quedamos con la probabilidad de que sea un 1

In [92]:
pd.DataFrame(clf.predict_proba(X_test), columns=['cero','uno'])['uno'].sort_values(ascending=False)

2399    0.96
5545    0.94
4432    0.94
7422    0.93
348     0.92
        ... 
6689    0.00
3406    0.00
6691    0.00
3398    0.00
0       0.00
Name: uno, Length: 9043, dtype: float64