In [195]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

# Carga de Librerías

In [196]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import joblib

In [197]:
# Random seed; presumably meant for the stochastic steps (data split, model) so runs are repeatable.
SEED = 42
# Name of the label column in the dataset.
TARGET = 'y'

# Lectura de Archivo

In [198]:
# Load the raw dataset; the file uses ';' as the field separator, not a comma.
df = pd.read_csv('../Data/Input/bank-full.csv', sep=';')

In [199]:
# Preview the first rows to check that the file was parsed into the expected columns.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [200]:
# Preview the last rows to check that the end of the file was read cleanly.
df.tail()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no
45210,37,entrepreneur,married,secondary,no,2971,no,no,cellular,17,nov,361,2,188,11,other,no


# Información del dataset

- Aparentemente no hay valores nulos o vacíos.
- Las variables categóricas se encuentran con tipo de dato object, por lo tanto es necesario transformarlas a tipo categoría.
- Se presume que el nombre de las variables es suficientemente descriptivo, por lo tanto no se lo cambiará.

In [201]:
# Show each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


# Descripción de los valores del dataset


In [202]:
# Summary statistics of the numeric columns, transposed so each column becomes a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,45211.0,40.93621,10.618762,18.0,33.0,39.0,48.0,95.0
balance,45211.0,1362.272058,3044.765829,-8019.0,72.0,448.0,1428.0,102127.0
day,45211.0,15.806419,8.322476,1.0,8.0,16.0,21.0,31.0
duration,45211.0,258.16308,257.527812,0.0,103.0,180.0,319.0,4918.0
campaign,45211.0,2.763841,3.098021,1.0,1.0,2.0,3.0,63.0
pdays,45211.0,40.197828,100.128746,-1.0,-1.0,-1.0,-1.0,871.0
previous,45211.0,0.580323,2.303441,0.0,0.0,0.0,0.0,275.0


In [203]:
# Count the missing values in every column.
df.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [204]:
def is_numeric(col_type):
    """Return True when ``col_type`` denotes an integer or floating-point dtype.

    Parameters
    ----------
    col_type : str, numpy.dtype or pandas extension dtype
        A dtype or dtype name such as ``'int'``, ``'float64'`` or
        ``df['age'].dtype``.

    Returns
    -------
    bool
        True for integer and float dtypes; False for anything else,
        including strings that do not name a dtype.
    """
    # The previous check used ``col_type in ('int')``. ``('int')`` is a plain
    # string, not a tuple, so ``in`` performed substring matching: 'loa' was
    # "numeric", 'int64' was not, and dtype objects raised TypeError.
    dtype_checks = pd.api.types
    return bool(
        dtype_checks.is_integer_dtype(col_type)
        or dtype_checks.is_float_dtype(col_type)
    )

def is_categorical(col_type):
    """Return True when ``col_type`` denotes the pandas categorical dtype.

    Parameters
    ----------
    col_type : str or dtype object
        Either the string ``'category'`` or a dtype such as
        ``series.dtype``.

    Returns
    -------
    bool
        True only for the categorical dtype or its exact name.
    """
    # The previous check used ``col_type in ('category')``, a substring test
    # against a plain string: 'y' and 'cat' matched, and dtype objects raised
    # TypeError. Compare against the exact dtype name instead.
    return isinstance(col_type, pd.CategoricalDtype) or str(col_type) == 'category'

In [205]:
def to_categorical(cols, data):
    """Replace each listed column of ``data`` with its integer category codes.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    cols : iterable of column labels
        Columns to encode.
    data : pandas.DataFrame
        Frame whose columns are overwritten.
    """
    for name in cols:
        as_category = data[name].astype('category')
        # Keep only the integer code of each value; the labels are discarded.
        data[name] = as_category.cat.codes

        
def impute_null(data):
    """Fill missing values in ``data`` in place, column by column.

    Integer and float columns are filled with the column median. Object,
    string and categorical columns are filled with the most frequent value.
    Columns without missing values, and columns of any other dtype, are left
    untouched. Nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute; its columns are overwritten.
    """
    dtype_checks = pd.api.types
    for col in data.columns:
        series = data[col]
        if not series.isna().any():
            continue  # nothing to impute in this column

        # Decide on the column's dtype. The previous version passed the column
        # *name* to the type checks, so real columns were never imputed.
        dtype = series.dtype
        if dtype_checks.is_integer_dtype(dtype) or dtype_checks.is_float_dtype(dtype):
            data[col] = series.fillna(series.median())

        elif (isinstance(dtype, pd.CategoricalDtype)
              or dtype_checks.is_object_dtype(dtype)
              or dtype_checks.is_string_dtype(dtype)):
            # Fill with the most frequent *value*. ``value_counts()[0]`` is a
            # count looked up by label/position, not the mode.
            modes = series.mode(dropna=True)
            if not modes.empty:  # an all-null column has no mode; leave it as is
                data[col] = series.fillna(modes.iloc[0])

In [206]:
# Names of the text (object-dtype) columns, which need to be encoded as integers.
object_columns = df.select_dtypes(include='object')
cat_cols = object_columns.columns

# NOTE: encoding runs before imputation, and `.cat.codes` maps a missing value
# to -1. Missing entries in these columns are therefore no longer null when
# impute_null() runs.
to_categorical(cols=cat_cols, data=df)
impute_null(df)

In [207]:
# Separate the label column from the features, using the notebook-level TARGET constant.
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Hold out 33% of the rows for testing; seeding with SEED keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=SEED
)

In [208]:
# Seed the forest: without random_state every run trains a different model
# and reports a slightly different score.
model = RandomForestClassifier(random_state=SEED)
model.fit(X_train, y_train)

# Accuracy on the held-out test split.
y_hat = model.predict(X_test)
test_score = accuracy_score(y_test, y_hat)
print(f'Test: {test_score}')

Test: 0.9041554959785523


In [209]:
# Persist the fitted model with joblib. Passing the path (rather than an
# open file handle) lets joblib open and close the file itself, so no handle
# is leaked. The unused in-memory `pickle.dumps(model)` call was dropped:
# `pickle` is never imported in this notebook, so it failed on a fresh kernel.
filename = '../Model/model_rfc.sav'
joblib.dump(model, filename);

In [210]:
# Reload the persisted model and score *its* predictions. Previously the score
# was recomputed from the in-memory model's y_hat, so the round trip was never
# exercised.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
loaded_model = joblib.load(filename)
y_hat_loaded = loaded_model.predict(X_test)

# The reloaded model must reproduce the original predictions exactly.
assert (y_hat_loaded == y_hat).all(), 'reloaded model predictions differ from the original'

test_score = accuracy_score(y_test, y_hat_loaded)
print(f'Test Again: {test_score}')

Test Again: 0.9041554959785523


In [211]:
# NOTE(review): leftover interactive help lookup. `pickle` is not imported by the
# notebook's import cell, so this only resolves with leftover kernel state.
# Consider removing this cell from the final notebook.
pickle.dumps?

[1;31mSignature:[0m [0mpickle[0m[1;33m.[0m[0mdumps[0m[1;33m([0m[0mobj[0m[1;33m,[0m [0mprotocol[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [1;33m*[0m[1;33m,[0m [0mfix_imports[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m [0mbuffer_callback[0m[1;33m=[0m[1;32mNone[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Return the pickled representation of the object as a bytes object.

The optional *protocol* argument tells the pickler to use the given
protocol; supported protocols are 0, 1, 2, 3, 4 and 5.  The default
protocol is 4. It was introduced in Python 3.4, and is incompatible
with previous versions.

Specifying a negative protocol version selects the highest protocol
version supported.  The higher the protocol used, the more recent the
version of Python needed to read the pickle produced.

If *fix_imports* is True and *protocol* is less than 3, pickle will
try to map the new Python 3 names to the old module names used in
Python 2, so that the pickle