# Machine Learning in Production

## Persisting Objects in Disc

In [2]:
import pickle
import joblib
import numpy as np
import _pickle as cPickle

### Creation of object from the top of thr list

In [4]:
lista = [1500,1234,567,14]

### File creation in disc

In [5]:
arquivo = open('lista.pkl','wb')

### Persisting object in disc

In [7]:
pickle.dump(lista,arquivo)

In [8]:
arquivo.close()

In [9]:
!ls

'ls' nÆo ‚ reconhecido como um comando interno
ou externo, um programa oper vel ou um arquivo em lotes.


### Loading the object from a disc to memory

In [10]:
arquivo2 = open('lista.pkl','rb')

In [11]:
lista2 = pickle.load(arquivo2)

In [12]:
arquivo2.close()

In [13]:
lista2

[1500, 1234, 567, 14]

## Persisting objects using Numpy

### Definning the array

In [14]:
x = np.arange(10)

In [15]:
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

### Persisting array in disc

In [16]:
np.save('array',x)

### Listing files in disc

In [17]:
!ls

'ls' nÆo ‚ reconhecido como um comando interno
ou externo, um programa oper vel ou um arquivo em lotes.


### Loading the object from disc to the memory

In [18]:
x2 = np.load('array.npy')

In [19]:
x2

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

### Persisting the list previously created in disc

In [21]:
np.save('lista2.npy',lista2)

### Loading the object from the disc to the memory

In [22]:
lista3 = np.load('lista2.npy')

In [23]:
lista3

array([1500, 1234,  567,   14])

### Verifying the type of object created

In [24]:
type(lista3)

numpy.ndarray

## Persisting objects using Joblib

### Persisting the created list previously in disc

In [25]:
joblib.dump(lista3,'lista3.joblib')

['lista3.joblib']

### Persisting created list previously in disc using compression

In [27]:
joblib.dump(lista3,'lista3.gz',compress=True)

['lista3.gz']

### Loading the object from the disc to memory

In [31]:
lista4 = joblib.load('lista3.gz')

In [32]:
lista4

array([1500, 1234,  567,   14])

# Persisting the Machine Learnin model to disc

In [39]:
# Libs

import os
import re
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

## Carregando a base de dados

In [42]:
data = pd.read_csv(r"C:\Users\marcf\OneDrive\Documentos\Ciencia de dados\Machine-Learning-em-producao\notebook-dataset\loan.csv")

In [43]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [45]:
data.Loan_Status.value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [46]:
data2 = data[data.Loan_Status=='Y'].sample(200)

In [47]:
data = data2.append(data[data.Loan_Status=='N'].sample(192))

In [48]:
data.Loan_Status.value_counts()

Y    200
N    192
Name: Loan_Status, dtype: int64

### Checking missing values

In [50]:
data2.isnull().sum()

Loan_ID               0
Gender                4
Married               1
Dependents            5
Education             0
Self_Employed        11
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            2
Loan_Amount_Term      5
Credit_History       23
Property_Area         0
Loan_Status           0
dtype: int64

### Preechendo Missing Values:

- `Dependents`: Assumindo o valor majoritário da coluna.
- `Self_Employed`: Assumindo o valor majoritário da coluna.
- `Loan_Amount_Term`: Preenchendo com o valor médio da coluna.
- `Credit_History`: Assumindo o valor majoritário da coluna.
- `Married`: Assumindo o valor majoritário da coluna.
- `Gender`: Assumindo o valor majoritário da coluna.

In [52]:
data['Gender'] = data['Gender'].fillna('Male')

In [53]:
data['Married'] = data['Married'].fillna('No')

In [54]:
data['Dependents'] = data['Dependents'].fillna('0')

In [55]:
data['Self_Employed'] = data['Self_Employed'].fillna('No')

In [56]:
data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].mean())

In [57]:
data['Credit_History'] = data['Credit_History'].fillna(1.0)

In [58]:
data['Loan_Amount_Term'] = data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].mean())

In [59]:
data.Credit_History.value_counts()

1.0    309
0.0     83
Name: Credit_History, dtype: int64

### Checking missing values again

In [60]:
data.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

### Trnasforming categoric data

#### Várias colunas do dataframe são categóricas, precisamos transforma-las, são elas: `Gender`, `Married`, `Education`, `Self_Employed` & `Property_Area` columns.

In [62]:
from sklearn.preprocessing import LabelEncoder

In [63]:
gender_values = {'Female' : 0, 'Male' : 1} 
married_values = {'No' : 0, 'Yes' : 1}
education_values = {'Graduate' : 0, 'Not Graduate' : 1}
employed_values = {'No' : 0, 'Yes' : 1}
dependent_values = {'3+': 3, '0': 0, '2': 2, '1': 1}
loan_values = {'Y':1,'N':0}
data.replace({'Gender': gender_values,
                 'Married': married_values, 
                 'Education': education_values,
                 'Self_Employed': employed_values, 
                 'Dependents': dependent_values,
                 'Loan_Status': loan_values
                }, inplace=True)

In [64]:
data.drop(['Loan_ID','CoapplicantIncome','Loan_Amount_Term','Credit_History','Property_Area'],axis=1,inplace=True)

In [65]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Status
611,1,1,1,0,0,8072,253.0,1
198,0,1,0,0,0,3416,113.0,1
159,1,1,0,0,0,4583,255.0,1
525,1,1,2,0,1,17500,400.0,1
335,1,1,0,0,1,5503,70.0,1


In [67]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

In [68]:
pipe_random_forest = Pipeline([
        ('scl', StandardScaler()),
        
        ('clf', RandomForestClassifier())
        ]
)

In [69]:
pipe_svm = Pipeline([
        ('scl', StandardScaler()),
        ('clf', svm.SVC())
        ]
)

In [70]:
pipe_knn = Pipeline([
        ('scl', StandardScaler()),
        ('clf', KNeighborsClassifier())
        ]
)

### Grid Values

In [71]:
valores = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [72]:
grid_params_rf = [{
    'clf__criterion': ['gini', 'entropy'],
    'clf__min_samples_leaf': valores,
    'clf__max_depth': valores,
    'clf__min_samples_split': valores[1:]
}]

In [73]:
grid_params_svm = [{
    'clf__kernel': ['linear', 'rbf'], 
    'clf__C': valores
}]

In [74]:
grid_params_knn = [{
    'clf__n_neighbors': valores,
}]

### Building GridSearch

In [75]:
gs_rf = GridSearchCV(
    estimator=pipe_random_forest,
    param_grid=grid_params_rf,
    scoring='accuracy',
    cv=10 
)

In [76]:
gs_svm = GridSearchCV(
    estimator=pipe_svm,
    param_grid=grid_params_svm,
    scoring='accuracy',
    cv=10,
)

In [77]:
gs_knn = GridSearchCV(
    estimator=pipe_knn,
    param_grid=grid_params_knn,
    scoring='accuracy',
    cv=10,
)

In [78]:
X_train = data.drop('Loan_Status',axis=1)
y = data['Loan_Status']

#### Assigning GridSearch to Random Forest

In [79]:
gs_rf.fit(X_train,y)

##### Best parameters and scoring

In [82]:
print('Melhores parâmetros: %s' % gs_rf.best_params_)
print('Melhores acurácia: %.3f' % gs_rf.best_score_)

Melhores parâmetros: {'clf__criterion': 'gini', 'clf__max_depth': 1, 'clf__min_samples_leaf': 8, 'clf__min_samples_split': 8}
Melhores acurácia: 0.571


#### Assigning GridSearch to SVM

In [83]:
gs_svm.fit(X_train,y)

##### Best parameters and scoring

In [84]:
print('Melhores parâmetros: %s' % gs_svm.best_params_)
print('Melhores acurácia: %.3f' % gs_svm.best_score_)

Melhores parâmetros: {'clf__C': 1, 'clf__kernel': 'rbf'}
Melhores acurácia: 0.497


### Assigning GridSearch to KNN

In [85]:
gs_knn.fit(X_train,y)

##### Best parameters and scoring

In [86]:
print('Melhores parâmetros: %s' % gs_knn.best_params_)
print('Melhores acurácia: %.3f' % gs_knn.best_score_)

Melhores parâmetros: {'clf__n_neighbors': 9}
Melhores acurácia: 0.502


### Validation Metrics

In [87]:
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [88]:
X_treino, X_teste, y_treino, y_teste = train_test_split(X_train, y)

In [89]:
print(pd.crosstab(y_teste, gs_rf.predict(X_teste), rownames=['Real'], colnames=['Predito'], margins=True), '')

Predito   0   1  All
Real                
0        16  25   41
1        12  45   57
All      28  70   98 


In [90]:
print (metrics.classification_report(y_teste,gs_rf.predict(X_teste)))

              precision    recall  f1-score   support

           0       0.57      0.39      0.46        41
           1       0.64      0.79      0.71        57

    accuracy                           0.62        98
   macro avg       0.61      0.59      0.59        98
weighted avg       0.61      0.62      0.61        98



## Persisting the best model in disc

In [91]:
joblib.dump(gs_rf,'model.pkl')

['model.pkl']

## Loading the model from the disc to memory

In [92]:
model = joblib.load('model.pkl')