# Predictive model

Juan Pablo Echeagaray González

24/10/2021

## Logistic regression

### Naïve logistic regression 

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from zipfile import ZipFile

In [4]:
with ZipFile('data\\clean_data.zip') as zip:
    with zip.open(zip.namelist()[0]) as data:
        df = pd.read_pickle(data)

In [6]:
df.head()

Unnamed: 0,FECHA_ACTUALIZACION,ID_REGISTRO,ORIGEN,SECTOR,ENTIDAD_UM,SEXO,ENTIDAD_NAC,ENTIDAD_RES,MUNICIPIO_RES,TIPO_PACIENTE,...,RENAL_CRONICA,TABAQUISMO,OTRO_CASO,TOMA_MUESTRA_LAB,RESULTADO_LAB,TOMA_MUESTRA_ANTIGENO,RESULTADO_ANTIGENO,CLASIFICACION_FINAL,MIGRANTE,UCI
1,2021-10-23,z49a69,1,12,23,1,23,23,4,2,...,0,0,1,2,97,2,97,2,99,1
16,2021-10-23,z166d5,1,12,1,1,1,1,1,2,...,0,0,2,1,1,2,97,3,99,2
21,2021-10-23,z388cd,1,12,29,0,15,21,117,2,...,0,0,1,2,97,2,97,1,99,2
22,2021-10-23,z4533d,1,12,29,1,21,21,117,2,...,0,0,2,2,97,2,97,6,99,2
27,2021-10-23,z4494e,2,12,8,1,8,8,37,2,...,0,0,1,1,1,2,97,3,99,2


In [7]:
variables = ['INTUBADO', 'SEXO', 'EDAD', 'NEUMONIA', 'EPOC', 'TABAQUISMO', 'ASMA', 'OBESIDAD', 'DIABETES', 'HIPERTENSION',
            'CARDIOVASCULAR', 'EMBARAZO', 'INMUSUPR', 'RENAL_CRONICA']
variables.remove('INTUBADO')
response = 'INTUBADO'

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df[variables], df[response], test_size=0.3, random_state=42)

In [9]:
model = LogisticRegression(max_iter=1000, class_weight='auto')
model.fit(X_train, y_train)

LogisticRegression(class_weight='auto', max_iter=1000)

In [11]:
predictions = model.predict(X_test)

In [15]:
coef = model.coef_[0]
for i, v in enumerate(variables):
    print(f'{v:<10} {coef[i]:.5f}')

SEXO       -0.26612
EDAD       0.29696
NEUMONIA   1.43352
EPOC       -0.16672
TABAQUISMO -0.02945
ASMA       -0.02886
OBESIDAD   0.33276
DIABETES   0.06015
HIPERTENSION 0.08468
CARDIOVASCULAR 0.03056
EMBARAZO   -0.44855
INMUSUPR   0.16394
RENAL_CRONICA -0.14334


In [17]:
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy * 100:.5f} %')

Accuracy: 89.70302 %


A very naïve approach that generates a model with a precision of nearly 90%, I still want to improve this value.

### Logistic regression with artificial data

In [18]:
from imblearn.over_sampling import SMOTE

In [19]:
variables = ['INTUBADO', 'SEXO', 'EDAD', 'NEUMONIA', 'EPOC', 'TABAQUISMO', 'ASMA', 'OBESIDAD', 'DIABETES', 'HIPERTENSION',
            'CARDIOVASCULAR', 'EMBARAZO', 'INMUSUPR', 'RENAL_CRONICA']
variables.remove('INTUBADO')
response = 'INTUBADO'
X_train, X_test, y_train, y_test = train_test_split(df[variables], df[response], test_size=0.3, random_state=42)

In [20]:
os = SMOTE(random_state=42)
X_train, y_train = os.fit_resample(X_train, y_train)

In [21]:
model_artificial = LogisticRegression(max_iter=1000, class_weight='auto')
model_artificial.fit(X_train, y_train)

LogisticRegression(class_weight='auto', max_iter=1000)

In [22]:
predictions_artificial = model_artificial.predict(X_test)

In [23]:
coef = model_artificial.coef_[0]
for i, v in enumerate(variables):
    print(f'{v:<10} {coef[i]:.5f}')

SEXO       -0.27782
EDAD       0.33420
NEUMONIA   1.43192
EPOC       -0.18461
TABAQUISMO -0.05952
ASMA       -0.11193
OBESIDAD   0.33566
DIABETES   0.06531
HIPERTENSION 0.09745
CARDIOVASCULAR 0.02570
EMBARAZO   -0.56561
INMUSUPR   0.11969
RENAL_CRONICA -0.14332


In [24]:
accuracy = accuracy_score(y_test, predictions_artificial)
print(f'Accuracy: {accuracy * 100:.5f} %')

Accuracy: 52.50436 %


### Logistic regression with hyperparameters

In [29]:
from sklearn.model_selection import GridSearchCV

In [30]:
model_hyp = LogisticRegression()

In [26]:
X_train, X_test, y_train, y_test = train_test_split(df[variables], df[response], test_size=0.3, random_state=42)

In [31]:
param_grid = {'C': [0.01, 0.1, 1, 10, 100, 1000], 
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga'],
            'max_iter': [100, 1000, 2500, 5000]}


In [32]:
clf = GridSearchCV(model_hyp, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [33]:
best_clf = clf.fit(X_train, y_train)

In [35]:
best_clf.best_estimator_

LogisticRegression(C=0.01, penalty='l1', solver='liblinear')

In [36]:
predictions = best_clf.predict(X_test)

In [37]:
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy * 100:.5f} %')

Accuracy: 89.70302 %


In [45]:
coef = best_clf.best_estimator_.coef_[0]
for i, v in enumerate(variables):
    print(f'{v:<10} {coef[i]:.5f}')

SEXO       -0.26409
EDAD       0.23299
NEUMONIA   1.42217
EPOC       -0.11165
TABAQUISMO -0.00766
ASMA       0.00000
OBESIDAD   0.32177
DIABETES   0.05611
HIPERTENSION 0.08494
CARDIOVASCULAR 0.00000
EMBARAZO   -0.18057
INMUSUPR   0.09473
RENAL_CRONICA -0.10664


Pretty similar coefficients, and the same precision. Yet they fail in a lot of cases

#### Results

In [47]:
positives = np.count_nonzero(y_test == 1)
model_positives = np.count_nonzero(predictions == 1)
print(f'Positives: {positives}')
print(f'Model positives: {model_positives}')

Positives: 30574
Model positives: 0


### Logistic regression with hyperparameters and artificial data

In [48]:
model_frankenstein = LogisticRegression()

In [49]:
variables = ['INTUBADO', 'SEXO', 'EDAD', 'NEUMONIA', 'EPOC', 'TABAQUISMO', 'ASMA', 'OBESIDAD', 'DIABETES', 'HIPERTENSION',
            'CARDIOVASCULAR', 'EMBARAZO', 'INMUSUPR', 'RENAL_CRONICA']
variables.remove('INTUBADO')
response = 'INTUBADO'
X_train, X_test, y_train, y_test = train_test_split(df[variables], df[response], test_size=0.3, random_state=42)

In [50]:
os = SMOTE(random_state=42)
X_train, y_train = os.fit_resample(X_train, y_train)

In [51]:
param_grid = {'C': [0.01, 0.1, 1, 10, 100, 1000], 
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga'],
            'max_iter': [100, 1000, 2500, 5000]}

In [52]:
clf = GridSearchCV(model_hyp, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [53]:
best_clf = clf.fit(X_train, y_train)

In [54]:
best_clf.best_estimator_

LogisticRegression(C=0.01, max_iter=1000, penalty='l1', solver='liblinear')

In [55]:
predictions = best_clf.predict(X_test)

In [56]:
coef = best_clf.best_estimator_.coef_[0]
for i, v in enumerate(variables):
    print(f'{v:<10} {coef[i]:.5f}')

SEXO       -0.27741
EDAD       0.32068
NEUMONIA   1.42999
EPOC       -0.17251
TABAQUISMO -0.05428
ASMA       -0.09395
OBESIDAD   0.33293
DIABETES   0.06445
HIPERTENSION 0.09742
CARDIOVASCULAR 0.01791
EMBARAZO   -0.51677
INMUSUPR   0.10375
RENAL_CRONICA -0.13514


In [57]:
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy * 100:.5f} %')

Accuracy: 52.46934 %


I don't really think I should say something. I will have to keep trying