In [57]:
import pickle
from functools import partial

import numpy as np
import pandas
from sklearn import model_selection
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# How many feature columns SelectKBest keeps; the same value is used later to
# slice the predictor columns (and locate the label column) in the training array.
NUMBER_OF_FEATURES = 8

In [54]:
# Location of the raw dataset, resolved relative to the notebook's working directory.
url = "mexico_covid19.csv"

# Column labels applied to the CSV, in file order; the label column comes last.
names = [
    "Gender",
    "Neumonia",
    "Age",
    "Pregnant",
    "Diabetes",
    "Astma",
    "ImmunoSuppressed",
    "Hypertension",
    "Cardiovascular",
    "Obese",
    "Smoking",
    "PatientOutcome",
]

# Every column is read as a plain int except Pregnant, which uses pandas'
# nullable Int64 dtype so that missing entries can be represented.
dtypes = {column: 'int' for column in names}
dtypes['Pregnant'] = 'Int64'

# Only the literal string "NA" is parsed as a missing value (pandas' default NA
# markers are switched off).
# NOTE(review): combined with explicit `names`, header=1 makes pandas discard
# file rows 0 and 1 before reading data — confirm the CSV really has two
# leading non-data rows, otherwise the first record is silently dropped.
data = pandas.read_csv(
    url,
    names=names,
    dtype=dtypes,
    header=1,
    keep_default_na=False,
    na_values=['NA'],
)


In [55]:
# Pregnant is the only column loaded with a nullable dtype; replace its missing
# entries with the column median so downstream estimators see no gaps.
imr = SimpleImputer(strategy='median', missing_values=np.nan)
pregnant_imputed = imr.fit_transform(data[['Pregnant']])
# The imputer returns a 2-D (n_rows, 1) array; flatten it back into a column.
data['Pregnant'] = pregnant_imputed.ravel()

In [58]:
# Split the frame into the label column and the candidate feature columns.
y = data['PatientOutcome']
X = data.drop('PatientOutcome', axis=1)

# PatientOutcome is a discrete class label (it is the target of the
# LogisticRegression fitted later), so score features with the classification
# variant of mutual information rather than the regression one.
# The estimator perturbs continuous features with random noise, so pin its
# random_state (same value as the train/test split seed) to make the selected
# feature set reproducible across "Restart & Run All".
score_func = partial(mutual_info_classif, random_state=7)

selector = SelectKBest(score_func, k=NUMBER_OF_FEATURES)
selector.fit(X, y)

# Index the feature frame's own columns with the support mask, then append the
# label explicitly. This keeps the label as the last column (which the
# positional slicing in the training cell relies on) without assuming where
# PatientOutcome sits in `data`.
columns = X.columns[selector.get_support()].append(pandas.Index(['PatientOutcome']))
columns

Index(['Neumonia', 'Age', 'Pregnant', 'Diabetes', 'Astma', 'ImmunoSuppressed',
       'Hypertension', 'Obese', 'PatientOutcome'],
      dtype='object')

In [59]:
# Materialise the selected columns (features first, label last) as a NumPy array.
array = data[columns].values

# The first NUMBER_OF_FEATURES columns are the predictors; the column right
# after them is the label.
x, y = array[:, :NUMBER_OF_FEATURES], array[:, NUMBER_OF_FEATURES]

# Hold out a third of the rows; the fixed seed keeps the split repeatable.
test_size, seed = 0.33, 7

x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x,
    y,
    random_state=seed,
    test_size=test_size,
)

# Fit a logistic-regression classifier with default hyper-parameters on the
# training portion only.
model = LogisticRegression()
model.fit(x_train, y_train)

LogisticRegression()

In [60]:
# Persist the fitted model to disk.
filename = 'covid19_model_v1.sav'

# Use a context manager so the file handle is flushed and closed even if
# pickling raises; a bare open() inside the call leaves the handle dangling.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)