In [1]:
import pandas as pd
import numpy as np

### Brief Look at Data

In [2]:
# Value encodings used in the dataset:
#   infection_prob:  1 -> above 50%, 0 -> below 50%
#   severity codes: -1 -> little, 0 -> mediocre, 1 -> severe
#   (which symptom columns use the -1/0/1 scale is not stated -- TODO confirm)
data = pd.read_csv(filepath_or_buffer='covid_data.csv')

# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,age,throat_pain,breathing_problem,body_temp,headache,bodyache,infection_prob
0,80,1,0,64,0,-1,0
1,23,0,0,86,1,0,1
2,59,1,0,71,1,1,1
3,57,1,1,66,0,-1,0
4,57,1,-1,83,1,-1,1


### Data for Model

In [3]:
# Split the frame into model inputs and the target. The target is picked by
# name and the inputs are "every column except the target", so the split does
# not depend on infection_prob being the last column of the CSV.
label_column = 'infection_prob'
labels = data[label_column]
features = data.drop(columns=label_column)

# Show the last rows of the feature frame as a quick sanity check.
features.tail()

Unnamed: 0,age,throat_pain,breathing_problem,body_temp,headache,bodyache
2910,69,1,1,88,0,-1
2911,27,0,1,40,1,1
2912,58,1,-1,89,0,0
2913,81,1,-1,88,1,-1
2914,69,0,-1,38,1,-1


### Model Imports and Train/Test Split

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

### Model Training and Prediction

In [6]:
# Fit a logistic-regression classifier with default settings on the training
# split. fit() returns the estimator itself, so `model` is the fitted object.
model = LogisticRegression().fit(X_train, y_train)

# Display the fitted estimator as the cell output.
model

LogisticRegression()

In [7]:
# Mean accuracy on the held-out test split -- only about 50% ;(
model.score(X=X_test, y=y_test)

0.5008576329331046

In [8]:
# Predicted class probabilities for every test row: one row per sample and one
# column per class (column order follows model.classes_).
infection_prob = model.predict_proba(X=X_test)

# Display the probability array as the cell output.
infection_prob

array([[0.47403899, 0.52596101],
       [0.4613585 , 0.5386415 ],
       [0.4844331 , 0.5155669 ],
       ...,
       [0.55360528, 0.44639472],
       [0.48848514, 0.51151486],
       [0.48106599, 0.51893401]])

### Saving the Model using Pickle

In [9]:
import pickle

# Serialize the trained model to disk. The `with` block guarantees the file
# handle is closed even if pickle.dump raises part-way through.
# NOTE: pickle files can execute arbitrary code when loaded -- only unpickle
# this file from a trusted source.
model_name = 'covid_predict.sav'
with open(model_name, 'wb') as file:
    pickle.dump(model, file)