In [None]:
import numpy as np
import pandas as pd 
import os 
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# Load the dataset from the Kaggle input directory into `covid` and
# print its shape so the size of the raw data is visible.
covid = pd.read_csv('/kaggle/input/covid19-dataset/Covid Data.csv')
print('Data Shape :', covid.shape)

* sex: 1 for female and 2 for male.
* age: of the patient.
* classification: covid test findings. Values 1-3 mean that the patient was diagnosed with covid in different degrees. 4 or higher means that the patient is not a carrier of covid or that the test is inconclusive.
* patient type: type of care the patient received in the unit. 1 for returned home and 2 for hospitalization.
* pneumonia: whether the patient already has air sacs inflammation or not.
* pregnancy: whether the patient is pregnant or not.
* diabetes: whether the patient has diabetes or not.
* copd: Indicates whether the patient has Chronic obstructive pulmonary disease or not.
* asthma: whether the patient has asthma or not.
* inmsupr: whether the patient is immunosuppressed or not.
* hypertension: whether the patient has hypertension or not.
* cardiovascular: whether the patient has heart or blood vessels related disease.
* renal chronic: whether the patient has chronic renal disease or not.
* other disease: whether the patient has other disease or not.
* obesity: whether the patient is obese or not.
* tobacco: whether the patient is a tobacco user.
* usmr: indicates whether the patient was treated in medical units of the first, second or third level.
* medical unit: type of institution of the National Health System that provided the care.
* intubed: whether the patient was connected to the ventilator.
* icu: Indicates whether the patient had been admitted to an Intensive Care Unit.
* date died: the date of death if the patient died, and 9999-99-99 otherwise.

In [None]:
# Preview the first rows of the raw frame.
covid.head()

In [None]:
# Summarise the frame's columns (names, dtypes, non-null counts).
covid.info()

In [None]:
# Descriptive statistics for the frame's columns.
covid.describe()

**Number of Unique Values by column**

In [None]:
# Report how many distinct values each column holds (NaN, if present, counts once).
for column_name in covid.columns:
    distinct_count = covid[column_name].nunique(dropna=False)
    print(f'{column_name:<20} => {distinct_count:>10}')

In [None]:
# Codes 97, 98 and 99 are the values this notebook treats as "missing".
missing_codes = [97, 98, 99]
# AGE is a real age, so 97-99 are legitimate values there, and DATE_DIED is a
# date string; neither column uses the sentinel encoding, so both are skipped
# to avoid reporting real ages as missing.
non_coded_cols = ['AGE', 'DATE_DIED']

print('Percentage of missing values')
for col in covid.columns:
    if col in non_coded_cols:
        continue
    # Mean of the boolean mask is the fraction of rows holding a sentinel code.
    missing_pct = covid[col].isin(missing_codes).mean() * 100
    print('{:<20} => {:>10.2f}%'.format(col, missing_pct))

In [None]:
# Frequency of each code in the TOBACCO column.
covid['TOBACCO'].value_counts()

In [None]:
# Frequency of each code in the INTUBED column.
covid['INTUBED'].value_counts()

In [None]:
# Frequency of each code in the PREGNANT column.
covid['PREGNANT'].value_counts()

In [None]:
# Frequency of each code in the ICU column.
covid['ICU'].value_counts()

* INTUBED, PREGNANT, ICU : have many missing values, indicated by 97,99
* other columns have fewer missing values, indicated by 98

In [None]:
# Frequency of each code in the CLASIFFICATION_FINAL column.
covid['CLASIFFICATION_FINAL'].value_counts()

* ***1-3*** : the patient was **diagnosed with covid** in different degrees
* ***4-7*** : the patient is **not a carrier of covid** or that the **test is inconclusive**

### Data Preprocessing

Get rid of missing values
* except for INTUBED, PREGNANT, ICU columns since they have too many

In [None]:
# List the column names to pick the ones to clean.
covid.columns

In [None]:
# Columns in which only the codes 1 and 2 are kept; a row holding any other
# code in one of these columns is discarded.
cols = ['PNEUMONIA','DIABETES', 'COPD', 'ASTHMA', 'INMSUPR','HIPERTENSION',
        'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY','RENAL_CHRONIC', 'TOBACCO']

# Build a single row mask across all listed columns instead of re-filtering
# (and re-copying) the whole frame once per column.
valid_rows = covid[cols].isin([1, 2]).all(axis=1)

# .copy() makes the result an independent frame, so later column assignments
# on `covid` do not operate on a filtered slice (SettingWithCopyWarning).
covid = covid[valid_rows].copy()

Convert the 'DATE_DIED' column into a binary 'DEATH' column (1 = died, 2 = did not die)

In [None]:
# '9999-99-99' is the placeholder date for patients who did not die:
# those rows get DEATH = 2, every other row gets DEATH = 1.
no_death_date = covid['DATE_DIED'].eq('9999-99-99')
covid['DEATH'] = no_death_date.map({True: 2, False: 1})

In [None]:
# Frequency of each DEATH code, to check the new column.
covid['DEATH'].value_counts()

In [None]:
# Remove INTUBED, ICU and DATE_DIED from the frame.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second execution no longer raises KeyError because
# the columns are already gone.
covid = covid.drop(columns=['INTUBED', 'ICU', 'DATE_DIED'], errors='ignore')

In [None]:
# Show the PREGNANT code distribution separately for each SEX code
# (2 is printed under the 'MALE' heading, 1 under 'FEMALE').
for heading, sex_code in (('MALE', 2), ('FEMALE', 1)):
    pregnant_distribution = covid.loc[covid['SEX'] == sex_code, 'PREGNANT'].value_counts()
    print(heading, pregnant_distribution, sep='\n')

In [None]:
# Recode both PREGNANT codes 97 and 98 to 2 in a single pass.
covid['PREGNANT'] = covid['PREGNANT'].replace([97, 98], 2)

**Check the Dataset after preprocessing :**

In [None]:
# (rows, columns) of the frame after preprocessing.
covid.shape

* DataFrame length reduced from 1,048,575 to 1,025,152

In [None]:
# Distinct-value count per column after preprocessing (NaN, if present, counts once).
for column_name, distinct_count in covid.nunique(dropna=False).items():
    print(f'{column_name:<20} => {int(distinct_count):>10}')

## Data Visualization, EDA

In [None]:
import  matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many rows carry each DEATH code.
plt.figure(figsize=(7,5))
plt.title('Death Distribution', fontsize=18)
# Pass the column with the x= keyword: a positional first argument is
# interpreted as `data` by current seaborn releases, not as the x variable.
ax = sns.countplot(x=covid['DEATH'], palette="ch:start=.2,rot=-.3")
# Label every bar container rather than only the first one, so all bars get
# a count regardless of how seaborn groups the bars into containers.
for container in ax.containers:
    ax.bar_label(container)

In [None]:
# Histogram of AGE (30 bins) with a kernel-density overlay.
plt.title('Age Distribution', fontsize=18)
sns.histplot(data=covid, x='AGE', kde=True, bins=30)

In [None]:
# Box plot of AGE for each DEATH code, split by SEX, using a two-colour palette.
sex_palette = sns.color_palette(["#2f4f4f", "#eedd82"])
sns.boxplot(data=covid, x="DEATH", y="AGE", hue="SEX", palette=sex_palette)
plt.title("Age-Death-Sex", fontsize=18)
plt.legend(loc="best")

In [None]:
# Count of rows per OBESITY code, with one bar per DEATH code.
# Pass the column with the x= keyword: a positional first argument is
# interpreted as `data` by current seaborn releases, not as the x variable.
ax = sns.countplot(x=covid['OBESITY'], hue=covid['DEATH'], palette='ch:start=.2,rot=-.3')
plt.title("Obesity-Death", fontsize=18, color="red")
# One container per hue level; annotate each with its bar counts.
for container in ax.containers:
    ax.bar_label(container)
plt.legend(loc="best");

In [None]:
# Annotated heatmap of the pairwise correlations between the frame's columns.
correlations = covid.corr()
plt.figure(figsize=(18,15))
sns.heatmap(correlations, annot=True, fmt='.2f')
plt.title('Correlation Between Features', fontsize=18)

* Drop columns that don't have a **positive correlation with the DEATH column**

Scale the numeric feature (i.e. AGE)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise AGE and write the scaled values back over the original column.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split performed later in the notebook — confirm this is intended.
standard_scaler = StandardScaler()
age_scaled = standard_scaler.fit_transform(covid[['AGE']])
covid['AGE'] = age_scaled

**Determine X,Y data**

In [None]:
# Features are every column except DEATH; DEATH is the target.
x = covid.drop(columns='DEATH')
y = covid['DEATH']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=42)

# Report the shape of each partition.
for label, part in (('Train X', train_x), ('Test X', test_x),
                    ('Train Y', train_y), ('Test Y', test_y)):
    print(f'{label} :', part.shape)

## Training the Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on the training split and report its
# accuracy on the held-out split.
log_reg = LogisticRegression().fit(train_x, train_y)
log_reg_accuracy = log_reg.score(test_x, test_y)
print("Logistic Regression Accuracy :", log_reg_accuracy)

In [None]:
from sklearn.metrics import f1_score

# F1 score on the held-out split; average=None keeps one score per class.
log_reg_predictions = log_reg.predict(test_x)
log_reg_f1 = f1_score(test_y, log_reg_predictions, average=None)
print("Logistic Regression F1 Score : ", log_reg_f1)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the logistic regression on the held-out split, drawn as an annotated heatmap.
log_reg_cm = confusion_matrix(test_y, log_reg.predict(test_x))
sns.heatmap(log_reg_cm, annot=True, fmt='.0f')
plt.title("Logistic Regression Confusion Matrix", fontsize=18)

In [None]:
# Bar chart of the DEATH class counts, to show how unbalanced the target is.
ax = sns.countplot(x=covid['DEATH'], palette='ch:start=.2,rot=-.3')
# Label every bar container rather than only the first one, so all bars get
# a count regardless of how seaborn groups the bars into containers.
for container in ax.containers:
    ax.bar_label(container)
plt.title('Death Distribution', fontsize=18)

* The dataset is imbalanced
* Use an undersampling method to balance the dataset

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Random undersampler with a fixed seed so the resampling is repeatable.
rand_under = RandomUnderSampler(random_state=0)
# Resample the full feature matrix / target pair (x, y).
x_resampled, y_resampled = rand_under.fit_resample(x,y)

In [None]:
# Bar chart of the DEATH class counts after undersampling.
ax = sns.countplot(x=y_resampled, palette='ch:start=.2,rot=-.3')
# Label every bar container rather than only the first one, so all bars get
# a count regardless of how seaborn groups the bars into containers.
for container in ax.containers:
    ax.bar_label(container)
plt.title("Death Distribution After Resampling", fontsize=16)

### Train the model after undersampling

In [None]:
# Split the ORIGINAL data first, then undersample the training portion only.
# Undersampling before the split would also rebalance the held-out rows, so
# the reported scores would describe an artificial 50/50 population instead
# of the real class distribution. The split uses the same arguments as the
# notebook's first split, so both experiments are scored on the same rows.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=42)
train_x, train_y = rand_under.fit_resample(train_x, train_y)

print('Train X :', train_x.shape)
print('Test X :', test_x.shape)
print('Train Y :', train_y.shape)
print('Test Y :', test_y.shape)

In [None]:
# Refit the existing logistic regression estimator on the current training
# split and report its accuracy on the current test split.
log_reg.fit(train_x, train_y)
log_reg_accuracy = log_reg.score(test_x, test_y)
print("Logistic Regression Accuracy :", log_reg_accuracy)

In [None]:
from sklearn.metrics import f1_score

# F1 score of the refitted model on the test split; average=None keeps one score per class.
log_reg_predictions = log_reg.predict(test_x)
log_reg_f1 = f1_score(test_y, log_reg_predictions, average=None)
print("Logistic Regression F1 Score : ", log_reg_f1)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the refitted logistic regression on the test split, drawn as an annotated heatmap.
log_reg_cm = confusion_matrix(test_y, log_reg.predict(test_x))
sns.heatmap(log_reg_cm, annot=True, fmt='.0f')
plt.title("Logistic Regression Confusion Matrix", fontsize=18)

### Test on other models

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees, trained on the current training split.
# random_state pins the bootstrap / feature sampling so that the scores and
# the confusion matrix are reproducible across kernel restarts.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(train_x, train_y)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the random forest on the test split.
random_forest_predictions = random_forest.predict(test_x)
random_forest_accuracy = accuracy_score(test_y, random_forest_predictions)
print('Random Forest Accuracy Score :', random_forest_accuracy)

In [None]:
# Confusion matrix of the random forest on the test split, drawn as an annotated heatmap.
random_forest_cm = confusion_matrix(test_y, random_forest.predict(test_x))
sns.heatmap(random_forest_cm, annot=True, fmt='.0f')
plt.title("Random Forest Confusion Matrix", fontsize=18)

* Logistic Regression Model seems to have better **sensitivity**
* which is important since we need to find out who is at risk and take medical measures

# Conclusion

* predict patients who are at high risk of death from covid
* uses an undersampling method to address the imbalanced-dataset problem
* tested logistic regression and random forest algorithm
* logistic regression seemed to have better sensitivity than random forest