<a href="https://colab.research.google.com/github/Mercymerine/decision_tree/blob/main/Covid_Decision_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing the necessary packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Getting the dataset

In [None]:
# Download the dataset archive from Kaggle with the kaggle CLI.
# The CLI skips the download when a more recently modified local copy of the
# zip already exists (pass --force to download it again).
# NOTE(review): presumably needs Kaggle API credentials configured in the runtime — confirm.
!kaggle datasets download -d kallolnath1/covid-tested-patient-dataset

Dataset URL: https://www.kaggle.com/datasets/kallolnath1/covid-tested-patient-dataset
License(s): unknown
covid-tested-patient-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
import zipfile

# Archive fetched by the Kaggle download step, and the directory to unpack it into.
zipfile_path = '/content/covid-tested-patient-dataset.zip'
extract_dir = '/content'

# Extract to an explicit directory instead of the current working directory, so
# the extracted files land under /content, where the CSV is later read from,
# regardless of where the kernel was started.
with zipfile.ZipFile(zipfile_path, 'r') as file:
  file.extractall(extract_dir)
print('Done')

Done


In [None]:
# Load the extracted CSV into a DataFrame and preview its first rows.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
csv_path = '/content/corona_tested_006.csv'
covid = pd.read_csv(csv_path, low_memory=False)
covid.head()

Unnamed: 0,Ind_ID,Test_date,Cough_symptoms,Fever,Sore_throat,Shortness_of_breath,Headache,Corona,Age_60_above,Sex,Known_contact
0,1,11-03-2020,True,False,True,False,False,negative,,,Abroad
1,2,11-03-2020,False,True,False,False,False,positive,,,Abroad
2,3,11-03-2020,False,True,False,False,False,positive,,,Abroad
3,4,11-03-2020,True,False,False,False,False,negative,,,Abroad
4,5,11-03-2020,True,False,False,False,False,negative,,,Contact with confirmed


In [None]:
# Overview of the frame: number of rows, per-column non-null counts and dtypes
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278848 entries, 0 to 278847
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Ind_ID               278848 non-null  int64 
 1   Test_date            278848 non-null  object
 2   Cough_symptoms       278596 non-null  object
 3   Fever                278596 non-null  object
 4   Sore_throat          278847 non-null  object
 5   Shortness_of_breath  278847 non-null  object
 6   Headache             278847 non-null  object
 7   Corona               278848 non-null  object
 8   Age_60_above         151528 non-null  object
 9   Sex                  259285 non-null  object
 10  Known_contact        278848 non-null  object
dtypes: int64(1), object(10)
memory usage: 23.4+ MB


## Data Cleaning

In [None]:
# Number of distinct values in each column (missing values are not counted)
covid.nunique()

Unnamed: 0,0
Ind_ID,278848
Test_date,51
Cough_symptoms,2
Fever,2
Sore_throat,2
Shortness_of_breath,2
Headache,2
Corona,3
Age_60_above,2
Sex,2


In [None]:
# Share of missing values in each column, expressed as a percentage of all rows
n_rows = len(covid)
covid.isna().sum() / n_rows * 100

Unnamed: 0,0
Ind_ID,0.0
Test_date,0.0
Cough_symptoms,0.090372
Fever,0.090372
Sore_throat,0.000359
Shortness_of_breath,0.000359
Headache,0.000359
Corona,0.0
Age_60_above,45.659284
Sex,7.01565


In [None]:
# Drop the columns that are not used as features:
#   - Age_60_above: roughly 46% of its values are missing
#   - Ind_ID: row identifier (a distinct value for every row)
#   - Test_date: date of the test
columns = ['Age_60_above', 'Ind_ID', 'Test_date']

# Rebind instead of mutating in place, and ignore columns that are already gone,
# so re-running this cell does not raise a KeyError.
covid = covid.drop(columns=columns, errors='ignore')

In [None]:
# Drop every row that still has a missing value in any remaining column.
# The result is rebound to `covid` rather than using inplace=True, which avoids
# hidden in-place mutation of the frame between cells.
covid = covid.dropna()

In [None]:
# Count the remaining missing values per column (all zeros expected after cleaning)
covid.isna().sum()

Unnamed: 0,0
Cough_symptoms,0
Fever,0
Sore_throat,0
Shortness_of_breath,0
Headache,0
Corona,0
Sex,0
Known_contact,0


## Encoding

In [None]:
# List the distinct labels of the target column before encoding them
covid['Corona'].unique()

array(['negative', 'positive', 'other'], dtype=object)

In [None]:
# Label-encode the target column.
# 'other' gets its own code (-1), so the encoded target has three classes.
covid_encoding = {
    'positive': 1,
    'negative': 0,
    'other': -1,
}

# Series.map turns every value that is missing from the dict into NaN, which
# would silently corrupt the target (for example when this cell is re-run on
# values that are already encoded). Fail loudly instead of writing NaNs back.
encoded = covid['Corona'].map(covid_encoding)
unmapped = encoded.isna()
if unmapped.any():
    unexpected = covid.loc[unmapped, 'Corona'].unique()
    raise ValueError(f"Unexpected values in 'Corona': {unexpected!r}")

covid['Corona'] = encoded

# Preview only the first rows instead of dumping the whole frame
covid.head()

Unnamed: 0,Cough_symptoms,Fever,Sore_throat,Shortness_of_breath,Headache,Corona,Sex,Known_contact
13727,True,False,False,False,False,0,male,Abroad
13728,True,False,False,False,False,0,female,Abroad
13730,True,True,False,False,False,1,female,Other
13731,False,False,False,False,False,-1,female,Other
13732,False,False,False,False,False,-1,female,Other
...,...,...,...,...,...,...,...,...
278843,False,False,False,False,False,1,male,Other
278844,False,False,False,False,False,0,female,Other
278845,False,False,False,False,False,0,male,Other
278846,False,False,False,False,False,0,male,Other


In [None]:
# One-hot encode the non-numeric columns with pandas, dropping the first level
# of each so only k-1 indicator columns are kept; the already numeric Corona
# column passes through unchanged.
dummies = pd.get_dummies(covid, drop_first=True)

# Indicator columns are cast to plain integers
covid = dummies.astype(int)
covid

Unnamed: 0,Corona,Cough_symptoms_True,Fever_True,Sore_throat_True,Shortness_of_breath_True,Headache_True,Sex_male,Known_contact_Contact with confirmed,Known_contact_Other
13727,0,1,0,0,0,0,1,0,0
13728,0,1,0,0,0,0,0,0,0
13730,1,1,1,0,0,0,0,0,1
13731,-1,0,0,0,0,0,0,0,1
13732,-1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
278843,1,0,0,0,0,0,1,0,1
278844,0,0,0,0,0,0,0,0,1
278845,0,0,0,0,0,0,1,0,1
278846,0,0,0,0,0,0,1,0,1


## Splitting the dataset into training and test data

In [None]:
from sklearn.model_selection import train_test_split

# Target: the encoded Corona label; features: every other column
target_col = 'Corona'
y = covid[target_col]
X = covid.drop(columns=[target_col])

In [None]:
# Hold out 20% of the rows for testing. The classes are heavily imbalanced, so
# stratify on y to keep the class proportions the same in the training and test
# sets; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Modelling

### Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Decision tree using Gini impurity as the split criterion.
# random_state is fixed so that repeated runs build the same tree.
dt = DecisionTreeClassifier(criterion='gini', random_state=42)


In [None]:
# Fit the decision tree on the training data
dt.fit(X_train, y_train)

# Predict on the test set
y_pred = dt.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)

# Additional performance metrics.
# zero_division=0: a class that is never predicted has an undefined precision;
# report it as 0 explicitly instead of relying on the default, which emits an
# undefined-metric warning for every such class.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))
# Rows are the true classes, columns the predicted classes
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Test Accuracy: 0.9575556370514775
Classification Report:
               precision    recall  f1-score   support

          -1       0.00      0.00      0.00       631
           0       0.97      0.99      0.98     48522
           1       0.76      0.58      0.66      2656

    accuracy                           0.96     51809
   macro avg       0.58      0.52      0.55     51809
weighted avg       0.94      0.96      0.95     51809

Confusion Matrix:
 [[    0   604    27]
 [    0 48070   452]
 [    0  1116  1540]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest using Gini impurity as the split criterion.
# The forest is built from random bootstrap samples and feature subsets, so
# random_state is fixed to make the results reproducible.
rfc = RandomForestClassifier(criterion='gini', random_state=42)


In [None]:
# Fit the random forest on the training data
rfc.fit(X_train, y_train)

# Predict on the test set
y_pred = rfc.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)

# Additional performance metrics.
# zero_division=0: a class that is never predicted has an undefined precision;
# report it as 0 explicitly instead of relying on the default, which emits an
# undefined-metric warning for every such class.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))
# Rows are the true classes, columns the predicted classes
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Test Accuracy: 0.9575556370514775
Classification Report:
               precision    recall  f1-score   support

          -1       0.00      0.00      0.00       631
           0       0.97      0.99      0.98     48522
           1       0.76      0.58      0.66      2656

    accuracy                           0.96     51809
   macro avg       0.58      0.52      0.55     51809
weighted avg       0.94      0.96      0.95     51809

Confusion Matrix:
 [[    0   604    27]
 [    0 48069   453]
 [    0  1115  1541]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
