# Covid Classification Task

## Data Loading and Preparation

In [None]:

import numpy as np
import pandas as pd
import sklearn
import os
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt


### Old Dataset

In [None]:
# Load the original annotated COVID-19 video dataset (features + human annotations).
old_csv_path = 'input/Yawen-Colonoscopy-Covid Data files/covid-19/covid19_complete_feature&annotation.csv'
df_old = pd.read_csv(old_csv_path)
# Preview the first five rows as the cell's rich output.
df_old.head(5)

Unnamed: 0,id,recommendation_expert,recommendation_consumer_1,recommendation_consumer_2,undersitandability_expert,medically_informative_expert,information_accuracy_expert,understandability_consumer_1,medical_information_consumer_1,understandability_consumer_2,...,dislikeRate,commentCount,favoriteCount,channelSubscriberCount,channelViewCount,channelCount,channelVideoCount,max_cos_sim_des,max_cos_sim_title,max_cos_sim_tran
0,0pVEO0bkreA.mp4,1,1,0,1,0,1,1,0,0,...,0.0,0,0,0,1595676,0,250,0.316228,0.353553,0.0
1,UEoWRXJT_4I.mp4,0,0,1,1,0,0,1,0,1,...,0.0,0,0,11600,1290325,0,1411,0.280056,0.333333,0.0
2,1mKITi1D0p0.mp4,1,1,1,1,1,1,1,1,1,...,0.000309,0,0,13200,25348907,0,712,0.0,0.57735,0.069505
3,7tgm8KBlCtE.mp4,1,1,1,1,1,1,1,1,1,...,0.000428,29,0,796,841142,0,1043,0.190693,0.353553,0.080064
4,1Yn0pk22pVM.mp4,1,1,1,1,1,1,1,1,1,...,0.0003,0,0,123000,70006556,0,1523,0.190117,0.377964,0.123797


### New Dataset

In [None]:
# Attach the expert label from the old dataset to each per-video feature JSON
# and collect the labelled records into `df_new`.
feature_dir = 'temp/covid-feature'

# Map "<video id>.mp4" -> expert recommendation label.
video_ids = df_old["id"].tolist()
label = df_old["recommendation_expert"].tolist()
covid_dict = dict(zip(video_ids, label))

data_list = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# df_new (and therefore the seeded train/test split downstream) reproducible.
for filename in sorted(os.listdir(feature_dir)):
    # Feature files are named after the video; the old dataset keys carry a ".mp4" suffix.
    video_id = filename.split(".")[0] + '.mp4'
    if video_id not in covid_dict:
        # No expert label for this video -> cannot be used for supervised training.
        continue
    file_path = os.path.join(feature_dir, filename)
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    data['label'] = covid_dict[video_id]
    data_list.append(data)

# Fail with a clear message instead of an opaque KeyError on the next lines.
assert data_list, f"no labelled feature files found in {feature_dir!r}"

df_new = pd.DataFrame(data_list)
df_new['channel_subscribers'] = df_new['channel_subscribers'].astype(int)
print(df_new.shape)
# Last expression of the cell so the preview is actually rendered.
df_new.head()

## Classification

### Classification Model for old dataset

In [None]:
# Show every column name of the old dataset; the head() preview above elides
# the middle columns, so this is the full list to pick features/labels from.
df_old.columns

In [None]:
# data preparation
# Split the old dataset into training and testing sets.
#
# The previous `drop_list = ['']` made DataFrame.drop raise KeyError (there is
# no column named ''), and would have left the target inside the features.
# Drop the target itself, the video id string, and the stray CSV index column.
# NOTE(review): the remaining annotation columns (the other *_expert and
# *_consumer_* ratings) are still in X and may leak label information;
# confirm which of them should be treated as features.
# NOTE(review): columns hidden behind "..." in the preview are not visible
# here; if any are non-numeric they must also be dropped or encoded.
drop_list = ['Unnamed: 0', 'id', 'recommendation_expert']
X_train, X_test, y_train, y_test = train_test_split(
    df_old.drop(columns=drop_list),
    df_old['recommendation_expert'],
    test_size=0.2,
    random_state=42,
)

### Classification Model for new dataset

In [None]:
# data preparation
# Separate predictors from the target, then hold out 20% of the rows for testing.
new_features = df_new.drop(columns=['id', 'label', 'tags'])
new_target = df_new['label']
X_train, X_test, y_train, y_test = train_test_split(
    new_features,
    new_target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# model training
# Fit three classifiers on the same split and keep their test-set predictions.
# The split above is seeded with 42; the stochastic models are seeded the same
# way so that Restart & Run All reproduces the reported metrics.

# Train the Logistic Regression model
# The features are unscaled counts, so the default 100-iteration budget may
# stop before convergence; allow more iterations.
# NOTE(review): consider standardising the features before this model.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

# Train the Random Forest model
# Bootstrap sampling and feature sub-sampling are random -> fix the seed.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Train the XGBoost model
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

In [None]:
# Text report per model, in the order the models were trained.
model_predictions = [
    ('Logistic Regression', lr_pred),
    ('Random Forest', rf_pred),
    ('XGBoost', xgb_pred),
]
for model_name, predicted in model_predictions:
    print(f'{model_name}:')
    print(classification_report(y_test, predicted))

# One confusion-matrix panel per fitted estimator, side by side.
fitted_models = [
    ('Logistic Regression', lr_model),
    ('Random Forest', rf_model),
    ('XGBoost', xgb_model),
]
fig, axs = plt.subplots(ncols=3, figsize=(15, 5))
for panel, (model_name, estimator) in zip(axs, fitted_models):
    panel.set_title(model_name)
    ConfusionMatrixDisplay.from_estimator(
        estimator, X_test, y_test, ax=panel, cmap=plt.cm.Blues, normalize=None
    )

plt.show()