In [66]:
# import libraries algorithm training
from sklearn.model_selection import cross_validate, train_test_split, KFold, cross_val_score
from sklearn.metrics import classification_report, roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.utils import resample
import pandas as pd
import numpy as np
import joblib

In [32]:
df = pd.read_csv("cleaned_covid.csv")
SEED = 42

In [33]:
# check for duplicates
df = df[df["Result"] != "INDETERMINATE"]
df.duplicated().sum()

np.int64(0)

In [34]:
df.head(2)

Unnamed: 0,Sex,Birth Year,Chest pain,Cough,Diarrhea,Fatigue or general weakness,Fever,Headache,Thorax (sore throat),Nausea,Runny nose,Sore throat or pharyngitis,Vomiting,Loss of Taste,Loss of Smell,Result,age
0,MALE,1967,NO,NO,NO,NO,NO,NO,NO,NO,NO,NO,NO,NO,NO,PENDING,58
1,MALE,1988,NO,NO,NO,NO,YES,NO,NO,NO,NO,NO,NO,NO,NO,NEGATIVE,37


In [35]:
df.columns

Index(['Sex', 'Birth Year', 'Chest pain', 'Cough', 'Diarrhea',
       'Fatigue or general weakness', 'Fever', 'Headache',
       'Thorax (sore throat)', 'Nausea', 'Runny nose',
       'Sore throat or pharyngitis', 'Vomiting', 'Loss of Taste',
       'Loss of Smell', 'Result', 'age'],
      dtype='object')

#### Feature engineering and transformation

In [36]:
def label_encode_columns(df, columns):
    """
    Label encodes the specified categorical columns in a DataFrame without using sklearn.
    
    Args:
    df (pd.DataFrame): The input DataFrame.
    columns (list): List of column names to label encode.
    
    Returns:
    pd.DataFrame: DataFrame with label-encoded columns.
    """
    df = df.copy()  # Avoid modifying the original DataFrame
    for col in columns:
        if col in df.columns:
            df[col] = df[col].astype("category").cat.codes  # Convert to category codes
    return df

In [37]:
# Columns to encode
columns_to_encode = ['Chest pain', 'Cough', 'Diarrhea','Fatigue or general weakness', 'Fever', 'Headache',
                    'Thorax (sore throat)', 'Nausea', 'Runny nose', 'Sore throat or pharyngitis', 'Vomiting',
                    'Loss of Taste', 'Loss of Smell', 'Result']

# Apply function
df_encoded = label_encode_columns(df, columns_to_encode)

In [38]:
df_encoded

Unnamed: 0,Sex,Birth Year,Chest pain,Cough,Diarrhea,Fatigue or general weakness,Fever,Headache,Thorax (sore throat),Nausea,Runny nose,Sore throat or pharyngitis,Vomiting,Loss of Taste,Loss of Smell,Result,age
0,MALE,1967,0,0,0,0,0,0,0,0,0,0,0,0,0,1,58
1,MALE,1988,0,0,0,0,1,0,0,0,0,0,0,0,0,0,37
2,FEMALE,1978,0,1,0,0,0,0,1,0,1,1,0,0,0,0,47
3,MALE,1995,0,1,0,0,0,0,1,0,1,1,0,0,0,0,30
4,MALE,1967,0,0,0,0,1,0,0,0,0,0,0,0,0,0,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2486,FEMALE,1990,0,0,0,1,0,1,0,0,0,0,0,0,0,0,35
2487,MALE,1995,0,1,0,0,1,0,0,0,0,0,0,0,0,0,30
2488,FEMALE,1999,1,0,1,0,0,0,0,0,0,0,1,0,1,0,26
2489,FEMALE,1995,1,0,0,1,0,1,0,0,0,0,0,1,0,0,30


In [39]:
def one_hot_encode_columns(df, columns):
    """
    One-hot encodes the specified categorical columns in a DataFrame without using sklearn.

    Args:
    df (pd.DataFrame): The input DataFrame.
    columns (list): List of column names to one-hot encode.

    Returns:
    pd.DataFrame: DataFrame with one-hot encoded columns.
    """
    df = df.copy()  # Avoid modifying the original DataFrame
    return pd.get_dummies(df, columns=columns, drop_first=False)  # Keep all categories

In [40]:
# Columns to encode
columns_to_encode = ['Sex']

# Apply function
df_encoded = one_hot_encode_columns(df_encoded, columns_to_encode)

In [41]:
df_encoded.head(2)

Unnamed: 0,Birth Year,Chest pain,Cough,Diarrhea,Fatigue or general weakness,Fever,Headache,Thorax (sore throat),Nausea,Runny nose,Sore throat or pharyngitis,Vomiting,Loss of Taste,Loss of Smell,Result,age,Sex_FEMALE,Sex_MALE,Sex_UNKNOWN
0,1967,0,0,0,0,0,0,0,0,0,0,0,0,0,1,58,False,True,False
1,1988,0,0,0,0,1,0,0,0,0,0,0,0,0,0,37,False,True,False


In [42]:
# balancing the imbalance dataset
# Print class distribution
print("Original Class Distribution:\n", df_encoded['Result'].value_counts())

Original Class Distribution:
 Result
0    1320
1     664
2     504
Name: count, dtype: int64


In [43]:
# Separate majority and minority classes
classes = df_encoded['Result'].value_counts().index  # Get unique classes
majority_class = df_encoded['Result'].value_counts().idxmax()  # Find majority class
max_size = df_encoded['Result'].value_counts().max()  # Size of majority class

# Upsample minority classes
df_balanced = pd.concat([
    resample(df_encoded[df_encoded['Result'] == cls], replace=True, n_samples=max_size, random_state=42)
    if cls != majority_class else df_encoded[df_encoded['Result'] == cls] 
    for cls in classes
])

In [44]:
# Shuffle dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Print new class distribution
print("\nBalanced Class Distribution:\n", df_balanced['Result'].value_counts())


Balanced Class Distribution:
 Result
0    1320
1    1320
2    1320
Name: count, dtype: int64


In [45]:
# 
X = df_balanced.drop(["Result", "Birth Year", "Sex_UNKNOWN"], axis = 1)
y = df_balanced["Result"]
y.value_counts()

Result
0    1320
1    1320
2    1320
Name: count, dtype: int64

In [46]:
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = SEED)

In [47]:
# Scale the Data (Normalization)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### Experimentation without cross validation

In [48]:
# initialize the algorithms for training
lrc = LogisticRegression()
rfc = RandomForestClassifier()
dtc = DecisionTreeClassifier()

##### Logistics regression

In [49]:
# train a Logistic Regression Model
lrc.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [50]:
# Make Predictions
y_pred = lrc.predict(X_test_scaled)



In [51]:
class_report = classification_report(y_test, y_pred)
print("\n Logistics Regression Classification Report:\n", class_report)


 Logistics Regression Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.36      0.46       422
           1       0.41      0.78      0.54       385
           2       0.49      0.28      0.35       381

    accuracy                           0.47      1188
   macro avg       0.51      0.47      0.45      1188
weighted avg       0.52      0.47      0.45      1188



##### decision tree classifier

In [52]:
# train a Decision Tree Model
dtc.fit(X_train, y_train)

In [53]:
# Make Predictions
y_pred_dtc = dtc.predict(X_test_scaled)



In [54]:
class_report_dtc = classification_report(y_test, y_pred_dtc)
print("\nDecision Tree Classification Report:\n", class_report_dtc)


Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.43      0.46       422
           1       0.39      0.64      0.49       385
           2       0.44      0.22      0.29       381

    accuracy                           0.43      1188
   macro avg       0.44      0.43      0.41      1188
weighted avg       0.44      0.43      0.41      1188



##### random forest classifier

In [55]:
# train a Random Forest Model
rfc.fit(X_train, y_train)

In [56]:
# Make Predictions
y_pred_rfc = rfc.predict(X_test_scaled)



In [57]:
class_report_rfc = classification_report(y_test, y_pred_rfc)
print("\n Random Forest Classification Report:\n", class_report_rfc)


 Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.52      0.51       422
           1       0.46      0.47      0.46       385
           2       0.46      0.44      0.45       381

    accuracy                           0.48      1188
   macro avg       0.48      0.48      0.48      1188
weighted avg       0.48      0.48      0.48      1188



#### Experimentation with cross validation

In [58]:
k = 5 
kf = KFold(n_splits=k, shuffle=True, random_state=SEED)

In [59]:
# Scale Features
X_scaled = scaler.fit_transform(X)

##### cross validation with logisitic regression

In [60]:
# K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold split
scores = cross_val_score(lrc, X_scaled, y, cv=kf, scoring='f1_macro')  # Compute accuracy for each fold

In [61]:
# Print Results
print(f"Cross-Validation Scores: {scores}")
print(f"Mean f1 score: {np.mean(scores):.2f}")

Cross-Validation Scores: [0.49180036 0.46359629 0.47321761 0.45389537 0.46528458]
Mean f1 score: 0.47


##### cross validation with decision tree

In [62]:
# K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold split
scores_dtc = cross_val_score(dtc, X_scaled, y, cv=kf, scoring='f1_macro')  # Compute accuracy for each fold

In [63]:
# Print Results
print(f"Cross-Validation Scores: {scores_dtc}")
print(f"Mean f1 score: {np.mean(scores_dtc):.2f}")

Cross-Validation Scores: [0.64411754 0.64024113 0.65186334 0.65564714 0.64476766]
Mean f1 score: 0.65


##### cross validation with random forest classifier

In [64]:
# K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold split
scores_rfc = cross_val_score(rfc, X_scaled, y, cv=kf, scoring='f1_macro')  # Compute accuracy for each fold

In [65]:
# Print Results
print(f"Cross-Validation Scores: {scores_rfc}")
print(f"Mean f1 score: {np.mean(scores_rfc):.2f}")

Cross-Validation Scores: [0.67361943 0.66115939 0.68958785 0.70360582 0.67291665]
Mean f1 score: 0.68


In [67]:
# Train the model on the full dataset
rfc.fit(X_scaled, y)

# Export the trained model
joblib.dump(rfc, "random_forest_model.pkl")

print("Model saved successfully as 'random_forest_model.pkl'")

Model saved successfully as 'random_forest_model.pkl'
