In [34]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Using KNN classifier for demonstration purposes
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [20]:
df_path = '/kaggle/input/covid19-dataset/Covid Data.csv'

df = pd.read_csv(df_path)
df.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,1,1,03/05/2020,97,1,65,2,2,...,2,2,1,2,2,2,2,2,3,97
1,2,1,2,1,03/06/2020,97,1,72,97,2,...,2,2,1,2,2,1,1,2,5,97
2,2,1,2,2,09/06/2020,1,2,55,97,1,...,2,2,2,2,2,2,2,2,3,2
3,2,1,1,1,12/06/2020,97,2,53,2,2,...,2,2,2,2,2,2,2,2,7,97
4,2,1,2,1,21/06/2020,97,2,68,97,1,...,2,2,1,2,2,2,2,2,3,97


In [21]:
# Checking the number of identical records
df.duplicated().sum()

812049

In [22]:
df = df.drop('DATE_DIED', axis=1)

In [23]:
# Dropping duplicated rows
df = df.drop_duplicates()
# Resetting index
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,1,1,97,1,65,2,2,2,2,2,1,2,2,2,2,2,3,97
1,2,1,2,1,97,1,72,97,2,2,2,2,1,2,2,1,1,2,5,97
2,2,1,2,2,1,2,55,97,1,2,2,2,2,2,2,2,2,2,3,2
3,2,1,1,1,97,2,53,2,2,2,2,2,2,2,2,2,2,2,7,97
4,2,1,2,1,97,2,68,97,1,2,2,2,1,2,2,2,2,2,3,97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197458,1,13,2,1,97,2,47,97,1,2,2,2,2,2,2,2,2,2,7,97
197459,1,13,1,2,2,2,56,2,1,2,2,2,2,2,2,2,2,2,7,2
197460,1,13,2,2,2,2,51,97,2,2,2,2,1,2,2,2,2,2,7,2
197461,2,13,2,1,97,2,55,97,2,2,2,2,2,2,2,2,2,2,7,97


In [24]:
df.CLASIFFICATION_FINAL.value_counts()

CLASIFFICATION_FINAL
3    77051
7    71212
6    30528
5    10733
1     4636
2     1688
4     1615
Name: count, dtype: int64

In [25]:
"""
As the range of "CLASIFFICATION_FINAL" column varies away,
while the pationt is in covid-19 when his classification value below 4,
here is the "binarize_diagnosis" function that maps the column into binary values
where 1 for covid cariers and 0 for not cariers
"""
def binarize_diagnosis(
    classification_value : int
    ) -> int:
    return int(classification_value < 4)

In [26]:
# apply the "binarize_diagnosis" function over the "CLASIFFICATION_FINAL" column
df.CLASIFFICATION_FINAL = df.CLASIFFICATION_FINAL.apply(binarize_diagnosis)

df.CLASIFFICATION_FINAL.value_counts()

CLASIFFICATION_FINAL
0    114088
1     83375
Name: count, dtype: int64

In [27]:
# Chosing target feature
target = "CLASIFFICATION_FINAL"

# Splitting data
X = df.drop(target, axis=1)
y = df[target]

# Printing X
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197463 entries, 0 to 197462
Data columns (total 19 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   USMER           197463 non-null  int64
 1   MEDICAL_UNIT    197463 non-null  int64
 2   SEX             197463 non-null  int64
 3   PATIENT_TYPE    197463 non-null  int64
 4   INTUBED         197463 non-null  int64
 5   PNEUMONIA       197463 non-null  int64
 6   AGE             197463 non-null  int64
 7   PREGNANT        197463 non-null  int64
 8   DIABETES        197463 non-null  int64
 9   COPD            197463 non-null  int64
 10  ASTHMA          197463 non-null  int64
 11  INMSUPR         197463 non-null  int64
 12  HIPERTENSION    197463 non-null  int64
 13  OTHER_DISEASE   197463 non-null  int64
 14  CARDIOVASCULAR  197463 non-null  int64
 15  OBESITY         197463 non-null  int64
 16  RENAL_CHRONIC   197463 non-null  int64
 17  TOBACCO         197463 non-null  int64
 18  ICU 

In [28]:
non_numeric_cols = X.select_dtypes(include=['object', 'category']).columns
print("Non-numeric columns:", non_numeric_cols)

Non-numeric columns: Index([], dtype='object')


In [29]:
# Convert non-numeric columns to numeric (example: using one-hot encoding)
X_encoded = pd.get_dummies(X, columns=non_numeric_cols)

# Convert to numpy array
X = X_encoded.values.astype(float)

## To implement Linear Discriminant Analysis (LDA) from scratch, we need to follow these steps:

1. Compute the mean vectors for each class.
2. Compute the within-class scatter matrix.
3. Compute the between-class scatter matrix.
4. Compute the eigenvectors and eigenvalues of the matrix (S_w^-1 * S_b).
5. Choose the k eigenvectors that correspond to the k largest eigenvalues to form the projection matrix W.
6. Project the samples onto the new feature subspace.

### Linear Discriminant Analysis (LDA) Equations:

#### Within-Class Scatter Matrix:
\[ S_W = sum_{c=1}^{C} sum_{i=1}^{n_c} (x_i^c - m_c)(x_i^c - m_c)^T \]
where:
- \( c \) is the class label.
- \( n_c \) is the number of samples in class \( c \).
- \( x_i^c \) is the \( i \)-th sample in class \( c \).
- \( m_c \) is the mean vector of class \( c \) samples.

#### Between-Class Scatter Matrix:
\[ S_B = \sum_{c=1}^{C} n_c (m_c - m)(m_c - m)^T \]
where:
- \( m \) is the overall mean vector of all samples.

#### Solving Eigenvalue Problem:
\[ S_W^{-1} S_B w = \lambda w \]
where:
- \( w \) is the eigenvector corresponding to the largest eigenvalue \( \lambda \).

In [30]:
import numpy as np

class LinearDiscriminantAnalysis:
    def __init__(self):
        self.W = None
        
    def fit(self, X, y):
        # Compute class means
        self.class_means = {}
        for c in np.unique(y):
            self.class_means[c] = np.mean(X[y == c], axis=0)
        
        # Compute within-class scatter matrix
        self.S_w = np.zeros((X.shape[1], X.shape[1]))
        for c in self.class_means.keys():
            class_scatter = np.zeros((X.shape[1], X.shape[1]))
            for x in X[y == c]:
                x, mean = x.reshape(-1, 1), self.class_means[c].reshape(-1, 1)
                class_scatter += (x - mean).dot((x - mean).T)
            self.S_w += class_scatter
        
        # Compute between-class scatter matrix
        overall_mean = np.mean(X, axis=0).reshape(-1, 1)
        self.S_b = np.zeros((X.shape[1], X.shape[1]))
        for c in self.class_means.keys():
            n = X[y == c].shape[0]
            class_mean = self.class_means[c].reshape(-1, 1)
            self.S_b += n * (class_mean - overall_mean).dot((class_mean - overall_mean).T)
        
        # Compute eigenvalues and eigenvectors of (S_w^-1 * S_b)
        eig_vals, eig_vecs = np.linalg.eig(np.linalg.inv(self.S_w).dot(self.S_b))
        
        # Choose k eigenvectors corresponding to k largest eigenvalues
        self.W = eig_vecs[:, np.argsort(eig_vals)[::-1][:1]]
        
    def transform(self, X):
        # Transform data using LDA and return only the real part
        return np.real(np.dot(X, self.W))

In [31]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [32]:
# Apply LDA
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
X_train_lda = lda.transform(X_train)
X_test_lda = lda.transform(X_test)

In [35]:
# Feature selection using Random Forest feature importance
rf = RandomForestClassifier(n_estimators=100, random_state=42)
knn = KNeighborsClassifier(n_neighbors=3)

rf.fit(X_train, y_train)
knn.fit(X_train_lda, y_train)

# Predict the labels
y_pred_knn = knn.predict(X_test_lda)
y_pred_rf = knn.predict(X_test_lda)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [36]:
# Evaluate the performance
print('Accuracy:', accuracy_score(y_test, y_pred_knn))
print('Classification Report:\n', classification_report(y_test, y_pred_knn))

Accuracy: 0.49774641705633116
Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.61      0.58     34165
           1       0.39      0.35      0.37     25074

    accuracy                           0.50     59239
   macro avg       0.48      0.48      0.48     59239
weighted avg       0.49      0.50      0.49     59239



  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [37]:
# Evaluate the performance
print('Accuracy:', accuracy_score(y_test, y_pred_rf))
print('Classification Report:\n', classification_report(y_test, y_pred_rf))

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Accuracy: 0.49774641705633116
Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.61      0.58     34165
           1       0.39      0.35      0.37     25074

    accuracy                           0.50     59239
   macro avg       0.48      0.48      0.48     59239
weighted avg       0.49      0.50      0.49     59239



In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score, classification_report

# # Hyperparameter tuning for LDA
# lda_params = {'solver': ['svd', 'lsqr', 'eigen']}
# lda = LinearDiscriminantAnalysis()
# lda_grid = GridSearchCV(lda, lda_params, cv=5)
# lda_grid.fit(X_train, y_train)
# best_lda = lda_grid.best_estimator_

# # Apply LDA
# X_train_lda = best_lda.transform(X_train)
# X_test_lda = best_lda.transform(X_test)

# # Hyperparameter tuning for KNN
# knn_params = {'n_neighbors': [3, 5, 7, 9, 11]}
# knn = KNeighborsClassifier()
# knn_grid = GridSearchCV(knn, knn_params, cv=5)
# knn_grid.fit(X_train_lda, y_train)
# best_knn = knn_grid.best_estimator_

# # Evaluate the performance
# y_pred = best_knn.predict(X_test_lda)
# print('Accuracy:', accuracy_score(y_test, y_pred))
# print('Classification Report:\n', classification_report(y_test, y_pred))
