# COVID-19 mRNA Vaccine Degradation (EDA)

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


## Data

### Load Data

In [None]:
# Read both CSV splits (train first, then test) into DataFrames.
# NOTE(review): the paths look like a Colab Google Drive mount -- confirm
# the drive is mounted before running this cell.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/covid-19/train.csv",
        "/content/drive/MyDrive/covid-19/test.csv",
    )
)

### Data Overview



In [None]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

In [None]:
# Print the DataFrame.info() summary of the training frame
# (useful for checking column names and dtypes before plotting).
train_data.info()

In [None]:
# Number of missing entries in each column of the training frame.
train_data.isna().sum()

In [None]:
# Number of missing entries in each column of the test frame.
test_data.isna().sum()

In [None]:
# Display the DataFrame.describe() summary statistics for the training frame.
train_data.describe()

In [None]:
# Strip the identifier columns from both splits, mutating each frame in place.
# A missing column still raises KeyError, so this cell is not re-runnable
# without reloading the data.
for frame in (train_data, test_data):
    for id_column in ('id', 'id_seqpos'):
        frame.drop(columns=id_column, inplace=True)

In [None]:
# One histogram (30 bins, KDE overlay) per float64 column of the training frame.
# `num_cols` is kept at module level because later cells reuse it.
num_cols = train_data.select_dtypes(include=['float64']).columns
for feature in num_cols:
    plt.figure(figsize=(8, 4))
    ax = sns.histplot(data=train_data, x=feature, bins=30, kde=True)
    ax.set(xlabel=feature, ylabel='Count', title=f'Distribution of {feature}')
    plt.show()

### Categorical Features

In [None]:
# Frequency bar chart for every object-dtype column of the training frame.
# `cat_cols` is kept at module level because later cells reuse it.
cat_cols = train_data.select_dtypes(include=['object']).columns

for category in cat_cols:
    # Bars follow value_counts() order, i.e. most frequent category first.
    by_frequency = train_data[category].value_counts().index
    plt.figure(figsize=(10, 5))
    ax = sns.countplot(data=train_data, x=category, order=by_frequency)
    plt.xticks(rotation=45)
    ax.set(xlabel=category, ylabel='Count', title=f'Counts of {category}')
    plt.show()

### Correlation Analysis

In [None]:
# Pairwise correlations between the columns listed in `num_cols`.
correlation_matrix = train_data.loc[:, num_cols].corr()

# Draw the matrix as an annotated heatmap (two-decimal cell labels).
plt.figure(figsize=(10, 8))
ax = sns.heatmap(data=correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
ax.set_title('Correlation Heatmap')
plt.show()


### Box Plots

In [None]:
# One horizontal box plot per column in `num_cols`, each on its own figure.
for feature in num_cols:
    plt.figure(figsize=(8, 4))
    ax = sns.boxplot(data=train_data, x=feature)
    ax.set(xlabel=feature, title=f'Box Plot of {feature}')
    plt.show()

### Feature Relationships

In [None]:
# Violin plot of the 'reactivity' column against each column in `cat_cols`,
# one figure per categorical column.
for grouping in cat_cols:
    plt.figure(figsize=(10, 5))
    ax = sns.violinplot(data=train_data, x=grouping, y='reactivity')
    plt.xticks(rotation=45)
    ax.set(xlabel=grouping, ylabel='Reactivity', title=f'Reactivity vs. {grouping}')
    plt.show()
