In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the new dataset: read the CSV file at `vacc` into a DataFrame.
# NOTE(review): the path is absolute and machine-specific - adjust it when
# running this cell on another machine.
vacc = '/Users/sathwik/Downloads/owid-covid-data.csv'
data = pd.read_csv(filepath_or_buffer=vacc)

# Handling Missing Values
# Numeric columns are imputed with their median; every other non-datetime
# column is imputed with its most frequent value (mode).
numerical_cols = data.select_dtypes(include=['number']).columns
categorical_cols = data.select_dtypes(exclude=['number', 'datetime']).columns

# Assign the filled Series back to the frame instead of calling
# fillna(inplace=True) on data[col]: that form works on an intermediate
# object, is deprecated in pandas 2.x and does not update `data` when
# Copy-on-Write is enabled.
for col in numerical_cols:
    data[col] = data[col].fillna(data[col].median())

for col in categorical_cols:
    modes = data[col].mode()
    # mode() is empty when the column has no non-missing values; there is
    # nothing to impute with in that case, so the column is left unchanged.
    if not modes.empty:
        data[col] = data[col].fillna(modes.iloc[0])

# Correct Data Types (if needed)
# Date columns can be converted to datetime format here if applicable.
# Example:
# data['date_column'] = pd.to_datetime(data['date_column'], errors='coerce')

# Removing Duplicates
data.drop_duplicates(inplace=True)

# Handling Outliers
# IQR method: a value is an outlier when it lies more than 1.5 * IQR below
# the first quartile or above the third quartile of its column.
Q1 = data[numerical_cols].quantile(0.25)
Q3 = data[numerical_cols].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# NOTE(review): for a column whose IQR is 0 both bounds collapse to the same
# value, so every row whose value differs from it is flagged and removed.
# Confirm this is intended for columns dominated by a single value.
outlier_mask = (
    (data[numerical_cols] < lower_bound) | (data[numerical_cols] > upper_bound)
).any(axis=1)

# Keep only rows without any outlier. The explicit copy makes `data` an
# independent frame, so assigning columns to it afterwards does not trigger
# SettingWithCopyWarning.
data = data[~outlier_mask].copy()

# Data Standardization
# Fit a StandardScaler on the numerical columns and overwrite those columns
# with the scaled values; the fitted `scaler` remains available afterwards.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[numerical_cols])
data[numerical_cols] = scaled_values

# Show the first few rows after data cleaning and standardization.
preview = data.head()
print(preview)

     iso_code      continent             location        date  total_cases  \
3065      ATG  North America  Antigua and Barbuda  2020-03-13    -0.334691   
3066      ATG  North America  Antigua and Barbuda  2020-03-14    -0.334691   
3067      ATG  North America  Antigua and Barbuda  2020-03-15    -0.334691   
3068      ATG  North America  Antigua and Barbuda  2020-03-16    -0.334691   
3069      ATG  North America  Antigua and Barbuda  2020-03-17    -0.334691   

      new_cases  new_cases_smoothed  total_deaths  new_deaths  \
3065  -0.297807            0.229998      0.457188     0.26426   
3066  -0.303979            0.229998      0.457188     0.26426   
3067  -0.303979            0.229998      0.457188     0.26426   
3068  -0.303979            0.229998      0.457188     0.26426   
3069  -0.303979            0.229998      0.457188     0.26426   

      new_deaths_smoothed  ...  extreme_poverty  cardiovasc_death_rate  \
3065             0.307613  ...          0.04855              -1.17