In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

# Read the raw COVID deaths CSV from disk into a DataFrame.
file_path = '/Users/sathwik/Downloads/COVID DEATHS.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Correct Data Types
# Parse 'date' *before* imputing. Once it has a datetime dtype it is skipped
# by the categorical selection below (which excludes 'datetime'), so missing
# or unparseable dates stay NaT instead of being overwritten with the most
# frequent date in the file.
data['date'] = pd.to_datetime(data['date'], errors='coerce')

# Handling Missing Values
# Numerical columns are filled with their median, every remaining
# non-datetime column with its mode (most frequent value).
numerical_cols = data.select_dtypes(include=['number']).columns
categorical_cols = data.select_dtypes(exclude=['number', 'datetime']).columns

# Assign the filled Series back to the frame. Calling
# `data[col].fillna(..., inplace=True)` operates on an intermediate object:
# it raises a FutureWarning in pandas 2.x and does not update `data` at all
# once Copy-on-Write is enabled.
for col in numerical_cols:
    data[col] = data[col].fillna(data[col].median())

for col in categorical_cols:
    modes = data[col].mode()
    # mode() returns an empty Series when the column has no non-null values;
    # there is nothing to impute with, so leave such a column untouched.
    if not modes.empty:
        data[col] = data[col].fillna(modes.iloc[0])

# Drop rows that are exact duplicates of an earlier row (the first
# occurrence is kept); the frame is modified in place.
data.drop_duplicates(keep='first', inplace=True)

# Handling Outliers
# IQR method: a value is an outlier when it lies more than 1.5 * IQR below
# the first quartile or above the third quartile of its column.
Q1 = data[numerical_cols].quantile(0.25)
Q3 = data[numerical_cols].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is dropped as soon as ANY of its numerical columns is out of bounds.
# NOTE(review): for a column whose IQR is 0 (Q1 == Q3) both bounds collapse
# onto that quartile value, so every row holding a different value in that
# column is removed - confirm this is intended for sparse columns.
numeric_values = data[numerical_cols]
outlier_mask = ((numeric_values < lower_bound) | (numeric_values > upper_bound)).any(axis=1)

# .copy() makes the filtered frame an independent object, so assigning to its
# columns afterwards does not raise SettingWithCopyWarning.
data = data[~outlier_mask].copy()

# Data Standardization
# Fit a StandardScaler on the numerical columns and overwrite those columns
# with the scaled values it returns.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(data[numerical_cols])
data[numerical_cols] = standardized_values

# Preview the cleaned and standardized data (first five rows).
print(data.head(n=5))

  iso_code continent     location       date  population  total_cases  \
0      AFG      Asia  Afghanistan 2020-01-03    2.414599    -0.068787   
1      AFG      Asia  Afghanistan 2020-01-04    2.414599    -0.068787   
2      AFG      Asia  Afghanistan 2020-01-05    2.414599    -0.068787   
3      AFG      Asia  Afghanistan 2020-01-06    2.414599    -0.068787   
4      AFG      Asia  Afghanistan 2020-01-07    2.414599    -0.068787   

   new_cases  new_cases_smoothed  total_deaths  new_deaths  ...  \
0  -0.313052           -0.057003     -0.014358   -0.292995  ...   
1  -0.313052           -0.057003     -0.014358   -0.292995  ...   
2  -0.313052           -0.057003     -0.014358   -0.292995  ...   
3  -0.313052           -0.057003     -0.014358   -0.292995  ...   
4  -0.313052           -0.057003     -0.014358   -0.292995  ...   

   new_deaths_smoothed_per_million  reproduction_rate  icu_patients  \
0                        -0.167559           0.014785           0.0   
1               