<a href="https://colab.research.google.com/github/LS10432/PDDS-PYD-0825-06Oct2025B/blob/main/Project/A3/A3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

# Read the OWID COVID CSV into a DataFrame and report its raw dimensions
# (before any cleaning step has touched it).
df = pd.read_csv('/content/owid-covid-data.csv')
n_rows, n_cols = df.shape
print(f"Number of columns before removal: {n_cols}")
print(f"Total number of rows: {n_rows}")

# Drop columns that are almost entirely empty.
# A column is removed when strictly more than this percentage of its cells is
# missing. Kept in one named constant so the filter and the printed message
# cannot drift apart.
MISSING_THRESHOLD_PCT = 90

# Share of missing cells per column, expressed on a 0-100 scale.
missing_values_percentage = df.isnull().mean() * 100
columns_to_drop = missing_values_percentage[
    missing_values_percentage > MISSING_THRESHOLD_PCT
].index.tolist()
print(f"\nColumns with more than {MISSING_THRESHOLD_PCT}% missing values:")
print(columns_to_drop)
print(f"\nNumber of columns to be dropped: {len(columns_to_drop)}")

# Remove the sparse columns; every other column is kept in its original order.
df = df.drop(columns=columns_to_drop)

# Sanity check: none of the columns selected for removal is still present.
assert not set(columns_to_drop) & set(df.columns), "sparse columns were not dropped"

print(f"\nNumber of columns after removal: {df.shape[1]}")
display(df.columns)

# Parse the 'date' column into pandas datetime values and store it back under
# the same name, then print a separator followed by the frame's dtype summary.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

separator = "=" * 60
print("\n" + separator)
print("\nColumn info after datetime conversion:")
df.info()

# Remove exact duplicate rows and report how many rows were affected.
rows_before = df.shape[0]

# duplicated() with its default keep='first' flags every repeat of a row except
# its first occurrence, i.e. exactly the rows drop_duplicates() removes below.
duplicate_mask = df.duplicated()
duplicate_rows = duplicate_mask.sum()

# Number of *distinct* rows that have at least one extra copy. Collapsing the
# flagged repeats among themselves makes this differ from `duplicate_rows`
# whenever the same row appears three or more times.
unique_duplicate_rows = df[duplicate_mask].drop_duplicates().shape[0]

df = df.drop_duplicates()
rows_after = df.shape[0]

# Sanity check: the number of rows removed must equal the number flagged.
assert rows_after == rows_before - duplicate_rows, (
    "drop_duplicates() removed an unexpected number of rows"
)

print(f"\nNumber of rows before removing duplicates: {rows_before}")
print(f"Number of duplicate rows: {duplicate_rows}")
print(f"Number of unique duplicate rows: {unique_duplicate_rows}")
print(f"Number of rows after removing duplicates: {rows_after}")


# Snapshot of the null count in every column before any value is filled in.
print("\nNumber of missing values per column before imputation:")
missing_before = df.isnull().sum()
print(missing_before)

# Numeric imputation: every float64/int64 column that contains gaps has those
# gaps replaced by that column's own mean. The mean is taken over the whole
# column (all rows together), not per location.
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
numeric_cols_with_gaps = [name for name in numerical_cols if df[name].isnull().any()]
for col in numeric_cols_with_gaps:
    mean_val = df[col].mean()  # Series.mean() skips NaN by default
    df[col] = df[col].fillna(mean_val)

# Categorical imputation: object-dtype columns that contain gaps are filled
# with their most frequent value.
# NOTE(review): the dtype summary printed by this cell shows `continent` as an
# object column with nulls, so it is filled with the most common continent
# here - confirm that is intended for the rows that had no continent.
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if not df[col].isnull().any():
        continue  # nothing to fill in this column
    # mode() returns every value tied for most frequent; [0] takes the first one.
    mode_val = df[col].mode()[0]
    df[col] = df[col].fillna(mode_val)

# Re-count nulls per column so the effect of the imputation is visible.
print("\nNumber of remaining missing values per column after imputation:")
remaining_missing = df.isnull().sum()
print(remaining_missing)

# Print the written rationale for the mean / mode imputation choices.
# NOTE(review): the text says the mean suits data that is "not heavily skewed";
# nothing in this cell checks skewness - confirm the claim holds for this data.
print(
    "\nReasoning for imputation methods:",
    "Mean is used for numerical columns because it represents the central tendency of the data and is a common method for imputing missing numerical values when the data is not heavily skewed.",
    "Mode is used for categorical columns because it represents the most frequent category, which is a suitable way to fill missing values in discrete data where the concept of mean is not applicable.",
    sep="\n",
)


# Derive two calendar columns from 'date' through the .dt accessor (which
# requires 'date' to already hold datetime values).
date_parts = df['date'].dt
df['year'] = date_parts.year
df['month'] = date_parts.month

# Print the written rationale for adding these two columns.
print("\nReasoning for extracting year and month:")
print("Extracting year and month from the date column is crucial for time-series analysis. It allows for grouping data by specific time periods, identifying seasonal trends, and comparing metrics across different years and months. This granularity in time data is essential for understanding patterns and changes in the COVID-19 data over time.")

# Collect the distinct values of the 'location' column and count them.
# NOTE(review): every distinct 'location' is counted as a country. The dtype
# summary printed by this cell shows some rows had no 'continent' before
# imputation, which may indicate non-country entries - verify against the
# dataset's codebook before reporting this number as a country count.
unique_countries = df['location'].unique()
num_unique_countries = len(unique_countries)

# Report the total first, then the full list for manual verification.
print(f"\nTotal number of unique countries: {num_unique_countries}")
print("\nList of unique countries:")
print(unique_countries)

Number of columns before removal: 67
Total number of rows: 71110

Columns with more than 90% missing values:
['hosp_patients', 'hosp_patients_per_million', 'weekly_icu_admissions', 'weekly_icu_admissions_per_million', 'weekly_hosp_admissions', 'weekly_hosp_admissions_per_million', 'excess_mortality_cumulative_absolute', 'excess_mortality_cumulative', 'excess_mortality', 'excess_mortality_cumulative_per_million']

Number of columns to be dropped: 10

Number of columns after removal: 57


Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred',
       'people_fully_vaccinated_per_hundred', 'total_boosters_per_hundred',
       'new_vaccinations_smoothed_per_million',
    



Column info after datetime conversion:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71110 entries, 0 to 71109
Data columns (total 57 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   iso_code                                    71110 non-null  object        
 1   continent                                   68262 non-null  object        
 2   location                                    71110 non-null  object        
 3   date                                        71110 non-null  datetime64[ns]
 4   total_cases                                 66579 non-null  float64       
 5   new_cases                                   71060 non-null  float64       
 6   new_cases_smoothed                          70805 non-null  float64       
 7   total_deaths                                63953 non-null  float64       
 8   new_deaths                                  7