In [1]:
import pandas as pd

# Load the dataset
file_path = "WHO-COVID-19-global-daily-data.csv"
df = pd.read_csv(file_path)

# Display initial dataset info
print("Initial Dataset Info:")
df.info()
print("\nFirst 5 Rows:")
print(df.head())

# Step 1: Check for duplicate rows and remove them
duplicate_count = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_count}")
df = df.drop_duplicates()

# Step 2: Handle missing values
# Fill missing country codes with "Unknown"
df["Country_code"] = df["Country_code"].fillna("Unknown")

# Fill missing new cases and deaths with 0 (assuming no report means no cases/deaths)
df["New_cases"] = df["New_cases"].fillna(0)
df["New_deaths"] = df["New_deaths"].fillna(0)

# Step 3: Convert Date_reported to datetime format
df["Date_reported"] = pd.to_datetime(df["Date_reported"], errors='coerce')

# Step 4: Verify data types after preprocessing
print("\nDataset Info After Cleaning:")
df.info()

# Save cleaned data to a new file
cleaned_file_path = "cleaned_WHO_COVID19_data.csv"
df.to_csv(cleaned_file_path, index=False)
print(f"\nCleaned dataset saved as: {cleaned_file_path}")


Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450720 entries, 0 to 450719
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Date_reported      450720 non-null  object 
 1   Country_code       448842 non-null  object 
 2   Country            450720 non-null  object 
 3   WHO_region         450720 non-null  object 
 4   New_cases          207873 non-null  float64
 5   Cumulative_cases   450720 non-null  int64  
 6   New_deaths         152814 non-null  float64
 7   Cumulative_deaths  450720 non-null  int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 27.5+ MB

First 5 Rows:
  Date_reported Country_code         Country WHO_region  New_cases  \
0    2020-01-04           AF     Afghanistan        EMR        NaN   
1    2020-01-04           AL         Albania        EUR        NaN   
2    2020-01-04           DZ         Algeria        AFR        NaN   
3    2020-01-04     