In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the CSV
# paths used by the later cells of this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Location of the reported-cases CSV on the mounted Drive
file_path = '/content/drive/My Drive/Big Data/reported.csv'

# Read the whole file into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)


In [None]:
# Reshape from wide (one column per year) to long: every column not listed in
# id_vars is unpivoted into a (Year, Cases) pair.
df_melted = df.melt(
    id_vars=['RegionCode', 'RegionName', 'HealthTopic', 'Population', 'Indicator', 'Unit'],
    var_name='Year',
    value_name='Cases'
)

# Keep only the columns used downstream, expose RegionName as Country, and turn
# the former column headers into integer years in one non-mutating chain.
# - r'(\d+)' is a raw string: '\d' in a plain literal is an invalid escape
#   sequence and triggers a warning on recent Python versions.
# - expand=False makes str.extract return a Series instead of a one-column
#   DataFrame.
# - astype(int) raises if a header contains no digits, which surfaces an
#   unexpected non-year column instead of silently producing a bad Year.
df_final = (
    df_melted[['RegionName', 'Year', 'Cases']]
    .rename(columns={'RegionName': 'Country'})
    .assign(Year=lambda long_df: long_df['Year'].str.extract(r'(\d+)', expand=False).astype(int))
)

# Save the final DataFrame to a new CSV file within the same folder
output_path = '/content/drive/My Drive/Big Data/transformed_data.csv'
df_final.to_csv(output_path, index=False)

# Plain string: the message has no placeholders, so no f-prefix is needed.
print("Transformation complete. The new CSV file is saved as 'transformed_data.csv' in the same folder.")


Transformation complete. The new CSV file is saved as 'transformed_data.csv' in the same folder.


In [None]:
# Sanity check: read the file that was just written and preview its first rows
transformed_df = pd.read_csv(filepath_or_buffer=output_path)
transformed_df.head(n=5)

Unnamed: 0,Country,Year,Cases
0,Austria,2007,9.0
1,Belgium,2007,8.0
2,Bulgaria,2007,16.0
3,Cyprus,2007,0.0
4,Czechia,2007,24.0


In [None]:
# Location of the deaths CSV on the mounted Drive
deaths_file_path = '/content/drive/My Drive/Big Data/deaths.csv'

# Read the whole file into a DataFrame
deaths_df = pd.read_csv(filepath_or_buffer=deaths_file_path)

In [None]:
# Reshape from wide (one column per year) to long: every column not listed in
# id_vars is unpivoted into a (Year, Deaths) pair.
deaths_melted = deaths_df.melt(
    id_vars=['RegionCode', 'RegionName', 'HealthTopic', 'Population', 'Indicator', 'Unit'],
    var_name='Year',
    value_name='Deaths'
)

# Keep only the columns used downstream, expose RegionName as Country, and turn
# the former column headers into integer years in one non-mutating chain.
# - r'(\d+)' is a raw string: '\d' in a plain literal is an invalid escape
#   sequence and triggers a warning on recent Python versions.
# - expand=False makes str.extract return a Series instead of a one-column
#   DataFrame.
# - astype(int) raises if a header contains no digits, which surfaces an
#   unexpected non-year column instead of silently producing a bad Year.
deaths_final = (
    deaths_melted[['RegionName', 'Year', 'Deaths']]
    .rename(columns={'RegionName': 'Country'})
    .assign(Year=lambda long_df: long_df['Year'].str.extract(r'(\d+)', expand=False).astype(int))
)


In [None]:
# Attach the deaths figures to the cases table, matching rows on Country and
# Year; the left join keeps every cases row even when no deaths row matches.
merged_df = df_final.merge(
    deaths_final,
    how='left',
    on=['Country', 'Year'],
)

In [None]:
# Normalise the placeholder markers ('-' and empty strings) to NA everywhere
merged_df = merged_df.replace(['-', ''], pd.NA)

# Coerce each count column to numbers (unparseable values become missing),
# then store it as nullable Int64 so the missing entries survive the cast
for count_col in ('Cases', 'Deaths'):
    numeric_counts = pd.to_numeric(merged_df[count_col], errors='coerce')
    merged_df[count_col] = numeric_counts.astype('Int64')

In [None]:
# Save the final DataFrame to a new CSV file
output_path = '/content/drive/My Drive/Big Data/World.csv'
merged_df.to_csv(output_path, index=False)


In [None]:
# Check the contents of the output file
transformed_df = pd.read_csv(output_path)
transformed_df.head()

Unnamed: 0,Country,Year,Cases,Deaths
0,Austria,2007,9.0,0.0
1,Belgium,2007,8.0,0.0
2,Bulgaria,2007,16.0,0.0
3,Cyprus,2007,0.0,0.0
4,Czechia,2007,24.0,
