In [18]:
import pandas as pd

# Read the two raw CSV files from the working directory.
df_pop = pd.read_csv('population per country from 1960.csv')
df_co2 = pd.read_csv('co2_emissions_kt_by_country.csv')

# Population is stored wide; unpivot it so every row is one country/year pair,
# give the key column the shared name "country", and turn the former column
# labels (now in "year") into integers.
df_pop_long = (
    df_pop
    .melt(id_vars=["Country Name"], var_name="year", value_name="population")
    .rename(columns={"Country Name": "country"})
    .astype({"year": int})
)

# Bring the CO2 frame onto the same column names and the same integer year
# type so the two frames can later be joined on (country, year).
df_co2 = (
    df_co2
    .rename(columns={"country_name": "country", "value": "co2_emission"})
    .astype({"year": int})
)

# Spellings that are rewritten to a single form in both frames.
country_aliases = {"Viet Nam": "Vietnam", "Türkiye": "Turkey"}

# For each frame: drop exact duplicate rows, drop rows with any missing value,
# then apply the spelling map to the country column.
df_pop_long = (
    df_pop_long
    .drop_duplicates()
    .dropna()
    .assign(country=lambda frame: frame['country'].replace(country_aliases))
)
df_co2 = (
    df_co2
    .drop_duplicates()
    .dropna()
    .assign(country=lambda frame: frame['country'].replace(country_aliases))
)

# Names whose rows are removed from both frames before merging.
# NOTE(review): this list contains sovereign states (Monaco, San Marino), while
# aggregate rows such as 'Africa Eastern and Southern' still appear in the
# merged output — confirm that this is the intended selection.
non_country_regions = [
    'American Samoa',
    'Channel Islands',
    'Guam',
    'Isle of Man',
    'Monaco',
    'Northern Mariana Islands',
    'Puerto Rico',
    'San Marino',
    'St. Martin (French part)',
    'Virgin Islands (U.S.)',
    'Not classified',
]

# Keep only the rows whose country is NOT in the exclusion list.
df_pop_long = df_pop_long.loc[
    lambda frame: ~frame['country'].isin(non_country_regions)
]
df_co2 = df_co2.loc[
    lambda frame: ~frame['country'].isin(non_country_regions)
]

# Collect the distinct country names present in each frame.
unique_countries_pop = set(df_pop_long['country'].unique())
unique_countries_co2 = set(df_co2['country'].unique())

# Names that occur in exactly one of the two frames. Ideally this is empty;
# any name printed here has no counterpart in the other frame and will
# therefore not survive an inner join on country.
final_discrepancies = unique_countries_pop ^ unique_countries_co2
print(final_discrepancies)

# Inner-join population and CO2 rows that share the same country and year;
# rows without a match on both keys are left out of the result.
data = df_pop_long.merge(df_co2, how='inner', on=['country', 'year'])

# Preview the top of the joined table.
print(data.head(5))

# Persist the joined table without the index column.
data.to_csv('merged_population_co2.csv', index=False)

{'Turkey'}
                       country  year   population country_code   co2_emission
0                        Aruba  1960      54608.0          ABW   11092.675000
1  Africa Eastern and Southern  1960  130692579.0          AFE  118545.901306
2                  Afghanistan  1960    8622466.0          AFG     414.371000
3   Africa Western and Central  1960   97256290.0          AFW    8760.463000
4                       Angola  1960    5357195.0          AGO     550.050000


In [21]:
# Clean the merged table in one chained pass, producing a new frame and
# leaving `data` untouched:
#   1. drop rows with any missing value, then exact duplicate rows;
#   2. trim leading/trailing whitespace from country names;
#   3. store population as whole numbers;
#   4. make the year / co2_emission dtypes explicit;
#   5. drop the country_code column.
data_cleaned = (
    data
    .dropna()
    .drop_duplicates()
    .assign(
        country=lambda frame: frame['country'].str.strip(),
        # Convert numerically instead of editing the text form: deleting the
        # substring '.0' from str(value) turns e.g. 1234.05 into 12345 and
        # fails outright on other fractional or scientific-notation values.
        # to_numeric also accepts a column that was read as strings; an
        # explicit int64 keeps large totals from depending on the platform's
        # default integer width.
        population=lambda frame: (
            pd.to_numeric(frame['population']).round().astype('int64')
        ),
        year=lambda frame: frame['year'].astype(int),
        co2_emission=lambda frame: frame['co2_emission'].astype(float),
    )
    .drop(columns=['country_code'])
)

# Save the cleaned table (without the index) and show it.
data_cleaned.to_csv('cleaned_population_co2.csv', index=False)
print(data_cleaned)


                           country  year  population   co2_emission
0                            Aruba  1960       54608   11092.675000
1      Africa Eastern and Southern  1960   130692579  118545.901306
2                      Afghanistan  1960     8622466     414.371000
3       Africa Western and Central  1960    97256290    8760.463000
4                           Angola  1960     5357195     550.050000
...                            ...   ...         ...            ...
13918                        Samoa  2019      211905     300.000012
13919                  Yemen, Rep.  2019    31546691   11100.000381
13920                 South Africa  2019    58087055  439640.014648
13921                       Zambia  2019    18380477    6800.000191
13922                     Zimbabwe  2019    15354608   11760.000229

[13923 rows x 4 columns]
