In [1]:
#set the columns name to lower case, strip and replace empty spaces
def column_names(df_name):
    new_column_names = [column.strip().lower().replace(" ", "_") for column in df_name.columns]
    df_name.columns = new_column_names

#change the name of a column
def change_column_name(df, old_name, new_name):
    df.rename(columns={old_name: new_name}, inplace=True)
    return df 

#drop one column
def drop_column(df, column):
    df = df.drop(column,axis='columns')
    return df
#drop one row by the index
def drop_row_index(df, df_index):
    df = df.drop(index=df_index)
    return df
#change to numeric 
def to_numeric(df, columns_to_proces):
    for column in columns_to_proces: 
        df[column] = pd.to_numeric(df[column], errors='coerce')


# 1. Cleaning-Data

In [2]:
# Import pandas
import pandas as pd
#from functions import column_names

In [3]:
# Load the datasets
df_pop = pd.read_csv('../Data/Raw data/1. population per country from 1960.csv')
df_co2 = pd.read_csv('../Data/Raw data/1. co2_emissions_kt_by_country.csv')

In [4]:
# Reshape the population dataframe from wide to long format
df_pop = df_pop.melt(id_vars=["Country Name"], var_name="year", value_name="population")

## Correct the column names

In [5]:
# setting the columns names to lower case and eliminating empty spaces
df_pop.column = column_names(df_pop)
df_co2.column = column_names(df_co2)
# Change the country_name to countries so we can merge there
old_name = "country_name"
new_name = "country"
df_pop = change_column_name(df_pop, old_name, new_name)
df_co2 = change_column_name(df_co2, old_name, new_name)
# Rename columns names to be consistent
old_name1 = "value"
new_name1 = "co2_emission"
df_co2 = change_column_name(df_co2, old_name1, new_name1)

# Drop the columns we won't use

In [6]:
column_to_drop = ['country_code']
df_co2 = drop_column(df_co2, column_to_drop)

## 1.1 Null-Values

### Cleaning Null Values
#### 1. Checking for Null Values:

In [7]:
df_pop.isnull().any()

country       False
year          False
population    False
dtype: bool

In [8]:
df_co2.isnull().any()

country         False
year            False
co2_emission    False
dtype: bool

### Check for NaN values

In [9]:
#check if this is ok
df_pop.isnull().any()

country       False
year          False
population    False
dtype: bool

In [10]:
df_co2.isnull().any()

country         False
year            False
co2_emission    False
dtype: bool

### Drop duplicates

In [11]:
df_pop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16758 entries, 0 to 16757
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     16758 non-null  object 
 1   year        16758 non-null  object 
 2   population  16758 non-null  float64
dtypes: float64(1), object(2)
memory usage: 392.9+ KB


In [24]:
df_pop = df_pop.drop_duplicates()

In [13]:
df_co2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13953 entries, 0 to 13952
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country       13953 non-null  object 
 1   year          13953 non-null  int64  
 2   co2_emission  13953 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 327.2+ KB


In [26]:
df_co2 = df_co2.drop_duplicates()

## Change the data types

In [15]:
#examine the current data types
df_pop.dtypes

country        object
year           object
population    float64
dtype: object

In [16]:
#examine the current data types
df_co2.dtypes

country          object
year              int64
co2_emission    float64
dtype: object

In [22]:
#change the data type to numeric values
pop_columns_to_proces = ['year','population']
to_numeric(df_pop, pop_columns_to_proces)

co2_columns_to_proces = ['year','co2_emission']
to_numeric(df_co2, co2_columns_to_proces)

## Standardize country names

In [31]:
country_name_mapping =  {
    'Afghanistan' : 'Afghanistan',
    'Afghanistan' : 'Afghanistan',
    "Islamic Rep. of', Albania" : 'Albania',
    'Algeria' : 'Algeria',
    'Andorra' : 'Andorra',
    'Andorra, Principality of': 'Andorra',
    'Angola' : 'Angola',
    'Antigua and Barbuda' : 'Antigua and Barbuda',
    'Argentina' : 'Argentina' ,
    'Armenia' : 'Armenia',
    'Armenia, Rep. of' : 'Armenia',
    'Aruba, Kingdom of the Netherlands' :'Aruba',
    'Australia' : 'Australia',
    'Austria' : 'Austria',
    'Azerbaijan' : 'Azerbaijan',
    'Azerbaijan, Rep. of' : 'Azerbaijan',
    'Bahamas, The' : 'Bahamas',
    'Bahamas, The' : 'Bahamas',
    'Bahrain' : 'Bahrain',
    'Bahrain, Kingdom of' : 'Bahrain',
    'Bangladesh' : 'Bangladesh',
    'Barbados' : 'Barbados',
    'Barbados' : 'Barbados',
    'Belarus' : 'Belarus',
    'Belarus, Rep. of' : 'Belarus',
    'Belgium' : 'Belgium',
    'Belize' :'Belize',
    'Benin' : 'Benin',
    'Bhutan' : 'Bhutan',
    'Bolivia' : 'Bolivia',
    'Bosnia and Herzegovina' : 'Bosnia and Herzegovina',
    'Botswana' : 'Botswana',
    'Brazil' : 'Brazil',
    'British Virgin Islands' : 'British Virgin Islands',
    'Central African Republic' :'Central African Republic',
    'Central African Rep.' : 'Central African Republic',
    'Comoros' : 'Comoros',
    'Comoros, Union of the' : 'Comoros',
    'Congo, Rep.' : 'Congo, Rep.',
    'Congo, Rep. of' : 'Congo, Rep.',
    'Croatia, Rep. of' : 'Croatia',
    'Dominican Republic' : 'Dominican Rep.',
    'Egypt, Arab Rep.' : 'Egypt, Arab Rep. of',
    'Equatorial Guinea, Rep. of' : 'Equatorial Guinea',
    'Eritrea, The State of' : 'Eritrea',
    'Estonia, Rep. of' : 'Estonia',
    'Eswatini, Kingdom of' : 'Eswatini',
    'Ethiopia, The Federal Dem. Rep. of' : 'Ethiopia',
    'Fiji, Rep. of' : 'Fiji',
    'Kazakhstan, Rep. of' : 'Kazakhstan',
    "Korea, Dem. People's Rep. of" :"Korea, Dem. People's Rep.",
    'Korea, Rep. of' : 'Korea, Rep.',
    'Kyrgyz Republic' : 'Kyrgyz Rep.',
    "Lao People's Dem. Rep." : 'Lao PDR',
    'Lesotho, Kingdom of' : 'Lesotho',
    'Marshall Islands, Rep. of the' : 'Marshall Islands',
    'Mauritania, Islamic Rep. of' : 'Mauritania',
    'Micronesia, Federated States of' : 'Micronesia',
    'Micronesia, Fed. Sts.' : 'Micronesia',
    'Moldova, Rep. of' : 'Moldova',
    'Mozambique, Rep. of' : 'Mozambique',
    'Nauru, Rep. of' : 'Nauru',
    'Netherlands, The' : 'Netherlands',
    'North Macedonia, Republic of' : 'North Macedonia',
    'Palau, Rep. of' : 'Palau',
    'Poland, Rep. of' : 'Poland',
    'Serbia, Rep. of' : 'Serbia',
    'Slovak Rep.': 'Slovak',
    'South Sudan, Rep. of' : 'South Sudan',
    'Syrian Arab Republic' : 'Syrian Arab Rep',
    'Tajikistan, Rep. of' : 'Tajikistan',
    'Tanzania, United Rep. of' : 'Tanzania',
    'Timor-Leste, Dem. Rep. of' : 'Timor-Leste',
    'Turkiye' : 'Turkey',
    "Viet Nam": "Vietnam", 
    "Türkiye": "Turkey",
    'Uzbekistan, Rep. of' : 'Uzbekistan',
    'Venezuela, Rep. Bolivariana de' : 'Venezuela',
    'West Bank and Gaza' : 'Palestine',
    'Yemen, Rep. of' : 'Yemen, Rep.',
    }

In [32]:
df_pop['country'] = df_pop['country'].replace(country_name_mapping)
df_co2['country'] = df_co2['country'].replace(country_name_mapping)

In [None]:


# Standardize country names
df_pop['country'] = df_['country'].replace({"Viet Nam": "Vietnam", "Türkiye": "Turkey"})
df_co2['country'] = df_co2['country'].replace({"Viet Nam": "Vietnam", "Türkiye": "Turkey"})

# Drop rows with regions that are not typically classified as countries
non_country_regions = [
    'American Samoa', 'Channel Islands', 'Guam', 'Isle of Man', 'Monaco',
    'Northern Mariana Islands', 'Puerto Rico', 'San Marino',
    'St. Martin (French part)', 'Virgin Islands (U.S.)', 'Not classified'
]

df_pop_long = df_pop_long[~df_pop_long['country'].isin(non_country_regions)]
df_co2 = df_co2[~df_co2['country'].isin(non_country_regions)]

# Final check for discrepancies
unique_countries_pop = set(df_pop_long['country'].unique())
unique_countries_co2 = set(df_co2['country'].unique())

# Find remaining discrepancies
final_discrepancies = unique_countries_pop.symmetric_difference(unique_countries_co2)

# Display final discrepancies (should be empty)
print(final_discrepancies)  # Should be empty

# Merge the two dataframes on country and year
data = pd.merge(df_pop_long, df_co2, on=['country', 'year'], how='inner')

# Display the first few rows of the merged dataframe
print(data.head())

# Save the merged dataframe to a CSV file
data.to_csv('merged_population_co2.csv', index=False)

NameError: name 'df_pop_long' is not defined

In [None]:
# Step 1: Clean the file by removing NA and duplicated values
data_cleaned = data.dropna().drop_duplicates()

# Standardize the country names (remove leading/trailing whitespace)
data_cleaned['country'] = data_cleaned['country'].str.strip()

# Step 2: Remove the '.0' from the end of the population numbers
data_cleaned['population'] = data_cleaned['population'].astype(str).str.replace('.0', '', regex=False).astype(int)

# Step 3: Drop the country code column
data_cleaned = data_cleaned.drop(columns=['country_code'])

# Step 4: Ensure correct data types for remaining columns
data_cleaned['year'] = data_cleaned['year'].astype(int)
data_cleaned['co2_emission'] = data_cleaned['co2_emission'].astype(float)

# Optionally, save the cleaned data to a new CSV file
data_cleaned.to_csv('cleaned_population_co2.csv', index=False)
print(data_cleaned)


                           country  year  population   co2_emission
0                            Aruba  1960       54608   11092.675000
1      Africa Eastern and Southern  1960   130692579  118545.901306
2                      Afghanistan  1960     8622466     414.371000
3       Africa Western and Central  1960    97256290    8760.463000
4                           Angola  1960     5357195     550.050000
...                            ...   ...         ...            ...
13918                        Samoa  2019      211905     300.000012
13919                  Yemen, Rep.  2019    31546691   11100.000381
13920                 South Africa  2019    58087055  439640.014648
13921                       Zambia  2019    18380477    6800.000191
13922                     Zimbabwe  2019    15354608   11760.000229

[13923 rows x 4 columns]
