# Assessment 1: Group 6


**Group Members:**

> Abrigo, Nathanael\
> Buhay, Kyle Andrei\
> Cruz, Kristel Lenci\
> Entrata, Joshua Kyle


TEST PR TEST PR

In [488]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pycountry
import pycountry_convert as pc

import warnings

In [489]:
warnings.filterwarnings("ignore")

In [490]:
pd.set_option('display.max_columns', None)

## `Part 1: Data Wrangling`


### **Data Cleaning**


#### Loading Datasets


In [491]:
# Load FAO dataset
df_fao = pd.read_csv('FAO.csv', encoding='iso-8859-1')

# Load FAOSTAT dataset
df_faostat = pd.read_csv('FAOSTAT.csv')

#### Data Dictionary


`FAO Dataset`


In [492]:
df_fao.sample(3)

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,Y1967,Y1968,Y1969,Y1970,Y1971,Y1972,Y1973,Y1974,Y1975,Y1976,Y1977,Y1978,Y1979,Y1980,Y1981,Y1982,Y1983,Y1984,Y1985,Y1986,Y1987,Y1988,Y1989,Y1990,Y1991,Y1992,Y1993,Y1994,Y1995,Y1996,Y1997,Y1998,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
6646,ETH,238,Ethiopia,2948,Milk - Excluding Butter,5521,Feed,1000 tonnes,9.15,40.49,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10.0,10.0,10.0,11.0,11.0,12.0,12.0,16.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18,18
17303,SLE,197,Sierra Leone,2922,Stimulants,5142,Food,1000 tonnes,8.46,-11.78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,5.0,5.0,6.0,6.0,7.0,8.0,8.0,8.0,8.0,7.0,9.0,9.0,9.0,10.0,8.0,7.0,6.0,6.0,7.0,7.0,8.0,7.0,8.0,8.0,9.0,10.0,10.0,10.0,10,11
19309,TTO,220,Trinidad and Tobago,2558,Rape and Mustardseed,5142,Food,1000 tonnes,10.69,-61.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


In [493]:
# Create a list of descriptions
description_list = [
    'Abbreviation of Area',
    'Full name of the country or region',
    'ISO country code or region abbreviation',
    'Numeric code for the country or region',
    'Numeric code for the item',
    'Name of the food item or product',
    'Numeric code for the element',
    'Type of data recorded',
    'Measurement unit for the recorded data',
    'Latitude coordinate of the location',
    'Longitude coordinate of the location',
]

In [494]:
# Function to generate a data dictionary
def generate_data_dictionary(df):
    data_dict = {
        'Column Name': [],
        'Data Type': [],
        'Description': []
    }
    
    for i, column in enumerate(df.columns):
        data_dict['Column Name'].append(column)
        data_dict['Data Type'].append(df[column].dtype)
        # Provide a description based on the description list or a default value
        description = description_list[i] if i < len(description_list) else 'N/A'
        # If the column name starts with 'Y' and is a year, format accordingly
        if column.startswith('Y') and column[1:].isdigit():
            description = f'Data for the year {column[1:]}.'
        data_dict['Description'].append(description)
    
    return pd.DataFrame(data_dict)

In [495]:
# Generate data dictionary
fao_data_dict = generate_data_dictionary(df_fao)

# Display data dictionary
fao_data_dict

Unnamed: 0,Column Name,Data Type,Description
0,Area Abbreviation,object,Abbreviation of Area
1,Area Code,int64,Full name of the country or region
2,Area,object,ISO country code or region abbreviation
3,Item Code,int64,Numeric code for the country or region
4,Item,object,Numeric code for the item
...,...,...,...
58,Y2009,float64,Data for the year 2009.
59,Y2010,float64,Data for the year 2010.
60,Y2011,float64,Data for the year 2011.
61,Y2012,int64,Data for the year 2012.


`FAOSTAT Dataset`


In [496]:
# Create a list of descriptions
description_mapping=[
    "Numeric Code for Domain",
    "Population Type",
    "Numeric code for the country or region",
    "Full name of the country or region",
    "Numeric code for the element",
    "Element Type",
    "Numeric code for the item",
    "Object of focus",
    "Code for the Year",
    "Time of the Year",
    "Quantity of item",
    "Value of the item",
    "Flag",
    "Sources of item",
    "Note for item",
]

In [497]:
# Initialize a counter
j = 0

# Empty list to store the data for the DataFrame
data = []

for i in df_faostat.columns:
  data.append({'Column Name': i, 'Data Type': df_faostat[i].dtype, 'Data Description': description_mapping[j]})
  j += 1

In [498]:
# Create the DataFrame from the list of dictionaries
faostat_data_dict = pd.DataFrame(data)

# Display the data dictionary
faostat_data_dict

Unnamed: 0,Column Name,Data Type,Data Description
0,Domain Code,object,Numeric Code for Domain
1,Domain,object,Population Type
2,Area Code,int64,Numeric code for the country or region
3,Area,object,Full name of the country or region
4,Element Code,int64,Numeric code for the element
5,Element,object,Population Sexes
6,Item Code,int64,Numeric code for the item
7,Item,object,Name of the food item or product
8,Year Code,int64,Code for the Year
9,Year,int64,Time of the Year


#### Handling Missing Values


`FAO Dataset`


In [499]:
# Checking row null value per column
fao_null_values = {col: df_fao[col].isnull().sum() for col in df_fao.columns if df_fao[col].isnull().sum() > 0}

fao_null_values

{'Y1961': 3539,
 'Y1962': 3539,
 'Y1963': 3539,
 'Y1964': 3539,
 'Y1965': 3539,
 'Y1966': 3539,
 'Y1967': 3539,
 'Y1968': 3539,
 'Y1969': 3539,
 'Y1970': 3539,
 'Y1971': 3539,
 'Y1972': 3539,
 'Y1973': 3539,
 'Y1974': 3539,
 'Y1975': 3539,
 'Y1976': 3539,
 'Y1977': 3539,
 'Y1978': 3539,
 'Y1979': 3539,
 'Y1980': 3539,
 'Y1981': 3539,
 'Y1982': 3539,
 'Y1983': 3539,
 'Y1984': 3539,
 'Y1985': 3539,
 'Y1986': 3539,
 'Y1987': 3539,
 'Y1988': 3539,
 'Y1989': 3539,
 'Y1990': 3415,
 'Y1991': 3415,
 'Y1992': 987,
 'Y1993': 612,
 'Y1994': 612,
 'Y1995': 612,
 'Y1996': 612,
 'Y1997': 612,
 'Y1998': 612,
 'Y1999': 612,
 'Y2000': 349,
 'Y2001': 349,
 'Y2002': 349,
 'Y2003': 349,
 'Y2004': 349,
 'Y2005': 349,
 'Y2006': 104,
 'Y2007': 104,
 'Y2008': 104,
 'Y2009': 104,
 'Y2010': 104,
 'Y2011': 104}

In [500]:
df_fao[df_fao.isnull().any(axis=1)].head(1)

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,Y1967,Y1968,Y1969,Y1970,Y1971,Y1972,Y1973,Y1974,Y1975,Y1976,Y1977,Y1978,Y1979,Y1980,Y1981,Y1982,Y1983,Y1984,Y1985,Y1986,Y1987,Y1988,Y1989,Y1990,Y1991,Y1992,Y1993,Y1994,Y1995,Y1996,Y1997,Y1998,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
679,ARM,1,Armenia,2511,Wheat and products,5521,Feed,1000 tonnes,40.07,45.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14.0,21.0,24.0,25.0,10.0,32.0,58.0,55.0,78.0,20.0,20.0,42.0,69.0,59.0,46.0,67.0,57.0,56.0,61.0,65.0,92,93


In [501]:
print(f'Number of rows with null values: {len(df_fao[df_fao.isnull().any(axis=1)])}')

Number of rows with null values: 3539


`FAOSTAT Dataset`


In [502]:
faostat_null_values = {col: df_faostat[col].isnull().sum() for col in df_faostat.columns if df_faostat[col].isnull().sum() > 0}

faostat_null_values

{}

**Explanation & Justification:**

We handled missing data in the FAO and FAOSTAT dataset specifically for the years 1961 - 1991 by retaining the null values rather than replacing them with -1. Since the "y{YEAR_NUM}" columns are the only ones with null values, it is acceptable to leave these nulls as they are. Replacing these nulls with -1 could affect the summarization of statistics and affect the accuracy of visualizations.

Retaining null values preserves the integrity of our statistical summaries, such as mean and standard deviation calculations, which could be skewed by arbitrary replacement values like -1. By keeping nulls, we avoid misrepresenting missing data as zeroes, thus ensuring that our plots and analyses accurately reflect the data's true nature and maintain clarity in our visualizationons.


#### Removing & Checking Duplicates


`FAO Dataset`


In [503]:
# Duplicates in FAO
dupe_fao_sum= df_fao.duplicated().sum()

print(f'Total number of duplicated rows in FAO: {dupe_fao_sum}')

Total number of duplicated rows in FAO: 0


`FAOSTAT Dataset`


In [504]:
# Duplicates in FAOSTAT
dupe_faostat_sum = df_faostat.duplicated().sum()

print(f'Total number of duplicated rows in FAOSTAT: {dupe_faostat_sum}')

Total number of duplicated rows in FAOSTAT: 0


#### Standardize Column Names


`FAO Dataset`


In [505]:
df_fao.columns = (
    df_fao.columns
    .str.lower()                                 # Convert to lowercase
    .str.replace(' ', '_')                       # Replace spaces with underscores
    .str.replace(r'[^a-z0-9_]', '', regex=True)  # Remove special characters
)

`FAOSTAT Dataset`


In [506]:
df_faostat.columns = (
    df_faostat.columns
    .str.lower()                
    .str.replace(' ', '_')      
    .str.replace(r'[^a-z0-9_]', '', regex=True)  
)

#### Rename Similar Column Names


In [507]:
# Rename the columns
df_faostat.rename(columns={
    'item': 'population_item',
    'item_code': 'population_item_code',
    'element': 'population_element',
    'element_code': 'population_element_code',
    'unit': 'population_unit'
}, inplace=True)

df_faostat.columns

Index(['domain_code', 'domain', 'area_code', 'area', 'population_element_code',
       'population_element', 'population_item_code', 'population_item',
       'year_code', 'year', 'population_unit', 'value', 'flag',
       'flag_description', 'note'],
      dtype='object')

### **Data Merging**


In [508]:
df_fao.merge(
    df_faostat, 
    how='inner', 
    on=['area_code', 'area']
).sample(3)

Unnamed: 0,area_abbreviation,area_code,area,item_code,item,element_code,element,unit,latitude,longitude,y1961,y1962,y1963,y1964,y1965,y1966,y1967,y1968,y1969,y1970,y1971,y1972,y1973,y1974,y1975,y1976,y1977,y1978,y1979,y1980,y1981,y1982,y1983,y1984,y1985,y1986,y1987,y1988,y1989,y1990,y1991,y1992,y1993,y1994,y1995,y1996,y1997,y1998,y1999,y2000,y2001,y2002,y2003,y2004,y2005,y2006,y2007,y2008,y2009,y2010,y2011,y2012,y2013,domain_code,domain,population_element_code,population_element,population_item_code,population_item,year_code,year,population_unit,value,flag,flag_description,note
221,DZA,4,Algeria,2520,"Cereals, Other",5142,Food,1000 tonnes,28.03,1.66,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,13.0,2.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,15.0,0.0,29.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1,1,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,41318.142,X,International reliable sources,"UNDESA, Population Division – World Population..."
17039,SEN,195,Senegal,2781,"Fish, Body Oil",5142,Food,1000 tonnes,14.5,-14.45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,15850.567,X,International reliable sources,"UNDESA, Population Division – World Population..."
13828,NCL,153,New Caledonia,2513,Barley and products,5142,Food,1000 tonnes,-20.9,165.62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,276.255,X,International reliable sources,"UNDESA, Population Division – World Population..."


In [509]:
len_merged_df = len(
    df_fao.merge(
        df_faostat, 
        how='inner', 
        on=['area_code', 'area']
    )
)

print(f'Number of rows of FAO dataframe: {len(df_fao)}')
print(f'Number of rows of FAOSTAT dataframe: {len(df_faostat)}')
print(f'Number of rows of merged dataframe: {len_merged_df}')

Number of rows of FAO dataframe: 21477
Number of rows of FAOSTAT dataframe: 231
Number of rows of merged dataframe: 21230


**Explanation & Justification:**

> We merged the two dataframes on both 'area_code' and 'area' not only because these columns are present in both datasets, but also provide a more unique identifier for each row in the merged result.

> We chose an _inner join_ for this merge to avoid introducing null values, since there are combinations of 'area_code' and 'area' that do not exist in the `FAO` and `FAOSTAT` dataframes.

> An inner join allows us to retain only the rows where there is a match in both datasets based on the specified columns. This is crucial for ensuring data consistency and accuracy, as it filters out any records that do not have corresponding matches in both dataframes.

> Unlike other types of joins, such as left or right joins, which would include unmatched records from one side or the other, resulting in null values, the inner join provides a focused dataset with only the relevant records that exist in both sources. This approach not only enhances the accuracy of the match but also improves the uniqueness and reliability of each row in the final merged dataset.


#### Create New Dataframe


In [510]:
merged_df = df_fao.merge(
    df_faostat, 
    how='inner', 
    on=['area_code', 'area'])

merged_df.sample(3)

Unnamed: 0,area_abbreviation,area_code,area,item_code,item,element_code,element,unit,latitude,longitude,y1961,y1962,y1963,y1964,y1965,y1966,y1967,y1968,y1969,y1970,y1971,y1972,y1973,y1974,y1975,y1976,y1977,y1978,y1979,y1980,y1981,y1982,y1983,y1984,y1985,y1986,y1987,y1988,y1989,y1990,y1991,y1992,y1993,y1994,y1995,y1996,y1997,y1998,y1999,y2000,y2001,y2002,y2003,y2004,y2005,y2006,y2007,y2008,y2009,y2010,y2011,y2012,y2013,domain_code,domain,population_element_code,population_element,population_item_code,population_item,year_code,year,population_unit,value,flag,flag_description,note
19942,GBR,229,United Kingdom,2617,Apples and products,5142,Food,1000 tonnes,55.38,-3.44,529.0,772.0,703.0,829.0,721.0,592.0,579.0,637.0,688.0,799.0,740.0,597.0,844.0,714.0,723.0,740.0,628.0,721.0,771.0,728.0,685.0,793.0,883.0,871.0,905.0,914.0,950.0,964.0,1119.0,1031.0,993.0,1004.0,1103.0,1033.0,984.0,853.0,863.0,1008.0,1064.0,945.0,1112.0,1138.0,1341.0,1541.0,1794.0,1938.0,1852.0,1771.0,1719.0,1746.0,1720.0,1750,1705,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,66181.585,X,International reliable sources,"UNDESA, Population Division – World Population..."
16828,SAU,194,Saudi Arabia,2511,Wheat and products,5521,Feed,1000 tonnes,23.89,45.08,8.0,8.0,9.0,9.0,11.0,11.0,14.0,8.0,10.0,13.0,6.0,7.0,7.0,11.0,7.0,6.0,9.0,9.0,22.0,16.0,31.0,49.0,63.0,79.0,110.0,119.0,138.0,171.0,179.0,185.0,208.0,211.0,173.0,132.0,82.0,60.0,90.0,87.0,104.0,90.0,104.0,122.0,126.0,139.0,133.0,132.0,128.0,112.0,123.0,148.0,163.0,155,139,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,32938.213,X,International reliable sources,"UNDESA, Population Division – World Population..."
5047,CUB,49,Cuba,2517,Millet and products,5521,Feed,1000 tonnes,21.52,-77.78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,11484.636,X,International reliable sources,"UNDESA, Population Division – World Population..."


### **Feature Engineering**


##### Column: Years Existing


In [511]:
year_cols = [col for col in merged_df.columns if col.startswith('y') and col[1:].isdigit()]
merged_df['years_existing'] = merged_df[year_cols].notnull().sum(axis=1)

merged_df[['area_code', 'area', 'years_existing']].sample(3)

Unnamed: 0,area_code,area,years_existing
13649,149,Nepal,53
6302,60,El Salvador,53
7925,84,Greece,53


**Explanation & Justification:**

> We created a new column for _years existing_ to count the number of years for which valid (non-negative) data exists per record in the dataset.


##### Column: Average Production


In [512]:
merged_df['average_production'] = merged_df[year_cols].mean(axis=1)

merged_df[['area_code', 'area', 'years_existing', 'average_production']].sample(3)

Unnamed: 0,area_code,area,years_existing,average_production
16318,188,Saint Kitts and Nevis,53,0.792453
14712,221,Oman,24,0.0
14895,166,Panama,53,12.603774


**Explanation & Justification:**

> We created a new column for _average production_ to calculate the mean production over a specified range of years within the dataset. It summarizes production levels over time per record.


##### Column: Value per Capita


In [513]:
merged_df['population_unit'] = merged_df['population_unit'].str.extract('(\d+)').astype(int)

merged_df['population_unit'].unique()


array([1000])

In [514]:
merged_df[['population_unit', 'value']].sample(3)

Unnamed: 0,population_unit,value
4897,1000,24294.75
13439,1000,53370.609
12223,1000,436.33


In [515]:
merged_df['value_per_capita'] = merged_df['value'] / merged_df['population_unit']

merged_df[['value', 'population_unit', 'value_per_capita']].sample(3)

Unnamed: 0,value,population_unit,value_per_capita
18747,69037.513,1000,69.037513
10836,6045.117,1000,6.045117
18485,8476.005,1000,8.476005


**Explanation & Justification:**

> We created a new column for _value per capita_ by dividing the `value` by the `population_unit`.


#### Column: ISO Alpha-3 Country Code


In [516]:
# Create a function to get the iso_alpha_3 of a country using the PyCountry library

def get_iso_alpha3(country_name):
    try:
        return pycountry.countries.lookup(country_name).alpha_3
    except:
        return 'INVALID'
    
merged_df['iso_alpha3'] = merged_df['area'].apply(get_iso_alpha3)
merged_df[['area', 'iso_alpha3']].drop_duplicates().head(3)

Unnamed: 0,area,iso_alpha3
0,Afghanistan,AFG
83,Albania,ALB
206,Algeria,DZA


In [517]:
# Check if there are areas that has 'INVALID' iso_alpha3
merged_df[merged_df['iso_alpha3'] == 'INVALID'][['area', 'iso_alpha3']].drop_duplicates()

Unnamed: 0,area,iso_alpha3
2162,Bolivia (Plurinational State of),INVALID
3866,"China, Hong Kong SAR",INVALID
3999,"China, Macao SAR",INVALID
4120,"China, mainland",INVALID
4266,"China, Taiwan Province of",INVALID
9293,Iran (Islamic Republic of),INVALID
15658,Republic of Korea,INVALID
19275,Turkey,INVALID
20631,Venezuela (Bolivarian Republic of),INVALID


In [518]:
# Manually set the ISO alpha-3 of these countries 
manual_iso_mapping = {
    'Bolivia (Plurinational State of)': 'BOL',
    'China, Hong Kong SAR': 'HKG',
    'China, Macao SAR': 'MAC',
    'China, mainland': 'CHN',
    'China, Taiwan Province of': 'TWN',
    'Iran (Islamic Republic of)': 'IRN',
    'Republic of Korea': 'KOR',
    'Turkey': 'TUR',
    'Venezuela (Bolivarian Republic of)': 'VEN',
}

merged_df['iso_alpha3'] = merged_df['area'].map(manual_iso_mapping).fillna(merged_df['iso_alpha3'])

merged_df[merged_df['iso_alpha3'] == 'INVALID'][['area', 'iso_alpha3']].drop_duplicates()

Unnamed: 0,area,iso_alpha3


In [519]:
merged_df[['area', 'iso_alpha3']].drop_duplicates().sample(3)

Unnamed: 0,area,iso_alpha3
14334,Nigeria,NGA
20288,Uruguay,URY
10609,Kiribati,KIR


**Explanation & Justification:**

> We created a new column for _iso_alpha3_ using `pycountry_convert` library to assign continent for each country. In the case of some areas (like Timor-Leste), we manually assigned its continent. This column will be useful for future analysis based on continents.


#### Column: Continent


In [520]:
def country_to_continent(country_alpha3):
    try:
        country_alpha2 = pc.country_alpha3_to_country_alpha2(country_alpha3)
        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    except:
        return 'Invalid'
    
    return country_continent_name

merged_df['continent'] = merged_df['iso_alpha3'].apply(country_to_continent)
merged_df[['area', 'iso_alpha3', 'continent']].drop_duplicates().sample(3)

Unnamed: 0,area,iso_alpha3,continent
4788,Côte d'Ivoire,CIV,Africa
11051,Latvia,LVA,Europe
13329,Myanmar,MMR,Asia


In [521]:
merged_df[merged_df['continent'] == 'Invalid'][['area', 'iso_alpha3', 'continent']].drop_duplicates()

Unnamed: 0,area,iso_alpha3,continent
18831,Timor-Leste,TLS,Invalid


In [522]:
# Manually assign continent for Timor-Leste
merged_df.loc[merged_df['iso_alpha3'] == 'TLS', 'continent'] = 'Asia'

merged_df[merged_df['continent'] == 'Invalid'][['area', 'iso_alpha3', 'continent']].drop_duplicates()

Unnamed: 0,area,iso_alpha3,continent


**Explanation & Justification:**

> We created a new column for _continent_ using `pycountry` library to assign ISO alpha-3 for each country. In the case of some areas that were not recognized by the library, we manually assigned their respective ISO alpha-3 based on our research for those countries. This column will be essential when creating a choropleth maps.


### **Final Merged Dataframe**


In [523]:
merged_df.sample(5)

Unnamed: 0,area_abbreviation,area_code,area,item_code,item,element_code,element,unit,latitude,longitude,y1961,y1962,y1963,y1964,y1965,y1966,y1967,y1968,y1969,y1970,y1971,y1972,y1973,y1974,y1975,y1976,y1977,y1978,y1979,y1980,y1981,y1982,y1983,y1984,y1985,y1986,y1987,y1988,y1989,y1990,y1991,y1992,y1993,y1994,y1995,y1996,y1997,y1998,y1999,y2000,y2001,y2002,y2003,y2004,y2005,y2006,y2007,y2008,y2009,y2010,y2011,y2012,y2013,domain_code,domain,population_element_code,population_element,population_item_code,population_item,year_code,year,population_unit,value,flag,flag_description,note,years_existing,average_production,value_per_capita,iso_alpha3,continent
9473,IRQ,103,Iraq,2642,Cloves,5142,Food,1000 tonnes,33.22,43.68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000,38274.618,X,International reliable sources,"UNDESA, Population Division – World Population...",53,0.0,38.274618,IRQ,Asia
10981,LAO,120,Lao People's Democratic Republic,2601,Tomatoes and products,5142,Food,1000 tonnes,19.86,102.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000,6858.16,X,International reliable sources,"UNDESA, Population Division – World Population...",53,0.0,6.85816,LAO,Asia
11748,LUX,256,Luxembourg,2945,Offals,5142,Food,1000 tonnes,49.82,6.13,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2,2,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000,583.455,X,International reliable sources,"UNDESA, Population Division – World Population...",14,2.642857,0.583455,LUX,Europe
8117,GTM,89,Guatemala,2612,"Lemons, Limes and products",5142,Food,1000 tonnes,15.78,-90.23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65.0,70.0,75.0,80.0,85.0,90.0,100.0,109.0,115.0,118.0,118.0,119.0,122.0,122.0,122.0,123.0,123.0,128.0,130.0,126.0,127.0,129.0,141.0,97.0,100.0,120.0,97.0,100.0,102.0,113.0,110.0,111.0,114,113,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000,16913.503,X,International reliable sources,"UNDESA, Population Division – World Population...",53,70.075472,16.913503,GTM,North America
9466,IRQ,103,Iraq,2620,Grapes and products (excl wine),5142,Food,1000 tonnes,33.22,43.68,46.0,46.0,45.0,46.0,45.0,64.0,63.0,68.0,68.0,69.0,71.0,114.0,154.0,208.0,251.0,276.0,317.0,363.0,256.0,140.0,193.0,212.0,225.0,386.0,398.0,385.0,368.0,393.0,385.0,402.0,342.0,365.0,373.0,254.0,270.0,279.0,270.0,275.0,247.0,249.0,270.0,316.0,318.0,26.0,178.0,208.0,166.0,183.0,176.0,252.0,204.0,218,243,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000,38274.618,X,International reliable sources,"UNDESA, Population Division – World Population...",53,221.490566,38.274618,IRQ,Asia


In [526]:
# Check if there are rows with null values (excluding the year columns)
non_year_columns = merged_df.columns.difference(year_cols)
merged_df[merged_df[non_year_columns].isnull().any(axis=1)].head(5)


Unnamed: 0,area_abbreviation,area_code,area,item_code,item,element_code,element,unit,latitude,longitude,y1961,y1962,y1963,y1964,y1965,y1966,y1967,y1968,y1969,y1970,y1971,y1972,y1973,y1974,y1975,y1976,y1977,y1978,y1979,y1980,y1981,y1982,y1983,y1984,y1985,y1986,y1987,y1988,y1989,y1990,y1991,y1992,y1993,y1994,y1995,y1996,y1997,y1998,y1999,y2000,y2001,y2002,y2003,y2004,y2005,y2006,y2007,y2008,y2009,y2010,y2011,y2012,y2013,domain_code,domain,population_element_code,population_element,population_item_code,population_item,year_code,year,population_unit,value,flag,flag_description,note,years_existing,average_production,value_per_capita,iso_alpha3,continent


**Explanation & Justification:**

> We created a new column for _value per capita_ by dividing the `value` by the `population_unit`.


### **Export to CSV**


In [525]:
merged_df.to_csv('merged_df.csv', index=False)