# Assessment 1: Group 6


**Group Members:**

> Abrigo, Nathanael\
> Buhay, Kyle Andrei\
> Cruz, Kristel Lenci\
> Entrata, Joshua Kyle


In [66]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pycountry_convert as pc

import warnings

In [67]:
warnings.filterwarnings("ignore")

In [68]:
pd.set_option('display.max_columns', None)

## `Part 1: Data Wrangling`


### **Data Cleaning**


#### Loading Datasets


In [69]:
# Load FAO dataset
df_fao = pd.read_csv('FAO.csv', encoding='iso-8859-1')

# Load FAOSTAT dataset
df_faostat = pd.read_csv('FAOSTAT.csv')

#### Data Dictionary


`FAO Dataset`


In [70]:
df_fao.sample(3)

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,Y1967,Y1968,Y1969,Y1970,Y1971,Y1972,Y1973,Y1974,Y1975,Y1976,Y1977,Y1978,Y1979,Y1980,Y1981,Y1982,Y1983,Y1984,Y1985,Y1986,Y1987,Y1988,Y1989,Y1990,Y1991,Y1992,Y1993,Y1994,Y1995,Y1996,Y1997,Y1998,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
8179,GTM,89,Guatemala,2922,Stimulants,5142,Food,1000 tonnes,15.78,-90.23,12.0,13.0,14.0,15.0,17.0,15.0,17.0,17.0,18.0,17.0,18.0,19.0,21.0,22.0,22.0,23.0,25.0,27.0,29.0,45.0,50.0,56.0,47.0,46.0,32.0,19.0,16.0,15.0,19.0,18.0,14.0,20.0,16.0,9.0,5.0,34.0,30.0,44.0,30.0,28.0,23.0,25.0,22.0,57.0,46.0,48.0,54.0,40.0,32.0,34.0,34.0,74,67
7926,GRC,84,Greece,2922,Stimulants,5142,Food,1000 tonnes,39.07,21.82,10.0,12.0,13.0,14.0,15.0,15.0,17.0,17.0,17.0,18.0,18.0,21.0,21.0,20.0,25.0,24.0,18.0,18.0,28.0,30.0,31.0,33.0,34.0,38.0,39.0,33.0,41.0,45.0,49.0,51.0,33.0,33.0,27.0,36.0,35.0,58.0,61.0,54.0,60.0,64.0,58.0,73.0,83.0,81.0,81.0,84.0,95.0,96.0,95.0,95.0,99.0,95,103
4218,CHN,41,"China, mainland",2781,"Fish, Body Oil",5521,Feed,1000 tonnes,35.86,104.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0,0


In [71]:
# Create a list of descriptions
description_list = [
    'Abbreviation of Area',
    'Full name of the country or region',
    'ISO country code or region abbreviation',
    'Numeric code for the country or region',
    'Numeric code for the item',
    'Name of the food item or product',
    'Numeric code for the element',
    'Type of data recorded',
    'Measurement unit for the recorded data',
    'Latitude coordinate of the location',
    'Longitude coordinate of the location',
]

In [72]:
# Function to generate a data dictionary
def generate_data_dictionary(df):
    data_dict = {
        'Column Name': [],
        'Data Type': [],
        'Description': []
    }
    
    for i, column in enumerate(df.columns):
        data_dict['Column Name'].append(column)
        data_dict['Data Type'].append(df[column].dtype)
        # Provide a description based on the description list or a default value
        description = description_list[i] if i < len(description_list) else 'N/A'
        # If the column name starts with 'Y' and is a year, format accordingly
        if column.startswith('Y') and column[1:].isdigit():
            description = f'Data for the year {column[1:]}.'
        data_dict['Description'].append(description)
    
    return pd.DataFrame(data_dict)

In [73]:
# Generate data dictionary
fao_data_dict = generate_data_dictionary(df_fao)

# Display data dictionary
fao_data_dict

Unnamed: 0,Column Name,Data Type,Description
0,Area Abbreviation,object,Abbreviation of Area
1,Area Code,int64,Full name of the country or region
2,Area,object,ISO country code or region abbreviation
3,Item Code,int64,Numeric code for the country or region
4,Item,object,Numeric code for the item
...,...,...,...
58,Y2009,float64,Data for the year 2009.
59,Y2010,float64,Data for the year 2010.
60,Y2011,float64,Data for the year 2011.
61,Y2012,int64,Data for the year 2012.


`FAOSTAT Dataset`


In [74]:
# Create a list of descriptions
description_mapping=[
    'Numeric Code for Domain',
    'Population Type',
    'Numeric code for the country or region',
    'Full name of the country or region',
    'Numeric code for the element',
    'Population Sexes',
    'Numeric code for the item',
    'Name of the food item or product',
    'Code for the Year',
    'Time of the Year',
    'Quantity of item',
    'Value of the data',
    'Flag',
    'Sources of item',
    'Note for item',
]

In [75]:
# Initialize a counter
j = 0

# Empty list to store the data for the DataFrame
data = []

for i in df_faostat.columns:
  data.append({'Column Name': i, 'Data Type': df_faostat[i].dtype, 'Data Description': description_mapping[j]})
  j += 1

In [76]:
# Create the DataFrame from the list of dictionaries
faostat_data_dict = pd.DataFrame(data)

# Display the data dictionary
faostat_data_dict

Unnamed: 0,Column Name,Data Type,Data Description
0,Domain Code,object,Numeric Code for Domain
1,Domain,object,Population Type
2,Area Code,int64,Numeric code for the country or region
3,Area,object,Full name of the country or region
4,Element Code,int64,Numeric code for the element
5,Element,object,Population Sexes
6,Item Code,int64,Numeric code for the item
7,Item,object,Name of the food item or product
8,Year Code,int64,Code for the Year
9,Year,int64,Time of the Year


#### Handling Missing Values


`FAO Dataset`


In [77]:
# Checking row null value per column
fao_null_values = {col: df_fao[col].isnull().sum() for col in df_fao.columns if df_fao[col].isnull().sum() > 0}

fao_null_values

{'Y1961': 3539,
 'Y1962': 3539,
 'Y1963': 3539,
 'Y1964': 3539,
 'Y1965': 3539,
 'Y1966': 3539,
 'Y1967': 3539,
 'Y1968': 3539,
 'Y1969': 3539,
 'Y1970': 3539,
 'Y1971': 3539,
 'Y1972': 3539,
 'Y1973': 3539,
 'Y1974': 3539,
 'Y1975': 3539,
 'Y1976': 3539,
 'Y1977': 3539,
 'Y1978': 3539,
 'Y1979': 3539,
 'Y1980': 3539,
 'Y1981': 3539,
 'Y1982': 3539,
 'Y1983': 3539,
 'Y1984': 3539,
 'Y1985': 3539,
 'Y1986': 3539,
 'Y1987': 3539,
 'Y1988': 3539,
 'Y1989': 3539,
 'Y1990': 3415,
 'Y1991': 3415,
 'Y1992': 987,
 'Y1993': 612,
 'Y1994': 612,
 'Y1995': 612,
 'Y1996': 612,
 'Y1997': 612,
 'Y1998': 612,
 'Y1999': 612,
 'Y2000': 349,
 'Y2001': 349,
 'Y2002': 349,
 'Y2003': 349,
 'Y2004': 349,
 'Y2005': 349,
 'Y2006': 104,
 'Y2007': 104,
 'Y2008': 104,
 'Y2009': 104,
 'Y2010': 104,
 'Y2011': 104}

In [78]:
df_fao[df_fao.isnull().any(axis=1)].head(1)

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,Y1967,Y1968,Y1969,Y1970,Y1971,Y1972,Y1973,Y1974,Y1975,Y1976,Y1977,Y1978,Y1979,Y1980,Y1981,Y1982,Y1983,Y1984,Y1985,Y1986,Y1987,Y1988,Y1989,Y1990,Y1991,Y1992,Y1993,Y1994,Y1995,Y1996,Y1997,Y1998,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
679,ARM,1,Armenia,2511,Wheat and products,5521,Feed,1000 tonnes,40.07,45.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14.0,21.0,24.0,25.0,10.0,32.0,58.0,55.0,78.0,20.0,20.0,42.0,69.0,59.0,46.0,67.0,57.0,56.0,61.0,65.0,92,93


In [79]:
print(f'Number of rows with null values: {len(df_fao[df_fao.isnull().any(axis=1)])}')

Number of rows with null values: 3539


`FAOSTAT Dataset`


In [80]:
faostat_null_values = {col: df_faostat[col].isnull().sum() for col in df_faostat.columns if df_faostat[col].isnull().sum() > 0}

faostat_null_values

{}

**Explanation & Justification:**

We handled missing data in the FAO and FAOSTAT dataset specifically for the years 1961 - 1991 by retaining the null values rather than replacing them with -1. Since the "y{YEAR_NUM}" columns are the only ones with null values, it is acceptable to leave these nulls as they are. Replacing these nulls with -1 could affect the summarization of statistics and affect the accuracy of visualizations.

Retaining null values preserves the integrity of our statistical summaries, such as mean and standard deviation calculations, which could be skewed by arbitrary replacement values like -1. By keeping nulls, we avoid misrepresenting missing data as zeroes, thus ensuring that our plots and analyses accurately reflect the data's true nature and maintain clarity in our visualizationons.


#### Removing & Checking Duplicates


`FAO Dataset`


In [81]:
# Duplicates in FAO
dupe_fao_sum= df_fao.duplicated().sum()

print(f'Total number of duplicated rows in FAO: {dupe_fao_sum}')

Total number of duplicated rows in FAO: 0


`FAOSTAT Dataset`


In [82]:
# Duplicates in FAOSTAT
dupe_faostat_sum = df_faostat.duplicated().sum()

print(f'Total number of duplicated rows in FAOSTAT: {dupe_faostat_sum}')

Total number of duplicated rows in FAOSTAT: 0


#### Standardize Column Names


`FAO Dataset`


In [83]:
df_fao.columns = (
    df_fao.columns
    .str.lower()                                 # Convert to lowercase
    .str.replace(' ', '_')                       # Replace spaces with underscores
    .str.replace(r'[^a-z0-9_]', '', regex=True)  # Remove special characters
)

`FAOSTAT Dataset`


In [84]:
df_faostat.columns = (
    df_faostat.columns
    .str.lower()                
    .str.replace(' ', '_')      
    .str.replace(r'[^a-z0-9_]', '', regex=True)  
)

#### Rename Similar Column Names


In [85]:
# Rename the columns
df_faostat.rename(columns={
    'item': 'population_item',
    'item_code': 'population_item_code',
    'element': 'population_element',
    'element_code': 'population_element_code',
    'unit': 'population_unit'
}, inplace=True)

df_faostat.columns

Index(['domain_code', 'domain', 'area_code', 'area', 'population_element_code',
       'population_element', 'population_item_code', 'population_item',
       'year_code', 'year', 'population_unit', 'value', 'flag',
       'flag_description', 'note'],
      dtype='object')

### **Data Merging**


In [86]:
df_fao.merge(
    df_faostat, 
    how='inner', 
    on=['area_code', 'area']
).sample(3)

Unnamed: 0,area_abbreviation,area_code,area,item_code,item,element_code,element,unit,latitude,longitude,y1961,y1962,y1963,y1964,y1965,y1966,y1967,y1968,y1969,y1970,y1971,y1972,y1973,y1974,y1975,y1976,y1977,y1978,y1979,y1980,y1981,y1982,y1983,y1984,y1985,y1986,y1987,y1988,y1989,y1990,y1991,y1992,y1993,y1994,y1995,y1996,y1997,y1998,y1999,y2000,y2001,y2002,y2003,y2004,y2005,y2006,y2007,y2008,y2009,y2010,y2011,y2012,y2013,domain_code,domain,population_element_code,population_element,population_item_code,population_item,year_code,year,population_unit,value,flag,flag_description,note
19473,TKM,213,Turkmenistan,2912,Treenuts,5142,Food,1000 tonnes,38.97,59.56,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2,2,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,5758.075,X,International reliable sources,"UNDESA, Population Division – World Population..."
20115,TZA,215,United Republic of Tanzania,2766,Cephalopods,5142,Food,1000 tonnes,-6.37,34.89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,57310.019,X,International reliable sources,"UNDESA, Population Division – World Population..."
5262,CYP,50,Cyprus,2781,"Fish, Body Oil",5142,Food,1000 tonnes,35.13,33.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,1179.551,X,International reliable sources,"UNDESA, Population Division – World Population..."


In [87]:
len_merged_df = len(
    df_fao.merge(
        df_faostat, 
        how='inner', 
        on=['area_code', 'area']
    )
)

print(f'Number of rows of FAO dataframe: {len(df_fao)}')
print(f'Number of rows of FAOSTAT dataframe: {len(df_faostat)}')
print(f'Number of rows of merged dataframe: {len_merged_df}')

Number of rows of FAO dataframe: 21477
Number of rows of FAOSTAT dataframe: 231
Number of rows of merged dataframe: 21230


**Explanation & Justification:**

> We merged the two dataframes on both 'area_code' and 'area' not only because these columns are present in both datasets, but also provide a more unique identifier for each row in the merged result.

> We chose an _inner join_ for this merge to avoid introducing null values, since there are combinations of 'area_code' and 'area' that do not exist in the `FAO` and `FAOSTAT` dataframes.

> An inner join allows us to retain only the rows where there is a match in both datasets based on the specified columns. This is crucial for ensuring data consistency and accuracy, as it filters out any records that do not have corresponding matches in both dataframes.

> Unlike other types of joins, such as left or right joins, which would include unmatched records from one side or the other, resulting in null values, the inner join provides a focused dataset with only the relevant records that exist in both sources. This approach not only enhances the accuracy of the match but also improves the uniqueness and reliability of each row in the final merged dataset.


#### Create New Dataframe


In [88]:
merged_df = df_fao.merge(
    df_faostat, 
    how='inner', 
    on=['area_code', 'area'])

merged_df.sample(3)

Unnamed: 0,area_abbreviation,area_code,area,item_code,item,element_code,element,unit,latitude,longitude,y1961,y1962,y1963,y1964,y1965,y1966,y1967,y1968,y1969,y1970,y1971,y1972,y1973,y1974,y1975,y1976,y1977,y1978,y1979,y1980,y1981,y1982,y1983,y1984,y1985,y1986,y1987,y1988,y1989,y1990,y1991,y1992,y1993,y1994,y1995,y1996,y1997,y1998,y1999,y2000,y2001,y2002,y2003,y2004,y2005,y2006,y2007,y2008,y2009,y2010,y2011,y2012,y2013,domain_code,domain,population_element_code,population_element,population_item_code,population_item,year_code,year,population_unit,value,flag,flag_description,note
8783,HUN,97,Hungary,2531,Potatoes and products,5521,Feed,1000 tonnes,47.16,19.5,812.0,553.0,532.0,517.0,388.0,527.0,561.0,437.0,485.0,512.0,465.0,398.0,386.0,448.0,445.0,416.0,461.0,524.0,583.0,450.0,392.0,404.0,366.0,332.0,431.0,468.0,355.0,316.0,257.0,202.0,210.0,221.0,184.0,119.0,123.0,175.0,173.0,143.0,137.0,92.0,91.0,75.0,73.0,45.0,43.0,39.0,21.0,19.0,25.0,19.0,13.0,13,13,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,9721.559,X,International reliable sources,"UNDESA, Population Division – World Population..."
4473,COL,44,Colombia,2625,"Fruits, Other",5142,Food,1000 tonnes,4.57,-74.3,79.0,81.0,83.0,86.0,87.0,90.0,92.0,94.0,96.0,98.0,101.0,105.0,107.0,110.0,116.0,110.0,124.0,134.0,139.0,147.0,139.0,144.0,142.0,124.0,147.0,154.0,170.0,223.0,469.0,352.0,562.0,950.0,714.0,727.0,853.0,948.0,1091.0,1060.0,1234.0,1340.0,1365.0,1451.0,1555.0,1562.0,1631.0,1840.0,1905.0,1835.0,1901.0,1822.0,1830.0,2000,2164,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,49065.615,X,International reliable sources,"UNDESA, Population Division – World Population..."
20273,USA,231,United States of America,2922,Stimulants,5142,Food,1000 tonnes,37.09,-95.71,1715.0,1749.0,1756.0,1770.0,1778.0,1770.0,1812.0,1856.0,1785.0,1754.0,1723.0,1845.0,1836.0,1719.0,1643.0,1756.0,1406.0,1555.0,1621.0,1539.0,1545.0,1536.0,1525.0,1685.0,1724.0,1733.0,1743.0,1708.0,1813.0,1904.0,1851.0,1928.0,1927.0,1715.0,1717.0,1855.0,1786.0,1966.0,2048.0,2115.0,1990.0,1904.0,2143.0,2179.0,2378.0,2430.0,2226.0,2194.0,2248.0,2260.0,2345.0,2263,2378,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,324459.463,X,International reliable sources,"UNDESA, Population Division – World Population..."


### **Feature Engineering**


##### Column: Years Existing


In [89]:
year_cols = [col for col in merged_df.columns if col.startswith('y') and col[1:].isdigit()]
merged_df['years_existing'] = merged_df[year_cols].notnull().sum(axis=1)

merged_df[['area_code', 'area', 'years_existing']].sample(3)

Unnamed: 0,area_code,area,years_existing
8921,99,Iceland,53
5418,167,Czechia,21
11416,123,Liberia,53


**Explanation & Justification:**

> We created a new column for _years existing_ to count the number of years for which valid (non-negative) data exists per record in the dataset.


##### Column: Average Production


In [90]:
merged_df['average_production'] = merged_df[year_cols].mean(axis=1)

merged_df[['area_code', 'area', 'years_existing', 'average_production']].sample(3)

Unnamed: 0,area_code,area,years_existing,average_production
13844,153,New Caledonia,53,2.773585
19529,226,Uganda,53,1.849057
17780,202,South Africa,53,7.433962


**Explanation & Justification:**

> We created a new column for _average production_ to calculate the mean production over a specified range of years within the dataset. It summarizes production levels over time per record.


##### Column: Value per Capita


In [91]:
merged_df['population_unit'] = merged_df['population_unit'].str.extract('(\d+)').astype(int)

merged_df['population_unit'].unique()


array([1000])

In [92]:
merged_df[['population_unit', 'value']].sample(3)

Unnamed: 0,population_unit,value
20720,1000,31977.065
12061,1000,31624.264
9564,1000,4761.657


In [93]:
merged_df['value_per_capita'] = merged_df['value'] / merged_df['population_unit']

merged_df[['value', 'population_unit', 'value_per_capita']].sample(3)

Unnamed: 0,value,population_unit,value_per_capita
2505,2291.661,1000,2.291661
10279,9702.353,1000,9.702353
19417,5758.075,1000,5.758075


**Explanation & Justification:**

> We created a new column for _value per capita_ by dividing the `value` by the `population_unit`.


#### Column: Continent


In [94]:
def country_to_continent(country_name):
    try:
        country_alpha2 = pc.country_name_to_country_alpha2(country_name)
        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    except Exception as e:
        return 'Invalid'
    
    return country_continent_name

merged_df['continent'] = merged_df['area'].apply(country_to_continent)
merged_df[['area', 'continent']].drop_duplicates()

Unnamed: 0,area,continent
0,Afghanistan,Asia
83,Albania,Europe
206,Algeria,Africa
330,Angola,Africa
439,Antigua and Barbuda,North America
...,...,...
20631,Venezuela (Bolivarian Republic of),Invalid
20761,Viet Nam,Asia
20870,Yemen,Asia
20989,Zambia,Africa


In [95]:
merged_df[merged_df['continent'] == 'Invalid'][['area', 'continent']].drop_duplicates()

Unnamed: 0,area,continent
2162,Bolivia (Plurinational State of),Invalid
3866,"China, Hong Kong SAR",Invalid
3999,"China, Macao SAR",Invalid
4120,"China, mainland",Invalid
4266,"China, Taiwan Province of",Invalid
9293,Iran (Islamic Republic of),Invalid
15658,Republic of Korea,Invalid
18831,Timor-Leste,Invalid
20631,Venezuela (Bolivarian Republic of),Invalid


In [96]:
invalid_continent_map = {
    'Bolivia (Plurinational State of)': 'South America',
    'China, Hong Kong SAR': 'Asia',
    'China, Macao SAR': 'Asia',
    'China, mainland': 'Asia',
    'China, Taiwan Province of': 'Asia',
    'Iran (Islamic Republic of)': 'Asia',
    'Republic of Korea': 'Asia',
    'Timor-Leste': 'Asia',
    'Venezuela (Bolivarian Republic of)': 'South America'
}

merged_df.loc[merged_df['area'].isin(invalid_continent_map.keys()), 'continent'] = merged_df['area'].map(invalid_continent_map)

merged_df[merged_df['area'].isin(invalid_continent_map.keys())][['area', 'continent']].drop_duplicates()

Unnamed: 0,area,continent
2162,Bolivia (Plurinational State of),South America
3866,"China, Hong Kong SAR",Asia
3999,"China, Macao SAR",Asia
4120,"China, mainland",Asia
4266,"China, Taiwan Province of",Asia
9293,Iran (Islamic Republic of),Asia
15658,Republic of Korea,Asia
18831,Timor-Leste,Asia
20631,Venezuela (Bolivarian Republic of),South America


In [97]:
merged_df[merged_df['continent'] == 'Invalid'][['area', 'continent']].drop_duplicates()

Unnamed: 0,area,continent


### **Final Merged Dataframe**


In [98]:
merged_df.sample(5)

Unnamed: 0,area_abbreviation,area_code,area,item_code,item,element_code,element,unit,latitude,longitude,y1961,y1962,y1963,y1964,y1965,y1966,y1967,y1968,y1969,y1970,y1971,y1972,y1973,y1974,y1975,y1976,y1977,y1978,y1979,y1980,y1981,y1982,y1983,y1984,y1985,y1986,y1987,y1988,y1989,y1990,y1991,y1992,y1993,y1994,y1995,y1996,y1997,y1998,y1999,y2000,y2001,y2002,y2003,y2004,y2005,y2006,y2007,y2008,y2009,y2010,y2011,y2012,y2013,domain_code,domain,population_element_code,population_element,population_item_code,population_item,year_code,year,population_unit,value,flag,flag_description,note,years_existing,average_production,value_per_capita,continent
9561,IRL,104,Ireland,2547,Peas,5521,Feed,1000 tonnes,53.41,-8.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,6.0,20.0,15.0,30.0,20.0,35.0,22.0,20.0,15.0,17.0,50.0,14.0,17.0,37.0,26.0,14.0,14.0,10.0,15.0,10.0,1.0,1.0,1.0,1.0,1.0,1,1,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000,4761.657,X,International reliable sources,"UNDESA, Population Division – World Population...",53,7.90566,4.761657,Europe
20704,VEN,236,Venezuela (Bolivarian Republic of),2656,Beer,5142,Food,1000 tonnes,6.42,-66.59,241.0,248.0,249.0,263.0,278.0,297.0,332.0,356.0,394.0,496.0,437.0,450.0,460.0,458.0,426.0,700.0,800.0,902.0,1001.0,1107.0,1200.0,1110.0,1168.0,1184.0,935.0,1071.0,1261.0,1461.0,1105.0,1251.0,1379.0,1616.0,1618.0,1522.0,1535.0,1494.0,1727.0,1910.0,1726.0,1855.0,2044.0,1702.0,1735.0,1989.0,2187.0,2388.0,2509.0,2441.0,2307.0,2199.0,1966.0,2099,1982,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000,31977.065,X,International reliable sources,"UNDESA, Population Division – World Population...",53,1237.188679,31.977065,South America
19046,TTO,220,Trinidad and Tobago,2532,Cassava and products,5521,Feed,1000 tonnes,10.69,-61.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000,1369.125,X,International reliable sources,"UNDESA, Population Division – World Population...",53,0.0,1.369125,North America
19677,UKR,230,Ukraine,2617,Apples and products,5521,Feed,1000 tonnes,48.38,31.17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,44.0,109.0,72.0,71.0,65.0,112.0,70.0,29.0,47.0,33.0,40.0,45.0,40.0,38.0,30.0,37.0,35.0,40.0,45.0,90.0,100,163,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000,44222.947,X,International reliable sources,"UNDESA, Population Division – World Population...",22,61.590909,44.222947,Europe
18601,TJK,208,Tajikistan,2513,Barley and products,5521,Feed,1000 tonnes,38.86,71.28,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,101.0,16.0,4.0,7.0,1.0,5.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,10.0,21.0,33.0,25.0,63.0,59.0,65.0,66,74,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000,8921.343,X,International reliable sources,"UNDESA, Population Division – World Population...",22,25.409091,8.921343,Asia


In [99]:
# Check if there are rows with null values (excluding the year columns)

non_year_columns = merged_df.columns.difference(year_cols)

merged_df[merged_df[non_year_columns].isnull().any(axis=1)].head(5)


Unnamed: 0,area_abbreviation,area_code,area,item_code,item,element_code,element,unit,latitude,longitude,y1961,y1962,y1963,y1964,y1965,y1966,y1967,y1968,y1969,y1970,y1971,y1972,y1973,y1974,y1975,y1976,y1977,y1978,y1979,y1980,y1981,y1982,y1983,y1984,y1985,y1986,y1987,y1988,y1989,y1990,y1991,y1992,y1993,y1994,y1995,y1996,y1997,y1998,y1999,y2000,y2001,y2002,y2003,y2004,y2005,y2006,y2007,y2008,y2009,y2010,y2011,y2012,y2013,domain_code,domain,population_element_code,population_element,population_item_code,population_item,year_code,year,population_unit,value,flag,flag_description,note,years_existing,average_production,value_per_capita,continent


### **Export to CSV**


In [100]:
merged_df.to_csv('merged_df.csv', index=False)