# Assessment 1: Group 6


**Group Members:**

> Abrigo, Nathanael\
> Buhay, Kyle Andrei\
> Cruz, Kristel Lenci\
> Entrata, Joshua Kyle


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import warnings

In [2]:
warnings.filterwarnings("ignore")

In [3]:
pd.set_option('display.max_columns', None)

## `Part 1: Data Wrangling`


### **Data Cleaning**


#### Loading Datasets


In [4]:
# Load FAO dataset
df_fao = pd.read_csv('FAO.csv', encoding='iso-8859-1')

# Load FAOSTAT dataset
df_faostat = pd.read_csv('FAOSTAT.csv')

#### Data Dictionary


`FAO Dataset`


In [5]:
df_fao.sample(3)

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,Y1967,Y1968,Y1969,Y1970,Y1971,Y1972,Y1973,Y1974,Y1975,Y1976,Y1977,Y1978,Y1979,Y1980,Y1981,Y1982,Y1983,Y1984,Y1985,Y1986,Y1987,Y1988,Y1989,Y1990,Y1991,Y1992,Y1993,Y1994,Y1995,Y1996,Y1997,Y1998,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
9965,JAM,109,Jamaica,2532,Cassava and products,5142,Food,1000 tonnes,18.11,-77.3,3.0,3.0,3.0,3.0,3.0,4.0,5.0,3.0,4.0,7.0,6.0,7.0,5.0,5.0,7.0,7.0,10.0,11.0,10.0,8.0,8.0,6.0,6.0,7.0,6.0,6.0,6.0,5.0,4.0,4.0,4.0,5.0,7.0,7.0,6.0,7.0,5.0,5.0,6.0,5.0,5.0,4.0,6.0,6.0,5.0,6.0,6.0,5.0,5.0,6.0,7.0,6,6
8870,HUN,97,Hungary,2764,"Marine Fish, Other",5142,Food,1000 tonnes,47.16,19.5,0.0,0.0,3.0,1.0,2.0,1.0,3.0,1.0,3.0,6.0,4.0,4.0,4.0,6.0,6.0,7.0,4.0,3.0,3.0,6.0,5.0,7.0,5.0,6.0,5.0,7.0,4.0,8.0,9.0,8.0,5.0,9.0,10.0,13.0,10.0,9.0,12.0,11.0,8.0,8.0,11.0,11.0,12.0,3.0,3.0,4.0,4.0,4.0,3.0,3.0,3.0,2,2
20240,GBR,229,United Kingdom,2909,Sugar & Sweeteners,5142,Food,1000 tonnes,55.38,-3.44,2743.0,2389.0,2671.0,2612.0,2814.0,2764.0,2833.0,2700.0,2712.0,2752.0,2736.0,2498.0,2857.0,2356.0,2596.0,2604.0,2694.0,2346.0,2426.0,2443.0,2417.0,2639.0,2407.0,2413.0,2264.0,2322.0,2348.0,2376.0,2401.0,2387.0,2362.0,2320.0,2096.0,2292.0,2207.0,2309.0,2300.0,2217.0,2349.0,2211.0,2400.0,2453.0,2418.0,2500.0,2277.0,2009.0,2212.0,2245.0,2401.0,2280.0,2513.0,2436,2616


In [6]:
# Create a list of descriptions
description_list = [
    'Abbreviation of Area',
    'Full name of the country or region',
    'ISO country code or region abbreviation',
    'Numeric code for the country or region',
    'Numeric code for the item',
    'Name of the food item or product',
    'Numeric code for the element',
    'Type of data recorded',
    'Measurement unit for the recorded data',
    'Latitude coordinate of the location',
    'Longitude coordinate of the location',
]

In [7]:
# Function to generate a data dictionary
def generate_data_dictionary(df):
    data_dict = {
        'Column Name': [],
        'Data Type': [],
        'Description': []
    }
    
    for i, column in enumerate(df.columns):
        data_dict['Column Name'].append(column)
        data_dict['Data Type'].append(df[column].dtype)
        # Provide a description based on the description list or a default value
        description = description_list[i] if i < len(description_list) else 'N/A'
        # If the column name starts with 'Y' and is a year, format accordingly
        if column.startswith('Y') and column[1:].isdigit():
            description = f'Data for the year {column[1:]}.'
        data_dict['Description'].append(description)
    
    return pd.DataFrame(data_dict)

In [8]:
# Generate data dictionary
fao_data_dict = generate_data_dictionary(df_fao)

# Display data dictionary
fao_data_dict

Unnamed: 0,Column Name,Data Type,Description
0,Area Abbreviation,object,Abbreviation of Area
1,Area Code,int64,Full name of the country or region
2,Area,object,ISO country code or region abbreviation
3,Item Code,int64,Numeric code for the country or region
4,Item,object,Numeric code for the item
...,...,...,...
58,Y2009,float64,Data for the year 2009.
59,Y2010,float64,Data for the year 2010.
60,Y2011,float64,Data for the year 2011.
61,Y2012,int64,Data for the year 2012.


`FAOSTAT Dataset`


In [9]:
# Create a list of descriptions
description_mapping=[
    'Numeric Code for Domain',
    'Population Type',
    'Numeric code for the country or region',
    'Full name of the country or region',
    'Numeric code for the element',
    'Population Sexes',
    'Numeric code for the item',
    'Name of the food item or product',
    'Code for the Year',
    'Time of the Year',
    'Quantity of item',
    'Value of the data',
    'Flag',
    'Sources of item',
    'Note for item',
]

In [10]:
# Initialize a counter
j = 0

# Empty list to store the data for the DataFrame
data = []

for i in df_faostat.columns:
  data.append({'Column Name': i, 'Data Type': df_faostat[i].dtype, 'Data Description': description_mapping[j]})
  j += 1

In [11]:
# Create the DataFrame from the list of dictionaries
faostat_data_dict = pd.DataFrame(data)

# Display the data dictionary
faostat_data_dict

Unnamed: 0,Column Name,Data Type,Data Description
0,Domain Code,object,Numeric Code for Domain
1,Domain,object,Population Type
2,Area Code,int64,Numeric code for the country or region
3,Area,object,Full name of the country or region
4,Element Code,int64,Numeric code for the element
5,Element,object,Population Sexes
6,Item Code,int64,Numeric code for the item
7,Item,object,Name of the food item or product
8,Year Code,int64,Code for the Year
9,Year,int64,Time of the Year


#### Handling Missing Values


`FAO Dataset`


In [12]:
# Checking row null value per column
fao_null_values = {col: df_fao[col].isnull().sum() for col in df_fao.columns if df_fao[col].isnull().sum() > 0}

fao_null_values

{'Y1961': 3539,
 'Y1962': 3539,
 'Y1963': 3539,
 'Y1964': 3539,
 'Y1965': 3539,
 'Y1966': 3539,
 'Y1967': 3539,
 'Y1968': 3539,
 'Y1969': 3539,
 'Y1970': 3539,
 'Y1971': 3539,
 'Y1972': 3539,
 'Y1973': 3539,
 'Y1974': 3539,
 'Y1975': 3539,
 'Y1976': 3539,
 'Y1977': 3539,
 'Y1978': 3539,
 'Y1979': 3539,
 'Y1980': 3539,
 'Y1981': 3539,
 'Y1982': 3539,
 'Y1983': 3539,
 'Y1984': 3539,
 'Y1985': 3539,
 'Y1986': 3539,
 'Y1987': 3539,
 'Y1988': 3539,
 'Y1989': 3539,
 'Y1990': 3415,
 'Y1991': 3415,
 'Y1992': 987,
 'Y1993': 612,
 'Y1994': 612,
 'Y1995': 612,
 'Y1996': 612,
 'Y1997': 612,
 'Y1998': 612,
 'Y1999': 612,
 'Y2000': 349,
 'Y2001': 349,
 'Y2002': 349,
 'Y2003': 349,
 'Y2004': 349,
 'Y2005': 349,
 'Y2006': 104,
 'Y2007': 104,
 'Y2008': 104,
 'Y2009': 104,
 'Y2010': 104,
 'Y2011': 104}

In [13]:
df_fao[df_fao.isnull().any(axis=1)].head(1)

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,Y1967,Y1968,Y1969,Y1970,Y1971,Y1972,Y1973,Y1974,Y1975,Y1976,Y1977,Y1978,Y1979,Y1980,Y1981,Y1982,Y1983,Y1984,Y1985,Y1986,Y1987,Y1988,Y1989,Y1990,Y1991,Y1992,Y1993,Y1994,Y1995,Y1996,Y1997,Y1998,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
679,ARM,1,Armenia,2511,Wheat and products,5521,Feed,1000 tonnes,40.07,45.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14.0,21.0,24.0,25.0,10.0,32.0,58.0,55.0,78.0,20.0,20.0,42.0,69.0,59.0,46.0,67.0,57.0,56.0,61.0,65.0,92,93


In [14]:
print(f'Number of rows with null values: {len(df_fao[df_fao.isnull().any(axis=1)])}')

Number of rows with null values: 3539


`FAOSTAT Dataset`


In [15]:
faostat_null_values = {col: df_faostat[col].isnull().sum() for col in df_faostat.columns if df_faostat[col].isnull().sum() > 0}

faostat_null_values

{}

**Explanation & Justification:**

We handled missing data in the FAO and FAOSTAT dataset specifically for the years 1961 - 1991 by retaining the null values rather than replacing them with -1. Since the "y{YEAR_NUM}" columns are the only ones with null values, it is acceptable to leave these nulls as they are. Replacing these nulls with -1 could affect the summarization of statistics and affect the accuracy of visualizations.

Retaining null values preserves the integrity of our statistical summaries, such as mean and standard deviation calculations, which could be skewed by arbitrary replacement values like -1. By keeping nulls, we avoid misrepresenting missing data as zeroes, thus ensuring that our plots and analyses accurately reflect the data's true nature and maintain clarity in our visualizationons.


#### Removing & Checking Duplicates


`FAO Dataset`


In [16]:
# Duplicates in FAO
dupe_fao_sum= df_fao.duplicated().sum()

print(f'Total number of duplicated rows in FAO: {dupe_fao_sum}')

Total number of duplicated rows in FAO: 0


`FAOSTAT Dataset`


In [17]:
# Duplicates in FAOSTAT
dupe_faostat_sum = df_faostat.duplicated().sum()

print(f'Total number of duplicated rows in FAOSTAT: {dupe_faostat_sum}')

Total number of duplicated rows in FAOSTAT: 0


#### Standardize Column Names


`FAO Dataset`


In [18]:
df_fao.columns = (
    df_fao.columns
    .str.lower()                                 # Convert to lowercase
    .str.replace(' ', '_')                       # Replace spaces with underscores
    .str.replace(r'[^a-z0-9_]', '', regex=True)  # Remove special characters
)

`FAOSTAT Dataset`


In [19]:
df_faostat.columns = (
    df_faostat.columns
    .str.lower()                
    .str.replace(' ', '_')      
    .str.replace(r'[^a-z0-9_]', '', regex=True)  
)

#### Rename Similar Column Names


In [20]:
# Rename the columns
df_faostat.rename(columns={
    'item': 'population_item',
    'item_code': 'population_item_code',
    'element': 'population_element',
    'element_code': 'population_element_code',
    'unit': 'population_unit'
}, inplace=True)

df_faostat.columns

Index(['domain_code', 'domain', 'area_code', 'area', 'population_element_code',
       'population_element', 'population_item_code', 'population_item',
       'year_code', 'year', 'population_unit', 'value', 'flag',
       'flag_description', 'note'],
      dtype='object')

### **Data Merging**


In [21]:
df_fao.merge(
    df_faostat, 
    how='inner', 
    on=['area_code', 'area']
).sample(3)

Unnamed: 0,area_abbreviation,area_code,area,item_code,item,element_code,element,unit,latitude,longitude,y1961,y1962,y1963,y1964,y1965,y1966,y1967,y1968,y1969,y1970,y1971,y1972,y1973,y1974,y1975,y1976,y1977,y1978,y1979,y1980,y1981,y1982,y1983,y1984,y1985,y1986,y1987,y1988,y1989,y1990,y1991,y1992,y1993,y1994,y1995,y1996,y1997,y1998,y1999,y2000,y2001,y2002,y2003,y2004,y2005,y2006,y2007,y2008,y2009,y2010,y2011,y2012,y2013,domain_code,domain,population_element_code,population_element,population_item_code,population_item,year_code,year,population_unit,value,flag,flag_description,note
5212,CYP,50,Cyprus,2560,Coconuts - Incl Copra,5142,Food,1000 tonnes,35.13,33.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1,1,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,1179.551,X,International reliable sources,"UNDESA, Population Division – World Population..."
19745,UKR,230,Ukraine,2946,Animal fats,5142,Food,1000 tonnes,48.38,31.17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,411.0,386.0,270.0,275.0,181.0,296.0,262.0,245.0,246.0,236.0,268.0,263.0,220.0,220.0,214.0,218.0,203.0,218.0,213.0,209.0,227,231,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,44222.947,X,International reliable sources,"UNDESA, Population Division – World Population..."
7715,GHA,81,Ghana,2574,Rape and Mustard Oil,5142,Food,1000 tonnes,7.95,-1.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,1.0,2.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,28833.629,X,International reliable sources,"UNDESA, Population Division – World Population..."


In [22]:
len_merged_df = len(
    df_fao.merge(
        df_faostat, 
        how='inner', 
        on=['area_code', 'area']
    )
)

print(f'Number of rows of FAO dataframe: {len(df_fao)}')
print(f'Number of rows of FAOSTAT dataframe: {len(df_faostat)}')
print(f'Number of rows of merged dataframe: {len_merged_df}')

Number of rows of FAO dataframe: 21477
Number of rows of FAOSTAT dataframe: 231
Number of rows of merged dataframe: 21230


**Explanation & Justification:**

> We merged the two dataframes on both 'area_code' and 'area' not only because these columns are present in both datasets, but also provide a more unique identifier for each row in the merged result.

> We chose an _inner join_ for this merge to avoid introducing null values, since there are combinations of 'area_code' and 'area' that do not exist in the `FAO` and `FAOSTAT` dataframes.

> An inner join allows us to retain only the rows where there is a match in both datasets based on the specified columns. This is crucial for ensuring data consistency and accuracy, as it filters out any records that do not have corresponding matches in both dataframes.

> Unlike other types of joins, such as left or right joins, which would include unmatched records from one side or the other, resulting in null values, the inner join provides a focused dataset with only the relevant records that exist in both sources. This approach not only enhances the accuracy of the match but also improves the uniqueness and reliability of each row in the final merged dataset.


#### Create New Dataframe


In [23]:
merged_df = df_fao.merge(
    df_faostat, 
    how='inner', 
    on=['area_code', 'area'])

merged_df.sample(3)

Unnamed: 0,area_abbreviation,area_code,area,item_code,item,element_code,element,unit,latitude,longitude,y1961,y1962,y1963,y1964,y1965,y1966,y1967,y1968,y1969,y1970,y1971,y1972,y1973,y1974,y1975,y1976,y1977,y1978,y1979,y1980,y1981,y1982,y1983,y1984,y1985,y1986,y1987,y1988,y1989,y1990,y1991,y1992,y1993,y1994,y1995,y1996,y1997,y1998,y1999,y2000,y2001,y2002,y2003,y2004,y2005,y2006,y2007,y2008,y2009,y2010,y2011,y2012,y2013,domain_code,domain,population_element_code,population_element,population_item_code,population_item,year_code,year,population_unit,value,flag,flag_description,note
16672,WSM,244,Samoa,2615,Bananas,5142,Food,1000 tonnes,-13.76,-172.1,5.0,4.0,4.0,5.0,6.0,8.0,11.0,11.0,10.0,11.0,11.0,12.0,13.0,13.0,14.0,14.0,13.0,13.0,12.0,11.0,13.0,12.0,13.0,14.0,14.0,15.0,15.0,15.0,8.0,9.0,6.0,6.0,6.0,6.0,8.0,9.0,10.0,12.0,13.0,13.0,13.0,14.0,12.0,11.0,12.0,14.0,15.0,15.0,16.0,17.0,16.0,15,15,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,196.44,X,International reliable sources,"UNDESA, Population Division – World Population..."
3928,CHN,96,"China, Hong Kong SAR",2618,Pineapples and products,5142,Food,1000 tonnes,22.4,114.11,0.0,0.0,0.0,0.0,0.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0,2.0,1.0,2.0,1.0,1.0,1.0,2.0,4.0,4.0,8.0,8.0,5.0,12.0,10.0,12.0,15.0,11.0,13.0,12.0,11.0,10.0,10.0,12.0,11.0,12.0,13.0,14.0,15.0,16.0,14.0,12.0,14.0,11,11,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,7364.883,X,International reliable sources,"UNDESA, Population Division – World Population..."
10689,KIR,83,Kiribati,2918,Vegetables,5142,Food,1000 tonnes,-3.37,-168.73,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,6.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,7.0,6.0,6.0,6.0,6.0,5.0,5,6,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000 persons,116.398,X,International reliable sources,"UNDESA, Population Division – World Population..."


### **Feature Engineering**


##### Column: Years Existing


In [24]:
year_cols = [col for col in merged_df.columns if col.startswith('y') and col[1:].isdigit()]
merged_df['years_existing'] = merged_df[year_cols].notnull().sum(axis=1)

merged_df[['area_code', 'area', 'years_existing']].sample(3)

Unnamed: 0,area_code,area,years_existing
6178,59,Egypt,53
5611,54,Denmark,53
13967,156,New Zealand,53


**Explanation & Justification:**

> We created a new column for _years existing_ to count the number of years for which valid (non-negative) data exists per record in the dataset.


##### Column: Average Production


In [25]:
merged_df['average_production'] = merged_df[year_cols].mean(axis=1)

merged_df[['area_code', 'area', 'years_existing', 'average_production']].sample(3)

Unnamed: 0,area_code,area,years_existing,average_production
4928,98,Croatia,22,0.0
7995,86,Grenada,53,0.018868
8670,95,Honduras,53,0.0


**Explanation & Justification:**

> We created a new column for _average production_ to calculate the mean production over a specified range of years within the dataset. It summarizes production levels over time per record.


##### Column: Value per Capita


In [26]:
merged_df['population_unit'] = merged_df['population_unit'].str.extract('(\d+)').astype(int)

merged_df['population_unit'].unique()


array([1000])

In [27]:
merged_df[['population_unit', 'value']].sample(3)

Unnamed: 0,population_unit,value
12238,1000,436.33
4924,1000,4189.353
1870,1000,374.681


In [28]:
merged_df['value_per_capita'] = merged_df['value'] / merged_df['population_unit']

merged_df[['value', 'population_unit', 'value_per_capita']].sample(3)

Unnamed: 0,value,population_unit,value_per_capita
7995,107.825,1000,0.107825
8540,10981.229,1000,10.981229
7616,82114.224,1000,82.114224


**Explanation & Justification:**

> We created a new column for _value per capita_ by dividing the `value` by the `population_unit`.


### **Final Merged Dataframe**


In [29]:
merged_df.sample(5)

Unnamed: 0,area_abbreviation,area_code,area,item_code,item,element_code,element,unit,latitude,longitude,y1961,y1962,y1963,y1964,y1965,y1966,y1967,y1968,y1969,y1970,y1971,y1972,y1973,y1974,y1975,y1976,y1977,y1978,y1979,y1980,y1981,y1982,y1983,y1984,y1985,y1986,y1987,y1988,y1989,y1990,y1991,y1992,y1993,y1994,y1995,y1996,y1997,y1998,y1999,y2000,y2001,y2002,y2003,y2004,y2005,y2006,y2007,y2008,y2009,y2010,y2011,y2012,y2013,domain_code,domain,population_element_code,population_element,population_item_code,population_item,year_code,year,population_unit,value,flag,flag_description,note,years_existing,average_production,value_per_capita
18148,SDN,276,Sudan,2619,Dates,5142,Food,1000 tonnes,12.86,30.22,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,433,438,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000,40533.33,X,International reliable sources,"UNDESA, Population Division – World Population...",2,435.5,40.53333
17830,ESP,203,Spain,2517,Millet and products,5521,Feed,1000 tonnes,40.46,-3.75,7.0,7.0,16.0,7.0,41.0,93.0,163.0,91.0,21.0,5.0,47.0,37.0,21.0,21.0,12.0,9.0,98.0,98.0,17.0,4.0,5.0,22.0,0.0,1.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,6.0,8.0,4.0,5.0,6.0,4.0,3.0,5.0,7.0,11.0,6.0,6.0,6.0,6.0,20.0,7.0,6.0,6.0,7,11,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000,46354.321,X,International reliable sources,"UNDESA, Population Division – World Population...",53,19.226415,46.354321
2784,BRN,26,Brunei Darussalam,2949,Eggs,5142,Food,1000 tonnes,4.54,114.73,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,5.0,6.0,4.0,5.0,4.0,4.0,4.0,7.0,6.0,6.0,6.0,6.0,5.0,6.0,6.0,6.0,6.0,7,7,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000,428.697,X,International reliable sources,"UNDESA, Population Division – World Population...",53,3.264151,0.428697
10682,KIR,83,Kiribati,2907,Starchy Roots,5521,Feed,1000 tonnes,-3.37,-168.73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000,116.398,X,International reliable sources,"UNDESA, Population Division – World Population...",53,0.0,0.116398
3468,CAN,33,Canada,2735,"Meat, Other",5521,Feed,1000 tonnes,56.13,-106.35,8.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,3.0,8.0,6.0,4.0,4.0,3.0,3.0,3.0,4.0,5.0,5.0,17.0,7.0,6.0,6.0,6.0,6.0,3.0,7.0,8.0,10.0,11.0,11,14,OA,Annual population,511,Total Population - Both sexes,3010,Population - Est. & Proj.,2017,2017,1000,36624.199,X,International reliable sources,"UNDESA, Population Division – World Population...",53,5.075472,36.624199


In [30]:
# Check if there are rows with null values (excluding the year columns)

non_year_columns = merged_df.columns.difference(year_cols)

merged_df[merged_df[non_year_columns].isnull().any(axis=1)].head(5)


Unnamed: 0,area_abbreviation,area_code,area,item_code,item,element_code,element,unit,latitude,longitude,y1961,y1962,y1963,y1964,y1965,y1966,y1967,y1968,y1969,y1970,y1971,y1972,y1973,y1974,y1975,y1976,y1977,y1978,y1979,y1980,y1981,y1982,y1983,y1984,y1985,y1986,y1987,y1988,y1989,y1990,y1991,y1992,y1993,y1994,y1995,y1996,y1997,y1998,y1999,y2000,y2001,y2002,y2003,y2004,y2005,y2006,y2007,y2008,y2009,y2010,y2011,y2012,y2013,domain_code,domain,population_element_code,population_element,population_item_code,population_item,year_code,year,population_unit,value,flag,flag_description,note,years_existing,average_production,value_per_capita


### **Export to CSV**


In [31]:
merged_df.to_csv('merged_df.csv', index=False)