In [1]:
import pandas as pd
import numpy as np

# 1. Suite of Food Security Indicators

In [None]:
df_fsi = pd.read_csv('/content/drive/MyDrive/Projeto MC536/datasets/FAO_Suite_of_Food_Security_Indicators_3-21-2025.csv')
# Drop unecessary columns
df_fsi.drop(["Domain", "Year Code", "Domain Code", "Element", "Element Code", "Item Code", "Note", "Flag", "Flag Description"], axis=1, inplace=True)
# drop all rows where one of the columns is empty
df_fsi.dropna(inplace=True)
# create Start Year column 2000-2002	-> 2000
df_fsi['Start Year'] = df_fsi['Year'].str.split('-').str[0].astype(int)

def filter_value(value):
    # If value = "<x", replace with 0, no matter what x
    if isinstance(value, str) and value.startswith('<'):
        return 0
    return value

# Apply the filter_value function to the 'Value' column
df_fsi['Value'] = df_fsi['Value'].apply(filter_value)

df_fsi.rename(columns={'Unit': 'fs_indicator_unit', 'Item': 'fs_indicator_name', 'Start Year': 'fs_measurement_year', 'Value': 'fs_measurement_value', 'Area Code (M49)': 'country_iso_code', 'Area': 'country_name'}, inplace=True)
df_fsi = df_fsi[['country_iso_code', 'country_name', 'fs_measurement_year', 'fs_measurement_value', 'fs_indicator_name', 'fs_indicator_unit']]
df_fsi.sample(2)

Unnamed: 0,country_iso_code,country_name,fs_measurement_year,fs_measurement_value,fs_indicator_name,fs_indicator_unit
96271,462,Maldives,2010,88,Percentage of population using at least basic ...,%
11433,48,Bahrain,2018,2525,Average dietary energy requirement (kcal/cap/day),kcal/cap/d


In [None]:
# Making sure the columns have the correct data types
df_fsi['country_iso_code'] = df_fsi['country_iso_code'].astype(str)
df_fsi['country_name'] = df_fsi['country_name'].astype(str)
df_fsi['fs_measurement_year'] = df_fsi['fs_measurement_year'].astype(int)
df_fsi['fs_measurement_value'] = df_fsi['fs_measurement_value'].astype(float)
df_fsi['fs_indicator_name'] = df_fsi['fs_indicator_name'].astype(str)
df_fsi['fs_indicator_unit'] = df_fsi['fs_indicator_unit'].astype(str)

# Saving CSV
with open("/content/drive/MyDrive/Projeto MC536/preprocessed_datasets/suite_of_food_security_indicators.csv", "w") as f:
  df_fsi.to_csv(f, index=False)

# 2. Production, Crops and Livestock

In [2]:
df_pcl = pd.read_csv('/content/drive/MyDrive/Projeto MC536/datasets/Production_Crops_Livestock_E_All_Data.csv')
df_pcl.head(2)

  df_pcl = pd.read_csv('/content/drive/MyDrive/Projeto MC536/datasets/Production_Crops_Livestock_E_All_Data.csv')


Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Unit,Y1961,...,Y2020N,Y2021,Y2021F,Y2021N,Y2022,Y2022F,Y2022N,Y2023,Y2023F,Y2023N
0,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,ha,0.0,...,,36862.0,A,,36462.0,A,,37000.0,A,
1,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5412,Yield,kg/ha,,...,,1743.2,A,,1742.0,A,,1810.8,A,


In [3]:
# Identify columns representing years (format: 'Y' followed by exactly 4 digits)
year_cols = df_pcl.columns[df_pcl.columns.str.match(r'^Y\d{4}$')]

# Reshape the DataFrame: turn year columns into rows
df_pcl_melted = df_pcl.melt(
    id_vars=['Area Code (M49)', 'Area', 'Item', 'Element Code', 'Element', 'Unit'],
    value_vars=year_cols,
    var_name='Year',
    value_name='Value'
)

# Convert year column from 'YXXXX' to integer
df_pcl_melted['Year'] = df_pcl_melted['Year'].str[1:].astype(int)

# Remove rows with missing production values
df_pcl_melted = df_pcl_melted.dropna(subset=['Value'])

# Keep only rows with relevant production elements
df_pcl_melted = df_pcl_melted[df_pcl_melted['Element'].isin([
    'Production', 'Area harvested', 'Producing Animals/Slaughtered'
])]

# Drop unnecessary column
df_pcl_melted = df_pcl_melted.drop(columns=['Element Code'])

# Rename columns for clarity and standardization
df_pcl_melted = df_pcl_melted.rename(columns={
    'Area Code (M49)': 'country_iso_code',
    'Area': 'country_name',
    'Item': 'a_product_name',
    'Element': 'a_production_element',
    'Unit': 'a_production_unit',
    'Year': 'a_production_year',
    'Value': 'a_production_value'
})

# Define product type based on production elements
product_type_mapping = df_pcl_melted.groupby('a_product_name')['a_production_element'].apply(
    lambda x: 'Animal' if 'Producing Animals/Slaughtered' in x.unique() else 'Vegetal'
).reset_index()
product_type_mapping.columns = ['a_product_name', 'a_product_type']

# Add product type information to the main DataFrame
df_pcl_melted = df_pcl_melted.merge(product_type_mapping, on='a_product_name', how='left')

# Separate vegetal and animal product data
vegetal = df_pcl_melted[df_pcl_melted['a_product_type'] == 'Vegetal']
animal = df_pcl_melted[df_pcl_melted['a_product_type'] == 'Animal']

# Pivot vegetal data: get separate columns for production and harvested area
vegetal_pivot = vegetal.pivot_table(
    index=['country_iso_code', 'country_name', 'a_product_name', 'a_production_year', 'a_product_type'],
    columns='a_production_element',
    values='a_production_value',
    aggfunc='first'
).reset_index()
vegetal_pivot.rename(columns={
    'Production': 'a_production_tons',
    'Area harvested': 'a_production_harvested_area'
}, inplace=True)
vegetal_pivot['a_production_n_animals'] = np.nan  # Not applicable to vegetal products

# Adjust animal values if unit is in thousands
animal.loc[
    (animal['a_production_element'] == 'Producing Animals/Slaughtered') &
    (animal['a_production_unit'].isin(['1000 An', '1000 No'])),
    'a_production_value'
] *= 1000

# Pivot animal data: get separate columns for production and number of animals
animal_pivot = animal.pivot_table(
    index=['country_iso_code', 'country_name', 'a_product_name', 'a_production_year', 'a_product_type'],
    columns='a_production_element',
    values='a_production_value',
    aggfunc='first'
).reset_index()
animal_pivot.rename(columns={
    'Production': 'a_production_tons',
    'Producing Animals/Slaughtered': 'a_production_n_animals'
}, inplace=True)
animal_pivot['a_production_harvested_area'] = np.nan  # Not applicable to animal products

# Combine vegetal and animal data into one DataFrame
df_pcl_melted_filtered = pd.concat([vegetal_pivot, animal_pivot], ignore_index=True)

# Reorder columns to desired final structure
columns_order = [
    'country_iso_code', 'country_name', 'a_production_year', 'a_production_tons',
    'a_production_n_animals', 'a_production_harvested_area', 'a_product_name', 'a_product_type'
]
df_pcl_melted_filtered = df_pcl_melted_filtered[columns_order]

# Remove any duplicated rows based on key identifying fields
df_pcl_melted_filtered.drop_duplicates(
    subset=['country_iso_code', 'country_name', 'a_product_name', 'a_production_year'],
    inplace=True
)

# Remove the first character from the ISO country code
df_pcl_melted_filtered['country_iso_code'] = df_pcl_melted_filtered['country_iso_code'].str[1:]

# Ensure correct data types for all columns
df_pcl_melted_filtered['country_iso_code'] = df_pcl_melted_filtered['country_iso_code'].astype(str)
df_pcl_melted_filtered['country_name'] = df_pcl_melted_filtered['country_name'].astype(str)
df_pcl_melted_filtered['a_production_year'] = df_pcl_melted_filtered['a_production_year'].astype(int)
df_pcl_melted_filtered['a_production_tons'] = df_pcl_melted_filtered['a_production_tons'].astype(float)
df_pcl_melted_filtered['a_production_n_animals'] = df_pcl_melted_filtered['a_production_n_animals'].astype(int, errors='ignore')
df_pcl_melted_filtered['a_production_harvested_area'] = df_pcl_melted_filtered['a_production_harvested_area'].astype(float, errors='ignore')
df_pcl_melted_filtered['a_product_name'] = df_pcl_melted_filtered['a_product_name'].astype(str)
df_pcl_melted_filtered['a_product_type'] = df_pcl_melted_filtered['a_product_type'].astype(str)

# Display a specific example for verification
df_pcl_melted_filtered[
    (df_pcl_melted_filtered['country_name'] == 'Brazil') &
    (df_pcl_melted_filtered['a_product_name'] == 'Meat of chickens, fresh or chilled')
]

a_production_element,country_iso_code,country_name,a_production_year,a_production_tons,a_production_n_animals,a_production_harvested_area,a_product_name,a_product_type
1302327,076,Brazil,1961,122770.00,1.227700e+08,,"Meat of chickens, fresh or chilled",Animal
1302328,076,Brazil,1962,130070.00,1.300740e+08,,"Meat of chickens, fresh or chilled",Animal
1302329,076,Brazil,1963,157970.00,1.579720e+08,,"Meat of chickens, fresh or chilled",Animal
1302330,076,Brazil,1964,166580.00,1.665770e+08,,"Meat of chickens, fresh or chilled",Animal
1302331,076,Brazil,1965,214370.00,2.143700e+08,,"Meat of chickens, fresh or chilled",Animal
...,...,...,...,...,...,...,...,...
1302385,076,Brazil,2019,13516524.73,5.805315e+09,,"Meat of chickens, fresh or chilled",Animal
1302386,076,Brazil,2020,13787480.27,5.953808e+09,,"Meat of chickens, fresh or chilled",Animal
1302387,076,Brazil,2021,14329000.00,6.111074e+09,,"Meat of chickens, fresh or chilled",Animal
1302388,076,Brazil,2022,14524000.00,6.109829e+09,,"Meat of chickens, fresh or chilled",Animal


In [None]:
with open("/content/drive/MyDrive/Projeto MC536/preprocessed_datasets/production_crops_livestock.csv", "w") as f:
  df_pcl_melted_filtered.to_csv(f, index=False)

# 3. FAO Data From Agricultural Censuses

In [4]:
df_ac = pd.read_csv('/content/drive/MyDrive/Projeto MC536/datasets/FAO_Structural_data_from_agricultural_censuses_3-21-2025.csv')
print(f"Original shape: {df_ac.shape}")
# Drop Domain Code, Domain, Element Code, Flag,Flag Description, Note
df_ac.drop(["Domain Code", "Domain", "Element Code", "Flag", "Flag Description", "Note", "Item Code"], axis=1, inplace=True)
# drop all rows where one of the columns is empty
df_ac.dropna(inplace=True)
print(f"New shape: {df_ac.shape}")
# Rename columns: Area Code (M49) -> country_iso_code, Area -> country_name, WCA Round -> agricultural_census_decade, Census Year -> agricultural_census_year
df_ac.rename(columns={'Area Code (M49)': 'country_iso_code', 'Area': 'country_name', 'WCA Round': 'agricultural_census_decade', 'Census Year': 'agricultural_census_year'}, inplace=True)
df_ac.head(5)

Original shape: (5069, 15)
New shape: (5048, 8)


Unnamed: 0,country_iso_code,country_name,Element,Item,agricultural_census_decade,agricultural_census_year,Unit,Value
0,8,Albania,Number,Holdings with land size 0-<1,1990,1995,No,274000.0
1,8,Albania,Area,Holdings with land size 0-<1,2000,1998,ha,128508.0
2,8,Albania,Number,Holdings with land size 0-<1,2000,1998,No,279793.0
3,8,Albania,Area,Holdings with land size 0-<1,2010,2012,ha,99758.0
4,8,Albania,Number,Holdings with land size 0-<1,2010,2012,No,215034.0


In [5]:
# Print the initial shape of the DataFrame
print(f"Initial shape: {df_ac.shape}")

# Pivot the DataFrame to convert 'Element' values into separate columns
df_pivot = df_ac.pivot(
    index=['country_iso_code', 'country_name', 'agricultural_census_decade', 'agricultural_census_year', 'Item'],
    columns='Element',
    values='Value'
).reset_index()

# Rename the pivoted columns for clarity
df_pivot.rename(columns={
    'Area': 'total_area_ha',
    'Number': 'number_of_properties'
}, inplace=True)

# Extract farm size lower and upper limits from the 'Item' column using regex
df_pivot[['farm_size_lower_limit', 'farm_size_upper_limit']] = (
    df_pivot['Item']
    .str.extract(r'size (\d+)-<(\d+)')
    .astype(float)
)

# Select final columns and remove rows with missing values
df_ac_filtered = df_pivot[[
    'country_iso_code',
    'country_name',
    'agricultural_census_decade',
    'agricultural_census_year',
    'total_area_ha',
    'number_of_properties',
    'farm_size_lower_limit',
    'farm_size_upper_limit'
]].dropna()

# Ensure all columns have the correct data types
df_ac_filtered['country_iso_code'] = df_ac_filtered['country_iso_code'].astype(str)
df_ac_filtered['country_name'] = df_ac_filtered['country_name'].astype(str)
df_ac_filtered['agricultural_census_decade'] = df_ac_filtered['agricultural_census_decade'].astype(str)

# Convert agricultural census year to string, then truncate and convert to integer
df_ac_filtered['agricultural_census_year'] = df_ac_filtered['agricultural_census_year'].astype(str)
df_ac_filtered['agricultural_census_year'] = df_ac_filtered['agricultural_census_year'].str[:4]
df_ac_filtered['agricultural_census_year'] = df_ac_filtered['agricultural_census_year'].astype(int)

df_ac_filtered['total_area_ha'] = df_ac_filtered['total_area_ha'].astype(float)
df_ac_filtered['number_of_properties'] = df_ac_filtered['number_of_properties'].astype(int)
df_ac_filtered['farm_size_lower_limit'] = df_ac_filtered['farm_size_lower_limit'].astype(int)
df_ac_filtered['farm_size_upper_limit'] = df_ac_filtered['farm_size_upper_limit'].astype(int)

# Print the final shape of the cleaned DataFrame
print(f"Final shape: {df_ac_filtered.shape}")

# Sort the DataFrame by country, decade, year, and farm size lower limit
df_ac_filtered.sort_values(
    by=['country_name', 'agricultural_census_decade', 'agricultural_census_year', 'farm_size_lower_limit'],
    inplace=True
)

# Display the first 5 rows
df_ac_filtered.head(5)


Initial shape: (5048, 8)
Final shape: (2278, 8)


Element,country_iso_code,country_name,agricultural_census_decade,agricultural_census_year,total_area_ha,number_of_properties,farm_size_lower_limit,farm_size_upper_limit
2,8,Albania,2000,1998,128508.0,279793,0,1
3,8,Albania,2000,1998,198921.0,140377,1,2
4,8,Albania,2000,1998,1562069.0,46639,2,5
5,8,Albania,2010,2012,99758.0,215034,0,1
6,8,Albania,2010,2012,109987.0,76550,1,2


In [None]:
with open("/content/drive/MyDrive/Projeto MC536/preprocessed_datasets/agricultural_census.csv", "w") as f:
  df_ac_filtered.to_csv(f, index=False)