In [1]:
import pandas as pd
from collections import Counter

In [2]:
# read the processed WFP food-price table from disk into a DataFrame
dataset = pd.read_csv(
    filepath_or_buffer='../../data/processed/wfp_food_prices_moz.csv',
)

In [3]:
# preview the first five rows of the freshly loaded table
dataset.head(n=5)

Unnamed: 0,province,district,market,latitude,longitude,category,commodity,unit,pricetype,price(MZN),year,month
0,Maputo City,Cidade_De_Maputo,Maputo,-25.965278,32.589167,cereals and tubers,Maize (white),KG,Retail,1.34,1992,11
1,Gaza,Chokwe,Chokwe,-24.533333,32.983333,cereals and tubers,Maize (white),KG,Retail,1.53,1992,12
2,Inhambane,Maxixe,Maxixe,-23.859722,35.347222,cereals and tubers,Maize (white),KG,Retail,1.69,1992,12
3,Maputo City,Cidade_De_Maputo,Maputo,-25.965278,32.589167,cereals and tubers,Maize (white),KG,Retail,1.55,1992,12
4,Gaza,Chokwe,Chokwe,-24.533333,32.983333,cereals and tubers,Maize (white),KG,Retail,1.67,1993,1


In [4]:
# which months of 2023 are present in the data?
data = dataset[dataset['year'].eq(2023)]

data['month'].unique()

array([1, 2], dtype=int64)

In [5]:
# keep only rows from 2000 through 2022 whose category is not 'non-food'
dataset = dataset.loc[
    dataset['year'].ge(2000)
    & dataset['year'].lt(2023)
    & dataset['category'].ne('non-food')
]

In [6]:
# discard the columns that are not used after this point
dataset = dataset.drop(
    columns=['district', 'market', 'latitude', 'longitude', 'pricetype'],
)

In [7]:
# average the price over all rows that share the same
# province / category / commodity / unit / year / month combination
dataset = (
    dataset
    .groupby(['province', 'category', 'commodity', 'unit', 'year', 'month'])['price(MZN)']
    .agg('mean')
    .reset_index()
)

In [8]:
# preview the first five rows of the grouped table
dataset.head(n=5)

Unnamed: 0,province,category,commodity,unit,year,month,price(MZN)
0,Cabo_Delgado,cereals and tubers,Cassava (dry),KG,2013,7,10.0
1,Cabo_Delgado,cereals and tubers,Cassava (dry),KG,2013,8,10.0
2,Cabo_Delgado,cereals and tubers,Cassava (dry),KG,2013,9,10.0
3,Cabo_Delgado,cereals and tubers,Cassava (dry),KG,2013,12,16.67
4,Cabo_Delgado,cereals and tubers,Cassava (fresh),KG,2013,7,20.0


In [9]:
# write the grouped table to disk, leaving the row index out of the file
dataset.to_csv(
    path_or_buf='../../data/processed/food_prices_grouped.csv',
    index=False,
)

In [10]:
# replace the separate 'commodity' and 'unit' columns with a single
# 'commodity' column holding '<commodity>_<unit>'; the merged column is
# appended as the last column of the frame
dataset = (
    dataset
    .drop(columns=['commodity', 'unit'])
    .assign(commodity=dataset['commodity'] + '_' + dataset['unit'])
)

In [11]:
# print how many distinct values each column holds (missing values count as one)
columns = dataset.columns

for column_name in columns:
    distinct_values = dataset[column_name].nunique(dropna=False)
    print(f'{column_name}: {distinct_values}')

province: 11
category: 6
year: 23
month: 12
price(MZN): 10067
commodity: 54


In [12]:
# split commodities by how many rows they have in the dataset
#
# NOTE(review): `max` and `min` shadow the Python built-ins of the same name for
# the rest of the kernel session. The names are kept because the cell that
# filters the dataset reads `max`; rename both together in a follow-up.
commodities = dataset['commodity'].values

k = 100   # a commodity needs at least this many rows to be kept
max = []  # commodities with at least k rows
min = []  # commodities with fewer than k rows

# Counter preserves first-seen order, so the listing follows the dataset order
for commodity, occurrences in Counter(commodities).items():
    if occurrences >= k:
        max.append(commodity)
        print(f'{commodity}: {occurrences}')
    else:
        min.append(commodity)

print()
# the threshold test is `>= k`, so the first group is "at least k", not "more than k"
print(f'total commodities with at least {k} occurrences: {len(max)}')
print(f'total commodities with less than {k} occurrences: {len(min)}')

Cassava (dry)_KG: 208
Maize (white)_KG: 2506
Maize meal_25 KG: 193
Maize meal (white, first grade)_KG: 1282
Maize meal (white, with bran)_KG: 731
Maize meal (white, without bran)_KG: 817
Potatoes_KG: 205
Rice_25 KG: 203
Rice_KG: 1523
Rice (imported)_KG: 2098
Rice (local)_KG: 521
Sweet potatoes_KG: 237
Wheat flour (local)_KG: 1287
Eggs_30 pcs: 203
Fish_500 G: 190
Salt (iodised)_KG: 203
Sugar_KG: 203
Sugar (brown, imported)_KG: 157
Sugar (brown, local)_KG: 2053
Oil (vegetable)_5 L: 203
Oil (vegetable, imported)_L: 332
Oil (vegetable, local)_L: 2062
Beans (butter)_KG: 830
Beans (catarino)_KG: 788
Beans (dry)_KG: 1259
Beans (magnum)_KG: 508
Cowpeas_KG: 1327
Groundnuts_KG: 198
Groundnuts (Mix)_KG: 597
Groundnuts (large, shelled)_KG: 1133
Groundnuts (small, shelled)_KG: 1148
Cabbage_KG: 239
Carrots_KG: 235
Cassava leaves_KG: 193
Coconut_Unit: 201
Garlic_KG: 203
Kale_KG: 198
Onions_KG: 239
Tomatoes_KG: 243
Cassava flour_KG: 172

total commodities with more than 100 occurrences: 40
total commodities with less than 100 occurrences: 14

In [13]:
# keep only the rows whose commodity is in the `max` list built above
is_frequent_commodity = dataset['commodity'].isin(max)
dataset = dataset[is_frequent_commodity]

In [14]:
# one-hot encode the categorical columns; each indicator column is named
# '<original column>_<value>' and the indicators are appended after the
# remaining columns in the order province, category, commodity
features = ['province', 'category', 'commodity']

dataset_encoded = pd.get_dummies(dataset, columns=features, prefix=features)

In [15]:
# preview the first five rows of the one-hot encoded table
dataset_encoded.head(n=5)

Unnamed: 0,year,month,price(MZN),province_Cabo_Delgado,province_Gaza,province_Inhambane,province_Manica,province_Maputo,province_Maputo City,province_Nampula,...,commodity_Rice (local)_KG,commodity_Rice_25 KG,commodity_Rice_KG,commodity_Salt (iodised)_KG,"commodity_Sugar (brown, imported)_KG","commodity_Sugar (brown, local)_KG",commodity_Sugar_KG,commodity_Sweet potatoes_KG,commodity_Tomatoes_KG,commodity_Wheat flour (local)_KG
0,2013,7,10.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2013,8,10.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2013,9,10.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2013,12,16.67,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2000,2,1.14,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# write the encoded table to disk, leaving the row index out of the file
dataset_encoded.to_csv(
    path_or_buf='../../data/processed/food_prices_encoded.csv',
    index=False,
)