In [None]:
# required packages
import pandas as pd

In [3]:
# Read the normalized macro-statistics key-indicators CSV (decoded as latin1).
macro_csv_path = 'Datasets_MS_Project/Macro_economic_indicators/Macro-Statistics_Key_Indicators_E_All_Data_(Normalized)/Macro-Statistics_Key_Indicators_E_All_Data_(Normalized).csv'
raw_data = pd.read_csv(macro_csv_path, encoding='latin1')

# preview the first rows
raw_data.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
0,2,'004,Afghanistan,22008,Gross Domestic Product,6224,Value Standard Local Currency,1970,1970,million SLC,78.697146,X,
1,2,'004,Afghanistan,22008,Gross Domestic Product,6224,Value Standard Local Currency,1971,1971,million SLC,82.397024,X,
2,2,'004,Afghanistan,22008,Gross Domestic Product,6224,Value Standard Local Currency,1972,1972,million SLC,71.797487,X,
3,2,'004,Afghanistan,22008,Gross Domestic Product,6224,Value Standard Local Currency,1973,1973,million SLC,77.997271,X,
4,2,'004,Afghanistan,22008,Gross Domestic Product,6224,Value Standard Local Currency,1974,1974,million SLC,96.996607,X,


In [4]:
# distinct values of the 'Item' column (the indicators available in the raw data)
raw_data['Item'].unique()

array(['Gross Domestic Product', 'Gross Fixed Capital Formation',
       'Value Added (Agriculture, Forestry and Fishing)',
       'Value Added (Total Manufacturing)', 'Gross National Income',
       'Value Added (Manufacture of food, beverages and tobacco products)',
       'Value Added (Manufacture of food and beverages)',
       'Value Added (Manufacture of tobacco products)',
       'Gross Output (Agriculture, Forestry and Fishing)',
       'Value Added (Agriculture)', 'Gross Output (Agriculture)'],
      dtype=object)

In [5]:
# distinct values of the 'Element' column, used below to pick which elements to keep
raw_data['Element'].unique()

array(['Value Standard Local Currency', 'Value US$',
       'Value US$ per capita',
       'Value Standard Local Currency, 2015 prices',
       'Value US$ per capita, 2015 prices', 'Value US$, 2015 prices',
       'Annual growth Standard Local Currency',
       'Annual growth Standard Local Currency, 2015 prices',
       'Annual growth US$', 'Annual growth US$ per capita',
       'Annual growth US$, 2015 prices',
       'Annual growth US$ per capita, 2015 prices', 'Share of GDP US$',
       'Share of GDP US$, 2015 prices',
       'Share of GDP Standard Local Currency',
       'Share of GDP Standard Local Currency, 2015 prices',
       'Ratio of Value Added (Agriculture, Forestry and Fishing) Standard Local Currency',
       'Ratio of Value Added (Agriculture, Forestry and Fishing) US$',
       'Share of Value Added (Total Manufacturing) Standard Local Currency',
       'Share of Value Added (Total Manufacturing) US$'], dtype=object)

For our project, we choose the following relevant elements:
- 'Value US$, 2015 prices'
- 'Annual growth US$, 2015 prices'
- 'Share of GDP US$, 2015 prices'

## Element: Value US$, 2015 prices

In [13]:
# keep only the rows whose Element is 'Value US$, 2015 prices'
filtered_data_1 = raw_data[raw_data['Element'].eq('Value US$, 2015 prices')]

# indicators (items) available for this element
filtered_data_1['Item'].unique()

array(['Gross Domestic Product', 'Gross Fixed Capital Formation',
       'Value Added (Agriculture, Forestry and Fishing)',
       'Value Added (Total Manufacturing)',
       'Value Added (Manufacture of food, beverages and tobacco products)',
       'Value Added (Manufacture of food and beverages)',
       'Value Added (Manufacture of tobacco products)',
       'Value Added (Agriculture)'], dtype=object)

Through macro-economic indicators, we are aiming to capture signals that reflect 
agricultural productivity, investment, and demand from downstream industries. 
The following are the most relevant indicators:
- 'Gross Domestic Product'
- 'Gross Fixed Capital Formation'
- 'Value Added (Agriculture, Forestry and Fishing)'
- 'Value Added (Manufacture of food, beverages and tobacco products)'
- 'Value Added (Manufacture of food and beverages)'
- 'Value Added (Agriculture)'

In [14]:
# indicators (items) we want to retain for this element
items_to_keep = [
    'Gross Domestic Product',
    'Gross Fixed Capital Formation',
    'Value Added (Agriculture, Forestry and Fishing)',
    'Value Added (Manufacture of food, beverages and tobacco products)',
    'Value Added (Manufacture of food and beverages)',
    'Value Added (Agriculture)',
]

# restrict the frame to rows whose Item is one of the retained indicators
filtered_data_1 = filtered_data_1.loc[
    lambda frame: frame['Item'].isin(items_to_keep)
]

# number of rows available per retained indicator
filtered_data_1['Item'].value_counts()

Item
Gross Domestic Product                                               12724
Gross Fixed Capital Formation                                        12670
Value Added (Agriculture, Forestry and Fishing)                      12597
Value Added (Agriculture)                                             1594
Value Added (Manufacture of food, beverages and tobacco products)      693
Value Added (Manufacture of food and beverages)                        664
Name: count, dtype: int64

So, we have a lot of missing values for the following indicators:
- Value Added (Agriculture)
- Value Added (Manufacture of food, beverages and tobacco products)
- Value Added (Manufacture of food and beverages)

Hence, we will drop these indicators from our list.

In [15]:
# sparsely populated indicators that are removed from the selection
items_to_drop = [
    'Value Added (Agriculture)',
    'Value Added (Manufacture of food, beverages and tobacco products)',
    'Value Added (Manufacture of food and beverages)',
]

# keep every row whose Item is NOT in the drop list
filtered_data_1 = filtered_data_1.loc[
    lambda frame: ~frame['Item'].isin(items_to_drop)
]

# remaining indicators and their row counts
filtered_data_1['Item'].value_counts()

Item
Gross Domestic Product                             12724
Gross Fixed Capital Formation                      12670
Value Added (Agriculture, Forestry and Fishing)    12597
Name: count, dtype: int64

In [19]:
# check which units the remaining rows are reported in before pivoting their values
filtered_data_1['Unit'].unique()

array(['million USD'], dtype=object)

In [16]:
# Reshape long -> wide: one row per (Area Code, Area, Year Code, Year) and one
# column per Item, filled from 'Value'.
# NOTE(review): pivot_table combines rows that share the same key using its
# default aggregation - confirm the filtered data has no duplicate keys.
pivoted_data_1 = (
    filtered_data_1
    .pivot_table(
        values='Value',
        index=['Area Code', 'Area', 'Year Code', 'Year'],
        columns='Item',
    )
    .reset_index()                 # move the key levels back into ordinary columns
    .rename_axis(columns=None)     # clear the leftover 'Item' label on the column axis
)

pivoted_data_1.head()

Unnamed: 0,Area Code,Area,Year Code,Year,Gross Domestic Product,Gross Fixed Capital Formation,"Value Added (Agriculture, Forestry and Fishing)"
0,1,Armenia,1990,1990,5747.758431,2693.900711,918.064586
1,1,Armenia,1991,1991,5076.491543,1523.89975,1196.038694
2,1,Armenia,1992,1992,2956.732211,529.009513,1400.171646
3,1,Armenia,1993,1993,2696.219466,351.09065,1316.887415
4,1,Armenia,1994,1994,2839.851925,608.364684,628.394537


In [17]:
# summarise column dtypes and non-null counts of the pivoted frame
pivoted_data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12724 entries, 0 to 12723
Data columns (total 7 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   Area Code                                        12724 non-null  int64  
 1   Area                                             12724 non-null  object 
 2   Year Code                                        12724 non-null  int64  
 3   Year                                             12724 non-null  int64  
 4   Gross Domestic Product                           12724 non-null  float64
 5   Gross Fixed Capital Formation                    12670 non-null  float64
 6   Value Added (Agriculture, Forestry and Fishing)  12597 non-null  float64
dtypes: float64(3), int64(3), object(1)
memory usage: 696.0+ KB


In [18]:
# give every column a lowercase, underscore-separated name
cleaned_data_1 = pivoted_data_1.rename(
    {
        'Area Code': 'area_code',
        'Area': 'area',
        'Year Code': 'year_code',
        'Year': 'year',
        'Gross Domestic Product': 'gross_domestic_product',
        'Gross Fixed Capital Formation': 'gross_fixed_capital_formation',
        'Value Added (Agriculture, Forestry and Fishing)': 'value_added_ag_forest_fish',
    },
    axis='columns',
)

cleaned_data_1.head()

Unnamed: 0,area_code,area,year_code,year,gross_domestic_product,gross_fixed_capital_formation,value_added_ag_forest_fish
0,1,Armenia,1990,1990,5747.758431,2693.900711,918.064586
1,1,Armenia,1991,1991,5076.491543,1523.89975,1196.038694
2,1,Armenia,1992,1992,2956.732211,529.009513,1400.171646
3,1,Armenia,1993,1993,2696.219466,351.09065,1316.887415
4,1,Armenia,1994,1994,2839.851925,608.364684,628.394537


## Element: Annual growth US$, 2015 prices

In [20]:
# keep only the rows whose Element is 'Annual growth US$, 2015 prices'
filtered_data_2 = raw_data[raw_data['Element'].eq('Annual growth US$, 2015 prices')]

# indicators (items) available for this element
filtered_data_2['Item'].unique()

array(['Gross Domestic Product', 'Gross Fixed Capital Formation',
       'Value Added (Agriculture, Forestry and Fishing)',
       'Value Added (Total Manufacturing)',
       'Value Added (Manufacture of food, beverages and tobacco products)',
       'Value Added (Manufacture of food and beverages)',
       'Value Added (Manufacture of tobacco products)',
       'Value Added (Agriculture)'], dtype=object)

In [21]:
# indicators (items) we want to retain for this element
items_to_keep = [
    'Gross Domestic Product',
    'Gross Fixed Capital Formation',
    'Value Added (Agriculture, Forestry and Fishing)',
    'Value Added (Manufacture of food, beverages and tobacco products)',
    'Value Added (Manufacture of food and beverages)',
    'Value Added (Agriculture)',
]

# restrict the frame to rows whose Item is one of the retained indicators
filtered_data_2 = filtered_data_2.loc[
    lambda frame: frame['Item'].isin(items_to_keep)
]

# number of rows available per retained indicator
filtered_data_2['Item'].value_counts()

Item
Gross Domestic Product                                               12471
Gross Fixed Capital Formation                                        12418
Value Added (Agriculture, Forestry and Fishing)                      12347
Value Added (Agriculture)                                             1510
Value Added (Manufacture of food, beverages and tobacco products)      613
Value Added (Manufacture of food and beverages)                        610
Name: count, dtype: int64

So, we have a lot of missing values for the following indicators:
- Value Added (Agriculture)
- Value Added (Manufacture of food, beverages and tobacco products)
- Value Added (Manufacture of food and beverages)

Hence, we will drop these indicators from our list.

In [22]:
# sparsely populated indicators that are removed from the selection
items_to_drop = [
    'Value Added (Agriculture)',
    'Value Added (Manufacture of food, beverages and tobacco products)',
    'Value Added (Manufacture of food and beverages)',
]

# keep every row whose Item is NOT in the drop list
filtered_data_2 = filtered_data_2.loc[
    lambda frame: ~frame['Item'].isin(items_to_drop)
]

# remaining indicators and their row counts
filtered_data_2['Item'].value_counts()

Item
Gross Domestic Product                             12471
Gross Fixed Capital Formation                      12418
Value Added (Agriculture, Forestry and Fishing)    12347
Name: count, dtype: int64

In [23]:
# Reshape long -> wide: one row per (Area Code, Area, Year Code, Year) and one
# column per Item, filled from 'Value'.
# NOTE(review): pivot_table combines rows that share the same key using its
# default aggregation - confirm the filtered data has no duplicate keys.
pivoted_data_2 = (
    filtered_data_2
    .pivot_table(
        values='Value',
        index=['Area Code', 'Area', 'Year Code', 'Year'],
        columns='Item',
    )
    .reset_index()                 # move the key levels back into ordinary columns
    .rename_axis(columns=None)     # clear the leftover 'Item' label on the column axis
)

pivoted_data_2.head()

Unnamed: 0,Area Code,Area,Year Code,Year,Gross Domestic Product,Gross Fixed Capital Formation,"Value Added (Agriculture, Forestry and Fishing)"
0,1,Armenia,1991,1991,-11.67876,-43.431481,30.278273
1,1,Armenia,1992,1992,-41.756385,-65.285807,17.06742
2,1,Armenia,1993,1993,-8.810834,-33.632451,-5.948145
3,1,Armenia,1994,1994,5.327179,73.27852,-52.281833
4,1,Armenia,1995,1995,6.891187,-14.672296,3.944426


In [29]:
# lowercase the key columns and label the indicator columns as annual growth
cleaned_data_2 = pivoted_data_2.rename(
    {
        'Area Code': 'area_code',
        'Area': 'area',
        'Year Code': 'year_code',
        'Year': 'year',
        'Gross Domestic Product': 'GDP_annual_growth',
        'Gross Fixed Capital Formation': 'GFCF_annual_growth',
        'Value Added (Agriculture, Forestry and Fishing)': 'value_added_ag_forest_fish_annual_growth',
    },
    axis='columns',
)

cleaned_data_2.head()

Unnamed: 0,area_code,area,year_code,year,GDP_annual_growth,GFCF_annual_growth,value_added_ag_forest_fish_annual_growth
0,1,Armenia,1991,1991,-11.67876,-43.431481,30.278273
1,1,Armenia,1992,1992,-41.756385,-65.285807,17.06742
2,1,Armenia,1993,1993,-8.810834,-33.632451,-5.948145
3,1,Armenia,1994,1994,5.327179,73.27852,-52.281833
4,1,Armenia,1995,1995,6.891187,-14.672296,3.944426


In [30]:
# Left-join the annual-growth columns onto the value columns by area and year;
# every row of cleaned_data_1 is kept, unmatched growth cells become NaN.
merged_data_1 = cleaned_data_1.merge(
    cleaned_data_2,
    how='left',
    on=['area_code', 'area', 'year_code', 'year'],
)

merged_data_1.head()

Unnamed: 0,area_code,area,year_code,year,gross_domestic_product,gross_fixed_capital_formation,value_added_ag_forest_fish,GDP_annual_growth,GFCF_annual_growth,value_added_ag_forest_fish_annual_growth
0,1,Armenia,1990,1990,5747.758431,2693.900711,918.064586,,,
1,1,Armenia,1991,1991,5076.491543,1523.89975,1196.038694,-11.67876,-43.431481,30.278273
2,1,Armenia,1992,1992,2956.732211,529.009513,1400.171646,-41.756385,-65.285807,17.06742
3,1,Armenia,1993,1993,2696.219466,351.09065,1316.887415,-8.810834,-33.632451,-5.948145
4,1,Armenia,1994,1994,2839.851925,608.364684,628.394537,5.327179,73.27852,-52.281833


## Element: Share of GDP US$, 2015 prices

In [8]:
# keep only the rows whose Element is 'Share of GDP US$, 2015 prices'
filtered_data_3 = raw_data[raw_data['Element'].eq('Share of GDP US$, 2015 prices')]

# indicators (items) available for this element
filtered_data_3['Item'].unique()

array(['Gross Fixed Capital Formation',
       'Value Added (Agriculture, Forestry and Fishing)',
       'Value Added (Total Manufacturing)',
       'Value Added (Manufacture of food, beverages and tobacco products)'],
      dtype=object)

In [25]:
# indicators (items) we want to retain for this element
items_to_keep = [
    'Gross Fixed Capital Formation',
    'Value Added (Agriculture, Forestry and Fishing)',
    'Value Added (Manufacture of food, beverages and tobacco products)',
]

# restrict the frame to rows whose Item is one of the retained indicators
filtered_data_3 = filtered_data_3.loc[
    lambda frame: frame['Item'].isin(items_to_keep)
]

# number of rows available per retained indicator
filtered_data_3['Item'].value_counts()

Item
Gross Fixed Capital Formation                                        12670
Value Added (Agriculture, Forestry and Fishing)                      12597
Value Added (Manufacture of food, beverages and tobacco products)      693
Name: count, dtype: int64

We have very few rows for the indicator 'Value Added (Manufacture of food, beverages and tobacco products)'.
Thus, we will drop this indicator.

In [26]:
filtered_data_3 = filtered_data_3.loc[
    filtered_data_3['Item']!='Value Added (Manufacture of food, beverages and tobacco products)'
]

filtered_data_3['Item'].value_counts()

Item
Gross Fixed Capital Formation                      12670
Value Added (Agriculture, Forestry and Fishing)    12597
Name: count, dtype: int64

In [27]:
# Reshape long -> wide: one row per (Area Code, Area, Year Code, Year) and one
# column per Item, filled from 'Value'.
# NOTE(review): pivot_table combines rows that share the same key using its
# default aggregation - confirm the filtered data has no duplicate keys.
pivoted_data_3 = (
    filtered_data_3
    .pivot_table(
        values='Value',
        index=['Area Code', 'Area', 'Year Code', 'Year'],
        columns='Item',
    )
    .reset_index()                 # move the key levels back into ordinary columns
    .rename_axis(columns=None)     # clear the leftover 'Item' label on the column axis
)

pivoted_data_3.head()

Unnamed: 0,Area Code,Area,Year Code,Year,Gross Fixed Capital Formation,"Value Added (Agriculture, Forestry and Fishing)"
0,1,Armenia,1990,1990,46.868718,15.972567
1,1,Armenia,1991,1991,30.018759,23.560341
2,1,Armenia,1992,1992,17.891695,47.355376
3,1,Armenia,1993,1993,13.02159,48.841996
4,1,Armenia,1994,1994,21.422409,22.127722


In [28]:
# lowercase the key columns and label the indicator columns as shares of GDP
cleaned_data_3 = pivoted_data_3.rename(
    {
        'Area Code': 'area_code',
        'Area': 'area',
        'Year Code': 'year_code',
        'Year': 'year',
        'Gross Fixed Capital Formation': 'GFCF_share_in_total_GDP',
        'Value Added (Agriculture, Forestry and Fishing)': 'ag_forest_fish_share_in_total_GDP',
    },
    axis='columns',
)

cleaned_data_3.head()

Unnamed: 0,area_code,area,year_code,year,GFCF_share_in_total_GDP,ag_forest_fish_share_in_total_GDP
0,1,Armenia,1990,1990,46.868718,15.972567
1,1,Armenia,1991,1991,30.018759,23.560341
2,1,Armenia,1992,1992,17.891695,47.355376
3,1,Armenia,1993,1993,13.02159,48.841996
4,1,Armenia,1994,1994,21.422409,22.127722


In [31]:
# Left-join the share-of-GDP columns onto the already merged frame by area and
# year; every row of merged_data_1 is kept, unmatched share cells become NaN.
merged_data_2 = merged_data_1.merge(
    cleaned_data_3,
    how='left',
    on=['area_code', 'area', 'year_code', 'year'],
)

merged_data_2.head()

Unnamed: 0,area_code,area,year_code,year,gross_domestic_product,gross_fixed_capital_formation,value_added_ag_forest_fish,GDP_annual_growth,GFCF_annual_growth,value_added_ag_forest_fish_annual_growth,GFCF_share_in_total_GDP,ag_forest_fish_share_in_total_GDP
0,1,Armenia,1990,1990,5747.758431,2693.900711,918.064586,,,,46.868718,15.972567
1,1,Armenia,1991,1991,5076.491543,1523.89975,1196.038694,-11.67876,-43.431481,30.278273,30.018759,23.560341
2,1,Armenia,1992,1992,2956.732211,529.009513,1400.171646,-41.756385,-65.285807,17.06742,17.891695,47.355376
3,1,Armenia,1993,1993,2696.219466,351.09065,1316.887415,-8.810834,-33.632451,-5.948145,13.02159,48.841996
4,1,Armenia,1994,1994,2839.851925,608.364684,628.394537,5.327179,73.27852,-52.281833,21.422409,22.127722


In [33]:
# summarise column dtypes and non-null counts of the final merged frame before export
merged_data_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12724 entries, 0 to 12723
Data columns (total 12 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   area_code                                 12724 non-null  int64  
 1   area                                      12724 non-null  object 
 2   year_code                                 12724 non-null  int64  
 3   year                                      12724 non-null  int64  
 4   gross_domestic_product                    12724 non-null  float64
 5   gross_fixed_capital_formation             12670 non-null  float64
 6   value_added_ag_forest_fish                12597 non-null  float64
 7   GDP_annual_growth                         12471 non-null  float64
 8   GFCF_annual_growth                        12418 non-null  float64
 9   value_added_ag_forest_fish_annual_growth  12347 non-null  float64
 10  GFCF_share_in_total_GDP           

In [34]:
# Export the cleaned, merged indicators as a CSV file.
# `index` must be the boolean False: the string 'False' is truthy, so pandas
# would still write the RangeIndex as an extra unnamed first column.
merged_data_2.to_csv('cleaned_datasets/economic_indicators_cleaned.csv', index=False)