In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the normalized Foreign Direct Investment CSV and preview its first rows
fdi_csv_path = 'Datasets_MS_Project/Foreign_Direct_Investment/Investment_ForeignDirectInvestment_E_All_Data_(Normalized)/Investment_ForeignDirectInvestment_E_All_Data_(Normalized).csv'
raw_data = pd.read_csv(fdi_csv_path)
raw_data.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
0,2,'004,Afghanistan,23082,Total FDI inflows,6110,Value US$,1990,1990,million USD,1e-05,X,UNCTAD
1,2,'004,Afghanistan,23082,Total FDI inflows,6110,Value US$,1991,1991,million USD,-0.28,X,UNCTAD
2,2,'004,Afghanistan,23082,Total FDI inflows,6110,Value US$,1992,1992,million USD,0.36,X,UNCTAD
3,2,'004,Afghanistan,23082,Total FDI inflows,6110,Value US$,1993,1993,million USD,-0.02,X,UNCTAD
4,2,'004,Afghanistan,23082,Total FDI inflows,6110,Value US$,1994,1994,million USD,0.02,X,UNCTAD


In [3]:
# Dimensions of the raw table as (rows, columns)
raw_data.shape

(40442, 13)

In [4]:
# Distinct values of the 'Item' column, used to decide which items to keep
raw_data['Item'].unique()

array(['Total FDI inflows', 'Total FDI outflows',
       'FDI inflows to Agriculture, Forestry and Fishing',
       'FDI inflows to Food, Beverages and Tobacco',
       'FDI outflows to Agriculture, Forestry and Fishing',
       'FDI outflows to Food, Beverages and Tobacco'], dtype=object)

In [5]:
# Distinct values of the 'Element' column, used to decide which elements to keep
raw_data['Element'].unique()

array(['Value US$', 'Value US$, 2015 prices',
       'Share of Total FDI inflows US$, 2015 prices',
       'Share of Total FDI outflows US$, 2015 prices'], dtype=object)

For our project, we consider only FDI inflows, because investment coming into 
a country from foreign entities is generally more relevant to farmers' 
producer prices. 

Moreover, two kinds of metrics will be used to measure FDI inflows: 
'Value US$, 2015 prices' and 'Share of Total FDI inflows US$, 2015 prices'.

In [6]:
# Keep only the inflow items, and only the two 2015-price elements
items_to_keep = [
    'Total FDI inflows',
    'FDI inflows to Agriculture, Forestry and Fishing',
    'FDI inflows to Food, Beverages and Tobacco',
]
elements_to_keep = [
    'Value US$, 2015 prices',
    'Share of Total FDI inflows US$, 2015 prices',
]

# One boolean mask per criterion; a row survives only if both are True
is_wanted_item = raw_data['Item'].isin(items_to_keep)
is_wanted_element = raw_data['Element'].isin(elements_to_keep)

filtered_data = raw_data.loc[is_wanted_item & is_wanted_element]
filtered_data.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
32,2,'004,Afghanistan,23082,Total FDI inflows,6184,"Value US$, 2015 prices",1990,1990,million USD,2.4e-05,X,UNCTAD
33,2,'004,Afghanistan,23082,Total FDI inflows,6184,"Value US$, 2015 prices",1991,1991,million USD,-0.652408,X,UNCTAD
34,2,'004,Afghanistan,23082,Total FDI inflows,6184,"Value US$, 2015 prices",1992,1992,million USD,0.819359,X,UNCTAD
35,2,'004,Afghanistan,23082,Total FDI inflows,6184,"Value US$, 2015 prices",1993,1993,million USD,-0.044542,X,UNCTAD
36,2,'004,Afghanistan,23082,Total FDI inflows,6184,"Value US$, 2015 prices",1994,1994,million USD,0.04363,X,UNCTAD


In [7]:
# Sanity check: only the three inflow items should remain after filtering
filtered_data['Item'].unique()

array(['Total FDI inflows',
       'FDI inflows to Agriculture, Forestry and Fishing',
       'FDI inflows to Food, Beverages and Tobacco'], dtype=object)

In [8]:
# Sanity check: only the two 2015-price elements should remain after filtering
filtered_data['Element'].unique()

array(['Value US$, 2015 prices',
       'Share of Total FDI inflows US$, 2015 prices'], dtype=object)

In [13]:
# Number of rows per item in the filtered data (both elements combined)
filtered_data['Item'].value_counts()

Item
Total FDI inflows                                   7666
FDI inflows to Agriculture, Forestry and Fishing    3302
FDI inflows to Food, Beverages and Tobacco          2310
Name: count, dtype: int64

We can make two separate datasets based on the two elements: value in USD and 
share of total FDI (%). Later, we can combine these two datasets into 
an integrated dataset.

## FDI in terms of USD, 2015 prices

In [9]:
# Subset holding only the rows measured as 'Value US$, 2015 prices'
is_usd_value = filtered_data['Element'].eq('Value US$, 2015 prices')
filtered_data_1 = filtered_data.loc[is_usd_value]
filtered_data_1.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
32,2,'004,Afghanistan,23082,Total FDI inflows,6184,"Value US$, 2015 prices",1990,1990,million USD,2.4e-05,X,UNCTAD
33,2,'004,Afghanistan,23082,Total FDI inflows,6184,"Value US$, 2015 prices",1991,1991,million USD,-0.652408,X,UNCTAD
34,2,'004,Afghanistan,23082,Total FDI inflows,6184,"Value US$, 2015 prices",1992,1992,million USD,0.819359,X,UNCTAD
35,2,'004,Afghanistan,23082,Total FDI inflows,6184,"Value US$, 2015 prices",1993,1993,million USD,-0.044542,X,UNCTAD
36,2,'004,Afghanistan,23082,Total FDI inflows,6184,"Value US$, 2015 prices",1994,1994,million USD,0.04363,X,UNCTAD


In [10]:
# Sanity check: the subset should contain a single element
filtered_data_1['Element'].unique()

array(['Value US$, 2015 prices'], dtype=object)

In [12]:
# Number of rows per item within the 'Value US$, 2015 prices' subset
filtered_data_1['Item'].value_counts()

Item
Total FDI inflows                                   7666
FDI inflows to Agriculture, Forestry and Fishing    1651
FDI inflows to Food, Beverages and Tobacco          1155
Name: count, dtype: int64

In [None]:
# Reshape long -> wide: one row per (area, year), one column per FDI item.
# Note: pivot_table aggregates with its default (mean) if a key/item pair repeats.
pivot_keys_1 = ['Area Code', 'Area', 'Year Code', 'Year']

pivoted_data_1 = (
    filtered_data_1
    .pivot_table(index=pivot_keys_1, columns='Item', values='Value')
    .reset_index()               # key levels become ordinary columns again
    .rename_axis(columns=None)   # drop the leftover 'Item' label on the column index
)

pivoted_data_1.head()

Unnamed: 0,Area Code,Area,Year Code,Year,"FDI inflows to Agriculture, Forestry and Fishing","FDI inflows to Food, Beverages and Tobacco",Total FDI inflows
0,1,Armenia,1990,1990,,,9.738913
1,1,Armenia,1991,1991,,,47.391556
2,1,Armenia,1992,1992,,,5.667964
3,1,Armenia,1993,1993,,,1.845575
4,1,Armenia,1994,1994,,,18.069882


In [15]:
# Map the original headers to short snake_case-style names for the USD-value table
usd_column_names = {
    'Area Code': 'area_code',
    'Area': 'area',
    'Year Code': 'year_code',
    'Year': 'year',
    'FDI inflows to Agriculture, Forestry and Fishing': 'FDI_ag_forest_fish',
    'FDI inflows to Food, Beverages and Tobacco': 'FDI_food_industry',
    'Total FDI inflows': 'total_FDI_inflows',
}

cleaned_data_1 = pivoted_data_1.rename(columns=usd_column_names)
cleaned_data_1.head()

Unnamed: 0,area_code,area,year_code,year,FDI_ag_forest_fish,FDI_food_industry,total_FDI_inflows
0,1,Armenia,1990,1990,,,9.738913
1,1,Armenia,1991,1991,,,47.391556
2,1,Armenia,1992,1992,,,5.667964
3,1,Armenia,1993,1993,,,1.845575
4,1,Armenia,1994,1994,,,18.069882


## FDI in terms of 'Share of Total FDI inflows US$, 2015 prices'

In [16]:
# Subset holding only the rows measured as a share of total FDI inflows
is_inflow_share = filtered_data['Element'].eq('Share of Total FDI inflows US$, 2015 prices')
filtered_data_2 = filtered_data.loc[is_inflow_share]
filtered_data_2.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
134,3,'008,Albania,23080,"FDI inflows to Agriculture, Forestry and Fishing",61410,"Share of Total FDI inflows US$, 2015 prices",2004,2004,%,0.190478,X,
135,3,'008,Albania,23080,"FDI inflows to Agriculture, Forestry and Fishing",61410,"Share of Total FDI inflows US$, 2015 prices",2005,2005,%,0.188456,X,
136,3,'008,Albania,23080,"FDI inflows to Agriculture, Forestry and Fishing",61410,"Share of Total FDI inflows US$, 2015 prices",2006,2006,%,0.772201,X,
137,3,'008,Albania,23080,"FDI inflows to Agriculture, Forestry and Fishing",61410,"Share of Total FDI inflows US$, 2015 prices",2007,2007,%,0.416667,X,
138,3,'008,Albania,23080,"FDI inflows to Agriculture, Forestry and Fishing",61410,"Share of Total FDI inflows US$, 2015 prices",2008,2008,%,-8.120301,X,


In [17]:
# Number of rows per item within the share-of-total subset
filtered_data_2['Item'].value_counts()

Item
FDI inflows to Agriculture, Forestry and Fishing    1651
FDI inflows to Food, Beverages and Tobacco          1155
Name: count, dtype: int64

In [None]:
# Reshape long -> wide: one row per (area, year), one column per FDI item.
# Note: pivot_table aggregates with its default (mean) if a key/item pair repeats.
pivot_keys_2 = ['Area Code', 'Area', 'Year Code', 'Year']

pivoted_data_2 = (
    filtered_data_2
    .pivot_table(index=pivot_keys_2, columns='Item', values='Value')
    .reset_index()               # key levels become ordinary columns again
    .rename_axis(columns=None)   # drop the leftover 'Item' label on the column index
)

pivoted_data_2.head()

Unnamed: 0,Area Code,Area,Year Code,Year,"FDI inflows to Agriculture, Forestry and Fishing","FDI inflows to Food, Beverages and Tobacco"
0,1,Armenia,2000,2000,,-9.788868
1,1,Armenia,2001,2001,0.286123,4.005722
2,1,Armenia,2002,2002,3.068592,5.956679
3,1,Armenia,2003,2003,,6.365964
4,1,Armenia,2004,2004,,14.767661


In [19]:
# Map the original headers to short names; the '_share' suffix marks percentage columns
share_column_names = {
    'Area Code': 'area_code',
    'Area': 'area',
    'Year Code': 'year_code',
    'Year': 'year',
    'FDI inflows to Agriculture, Forestry and Fishing': 'FDI_ag_forest_fish_share',
    'FDI inflows to Food, Beverages and Tobacco': 'FDI_food_industry_share',
}

cleaned_data_2 = pivoted_data_2.rename(columns=share_column_names)
cleaned_data_2.head()

Unnamed: 0,area_code,area,year_code,year,FDI_ag_forest_fish_share,FDI_food_industry_share
0,1,Armenia,2000,2000,,-9.788868
1,1,Armenia,2001,2001,0.286123,4.005722
2,1,Armenia,2002,2002,3.068592,5.956679
3,1,Armenia,2003,2003,,6.365964
4,1,Armenia,2004,2004,,14.767661


In [20]:
# Column dtypes and non-null counts of the share table
cleaned_data_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1887 entries, 0 to 1886
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   area_code                 1887 non-null   int64  
 1   area                      1887 non-null   object 
 2   year_code                 1887 non-null   int64  
 3   year                      1887 non-null   int64  
 4   FDI_ag_forest_fish_share  1651 non-null   float64
 5   FDI_food_industry_share   1155 non-null   float64
dtypes: float64(2), int64(3), object(1)
memory usage: 88.6+ KB


In [21]:
# Column dtypes and non-null counts of the USD-value table
cleaned_data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7666 entries, 0 to 7665
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   area_code           7666 non-null   int64  
 1   area                7666 non-null   object 
 2   year_code           7666 non-null   int64  
 3   year                7666 non-null   int64  
 4   FDI_ag_forest_fish  1651 non-null   float64
 5   FDI_food_industry   1155 non-null   float64
 6   total_FDI_inflows   7666 non-null   float64
dtypes: float64(3), int64(3), object(1)
memory usage: 419.4+ KB


## Combining two datasets

In [23]:
# Left-join the share columns onto the USD-value table, matching on area and year,
# so every row of cleaned_data_1 is kept even when no share data exists for it
join_keys = ['area_code', 'area', 'year_code', 'year']
merged_data = cleaned_data_1.merge(cleaned_data_2, how='left', on=join_keys)

merged_data.head()

Unnamed: 0,area_code,area,year_code,year,FDI_ag_forest_fish,FDI_food_industry,total_FDI_inflows,FDI_ag_forest_fish_share,FDI_food_industry_share
0,1,Armenia,1990,1990,,,9.738913,,
1,1,Armenia,1991,1991,,,47.391556,,
2,1,Armenia,1992,1992,,,5.667964,,
3,1,Armenia,1993,1993,,,1.845575,,
4,1,Armenia,1994,1994,,,18.069882,,


In [24]:
# Column dtypes and non-null counts of the merged table
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7666 entries, 0 to 7665
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   area_code                 7666 non-null   int64  
 1   area                      7666 non-null   object 
 2   year_code                 7666 non-null   int64  
 3   year                      7666 non-null   int64  
 4   FDI_ag_forest_fish        1651 non-null   float64
 5   FDI_food_industry         1155 non-null   float64
 6   total_FDI_inflows         7666 non-null   float64
 7   FDI_ag_forest_fish_share  1651 non-null   float64
 8   FDI_food_industry_share   1155 non-null   float64
dtypes: float64(5), int64(3), object(1)
memory usage: 539.1+ KB


In [25]:
# exporting cleaned data as csv file
# `index` must be the boolean False: the string 'False' is non-empty and therefore
# truthy, so pandas would still write the row index as an extra unnamed first column.
merged_data.to_csv('cleaned_datasets/foreign_investment_cleaned.csv', index=False)