In [None]:
# required packages
import pandas as pd
import numpy as np

In [6]:
# Load the original CSV into a DataFrame and preview its first rows
raw_csv_path = (
    'Datasets_MS_Project/Credit_to_Agriculture/'
    'Investment_CreditAgriculture_E_All_Data_(Normalized)/'
    'Investment_CreditAgriculture_E_All_Data_(Normalized).csv'
)
raw_data = pd.read_csv(raw_csv_path)
raw_data.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,'004,Afghanistan,23018,Total Credit,6224,Value Standard Local Currency,2006,2006,million SLC,14520.67225,X
1,2,'004,Afghanistan,23018,Total Credit,6224,Value Standard Local Currency,2007,2007,million SLC,28140.40959,X
2,2,'004,Afghanistan,23018,Total Credit,6224,Value Standard Local Currency,2008,2008,million SLC,40000.0,A
3,2,'004,Afghanistan,23018,Total Credit,6224,Value Standard Local Currency,2009,2009,million SLC,49673.29589,X
4,2,'004,Afghanistan,23018,Total Credit,6224,Value Standard Local Currency,2010,2010,million SLC,63646.368,A


In [7]:
# Column dtypes, non-null counts and memory usage of the raw data
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58485 entries, 0 to 58484
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area Code        58485 non-null  int64  
 1   Area Code (M49)  58485 non-null  object 
 2   Area             58485 non-null  object 
 3   Item Code        58485 non-null  int64  
 4   Item             58485 non-null  object 
 5   Element Code     58485 non-null  int64  
 6   Element          58485 non-null  object 
 7   Year Code        58485 non-null  int64  
 8   Year             58485 non-null  int64  
 9   Unit             55455 non-null  object 
 10  Value            58485 non-null  float64
 11  Flag             58485 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 5.4+ MB


In [8]:
# Distinct values of the "Element" column
raw_data['Element'].unique()

array(['Value Standard Local Currency',
       'Value Standard Local Currency, 2015 prices', 'Value US$',
       'Value US$, 2015 prices', 'Share of Total Credit US$, 2015 prices',
       'Agriculture orientation index US$, 2015 prices'], dtype=object)

In [9]:
# Distinct values of the "Item" column
raw_data['Item'].unique()

array(['Total Credit', 'Credit to Agriculture, Forestry and Fishing',
       'Credit to Agriculture', 'Credit to Agriculture and Forestry',
       'Credit to Fishery', 'Credit to Agriculture and Fishery',
       'Credit to Forestry', 'Credit to Forestry and Fishery'],
      dtype=object)

For our project we will consider just one of the following items: 
"Credit to Agriculture" or "Credit to Agriculture, Forestry and Fishing".
For the chosen item we keep three elements: 'Value US$, 2015 prices', 
'Share of Total Credit US$, 2015 prices', and 
'Agriculture orientation index US$, 2015 prices'. To keep the currency units 
consistent across all the datasets, we do not consider values in local currencies, 
so we will filter raw_data accordingly.

In [None]:
# Number of rows available for each item (most frequent first)
raw_data['Item'].value_counts()

Item
Credit to Agriculture, Forestry and Fishing    21154
Total Credit                                   17757
Credit to Agriculture                           6755
Credit to Fishery                               3674
Credit to Agriculture and Fishery               3288
Credit to Agriculture and Forestry              2504
Credit to Forestry                              1677
Credit to Forestry and Fishery                  1676
Name: count, dtype: int64

In [None]:
# Share of all rows that belongs to each unique value of the "Item" column
raw_data['Item'].value_counts(normalize=True)

Item
Credit to Agriculture, Forestry and Fishing    0.361700
Total Credit                                   0.303616
Credit to Agriculture                          0.115500
Credit to Fishery                              0.062820
Credit to Agriculture and Fishery              0.056220
Credit to Agriculture and Forestry             0.042814
Credit to Forestry                             0.028674
Credit to Forestry and Fishery                 0.028657
Name: proportion, dtype: float64

For our project, we will use "Credit to Agriculture, Forestry and Fishing" 
because this item has the largest amount of data available among all the items 
(21,154 rows, about 36% of the dataset). 

In [38]:
# Restrict the raw data to the chosen item and to its three "US$, 2015 prices" elements
elements_to_keep = [
    'Value US$, 2015 prices',
    'Share of Total Credit US$, 2015 prices',
    'Agriculture orientation index US$, 2015 prices',
]

# Build each row condition separately so the final selection reads clearly
is_target_item = raw_data['Item'].eq('Credit to Agriculture, Forestry and Fishing')
is_kept_element = raw_data['Element'].isin(elements_to_keep)
filtered_data = raw_data[is_target_item & is_kept_element]

filtered_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11076 entries, 96 to 58484
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area Code        11076 non-null  int64  
 1   Area Code (M49)  11076 non-null  object 
 2   Area             11076 non-null  object 
 3   Item Code        11076 non-null  int64  
 4   Item             11076 non-null  object 
 5   Element Code     11076 non-null  int64  
 6   Element          11076 non-null  object 
 7   Year Code        11076 non-null  int64  
 8   Year             11076 non-null  int64  
 9   Unit             8046 non-null   object 
 10  Value            11076 non-null  float64
 11  Flag             11076 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 1.1+ MB


In [43]:
# Finding units for the elements.
# drop_duplicates() lists every distinct (Element, Unit) pair in one vectorized
# pass instead of a row-by-row Python loop. Unlike a dict keyed by Element, it
# also surfaces an element that is reported under more than one unit rather than
# silently keeping only the last unit seen. Elements with a missing unit (NaN)
# are kept as a single row.
unique_units_df = (
    filtered_data[['Element', 'Unit']]
    .drop_duplicates()
    .reset_index(drop=True)   # clean 0..n-1 index for display
)
unique_units_df

Unnamed: 0,Element,Unit
0,"Value US$, 2015 prices",million USD
1,"Share of Total Credit US$, 2015 prices",%
2,"Agriculture orientation index US$, 2015 prices",


In [39]:
# Confirm which elements remain in the filtered data
filtered_data['Element'].unique()

array(['Value US$, 2015 prices', 'Share of Total Credit US$, 2015 prices',
       'Agriculture orientation index US$, 2015 prices'], dtype=object)

In [None]:
# Reshape from long to wide: one row per (area, year), one column per Element.
# NOTE: pivot_table aggregates with the mean by default, so duplicate
# (area, year, element) rows, if any exist, would be averaged.
pivoted_data = (
    filtered_data
    .pivot_table(
        values='Value',
        index=['Area Code', 'Area', 'Year Code', 'Year'],
        columns='Element',
    )
    .reset_index()               # turn the index levels back into ordinary columns
    .rename_axis(columns=None)   # clear the leftover "Element" name on the column axis
)

pivoted_data.head()

Unnamed: 0,Area Code,Area,Year Code,Year,"Agriculture orientation index US$, 2015 prices","Share of Total Credit US$, 2015 prices","Value US$, 2015 prices"
0,1,Armenia,2000,2000,1.0,16.170246,36.830029
1,1,Armenia,2001,2001,1.0,15.957212,32.00798
2,1,Armenia,2002,2002,1.0,13.038528,26.333912
3,1,Armenia,2003,2003,1.0,12.554479,26.365699
4,1,Armenia,2004,2004,0.0,6.574773,26.19609


In [44]:
# Mapping from the source column labels to the names used in the cleaned output
column_renames = {
    'Area Code': 'area_code',
    'Area': 'area',
    'Year Code': 'year_code',
    'Year': 'year',
    'Value US$, 2015 prices': 'credit_to_ag_forest_fish_2015_USD',
    'Share of Total Credit US$, 2015 prices': 'credit_to_ag_forest_fish_share_totalCredit',
    'Agriculture orientation index US$, 2015 prices': 'Agri_orientation_index_2015_USD',
}

# rename() returns a new frame; pivoted_data itself is left untouched
cleaned_data = pivoted_data.rename(columns=column_renames)
cleaned_data.head()

Unnamed: 0,area_code,area,year_code,year,Agri_orientation_index_2015_USD,credit_to_ag_forest_fish_share_totalCredit,credit_to_ag_forest_fish_2015_USD
0,1,Armenia,2000,2000,1.0,16.170246,36.830029
1,1,Armenia,2001,2001,1.0,15.957212,32.00798
2,1,Armenia,2002,2002,1.0,13.038528,26.333912
3,1,Armenia,2003,2003,1.0,12.554479,26.365699
4,1,Armenia,2004,2004,0.0,6.574773,26.19609


In [45]:
# Write the cleaned table to a CSV file (row index omitted, UTF-8 encoded)
cleaned_csv_path = 'cleaned_datasets/credit_to_agri_forestry_fishery_cleaned.csv'
cleaned_data.to_csv(cleaned_csv_path, encoding='utf-8', index=False)