In [3]:
# required packages
import pandas as pd
import numpy as np

In [4]:
# Loading original data
raw_data = pd.read_csv('/Users/gurjitsingh/Desktop/MS Data Science/MS_Project_Python/raw_datasets/Credit_to_Agriculture/Investment_CreditAgriculture_E_All_Data_(Normalized)/Investment_CreditAgriculture_E_All_Data_(Normalized).csv')
raw_data.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,'004,Afghanistan,23018,Total Credit,6224,Value Standard Local Currency,2006,2006,million SLC,14520.67225,X
1,2,'004,Afghanistan,23018,Total Credit,6224,Value Standard Local Currency,2007,2007,million SLC,28140.40959,X
2,2,'004,Afghanistan,23018,Total Credit,6224,Value Standard Local Currency,2008,2008,million SLC,40000.0,A
3,2,'004,Afghanistan,23018,Total Credit,6224,Value Standard Local Currency,2009,2009,million SLC,49673.29589,X
4,2,'004,Afghanistan,23018,Total Credit,6224,Value Standard Local Currency,2010,2010,million SLC,63646.368,A


In [8]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58485 entries, 0 to 58484
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area Code        58485 non-null  int64  
 1   Area Code (M49)  58485 non-null  object 
 2   Area             58485 non-null  object 
 3   Item Code        58485 non-null  int64  
 4   Item             58485 non-null  object 
 5   Element Code     58485 non-null  int64  
 6   Element          58485 non-null  object 
 7   Year Code        58485 non-null  int64  
 8   Year             58485 non-null  int64  
 9   Unit             55455 non-null  object 
 10  Value            58485 non-null  float64
 11  Flag             58485 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 5.4+ MB


In [9]:
# Finding unique values of Element column
raw_data['Element'].unique()

array(['Value Standard Local Currency',
       'Value Standard Local Currency, 2015 prices', 'Value US$',
       'Value US$, 2015 prices', 'Share of Total Credit US$, 2015 prices',
       'Agriculture orientation index US$, 2015 prices'], dtype=object)

In [10]:
raw_data['Item'].unique()

array(['Total Credit', 'Credit to Agriculture, Forestry and Fishing',
       'Credit to Agriculture', 'Credit to Agriculture and Forestry',
       'Credit to Fishery', 'Credit to Agriculture and Fishery',
       'Credit to Forestry', 'Credit to Forestry and Fishery'],
      dtype=object)

For our project we would consider just one of the items, among following: 
"Credit to Agriculture", and "Credit to Agriculture, Forestry and Fishing".
It's 'Value US$, 2015 prices', 'Share of Total Credit US$, 2015 prices', 
'Agriculture orientation index US$, 2015 prices'. To keep the currency units 
standard across all the datasets, we are not considering values in local currencies. 
So, we would filter the raw_data accordingly.

In [11]:
# finding the amount of data avilable for each of the items
raw_data['Item'].value_counts()

Item
Credit to Agriculture, Forestry and Fishing    21154
Total Credit                                   17757
Credit to Agriculture                           6755
Credit to Fishery                               3674
Credit to Agriculture and Fishery               3288
Credit to Agriculture and Forestry              2504
Credit to Forestry                              1677
Credit to Forestry and Fishery                  1676
Name: count, dtype: int64

In [12]:
# proportion of each unique value in "Item" column
raw_data['Item'].value_counts(normalize=True)

Item
Credit to Agriculture, Forestry and Fishing    0.361700
Total Credit                                   0.303616
Credit to Agriculture                          0.115500
Credit to Fishery                              0.062820
Credit to Agriculture and Fishery              0.056220
Credit to Agriculture and Forestry             0.042814
Credit to Forestry                             0.028674
Credit to Forestry and Fishery                 0.028657
Name: proportion, dtype: float64

For our project, we would consider "Credit to Agriculture, Forestry and Fishing" 
because for this item, we have greater amount of data available among all the items. 

In [5]:
# Filter the data
items_to_keep = [
    'Credit to Agriculture, Forestry and Fishing',
    'Total Credit',
    'Credit to Agriculture',
    'Credit to Agriculture and Fishery',
    'Credit to Agriculture and Forestry'
]

elements_to_keep = [
    'Value US$, 2015 prices',
    'Share of Total Credit US$, 2015 prices', 
    'Agriculture orientation index US$, 2015 prices'
    ]

filtered_data = raw_data.loc[
    (raw_data['Item'].isin(items_to_keep)) &
    (raw_data['Element'].isin(elements_to_keep))
    ]

filtered_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19180 entries, 45 to 58484
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area Code        19180 non-null  int64  
 1   Area Code (M49)  19180 non-null  object 
 2   Area             19180 non-null  object 
 3   Item Code        19180 non-null  int64  
 4   Item             19180 non-null  object 
 5   Element Code     19180 non-null  int64  
 6   Element          19180 non-null  object 
 7   Year Code        19180 non-null  int64  
 8   Year             19180 non-null  int64  
 9   Unit             16150 non-null  object 
 10  Value            19180 non-null  float64
 11  Flag             19180 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 1.9+ MB


In [6]:
# finding units for the elements
unique_units = filtered_data[['Element', 'Unit']]
unique_units.reset_index(inplace=True)

pairs = {}
for i in range(len(unique_units)):
    pairs[unique_units['Element'][i]] = unique_units['Unit'][i]

unique_units_df = pd.DataFrame(list(pairs.items()), columns = ['Element', 'Unit'])
unique_units_df

Unnamed: 0,Element,Unit
0,"Value US$, 2015 prices",million USD
1,"Share of Total Credit US$, 2015 prices",%
2,"Agriculture orientation index US$, 2015 prices",


In [39]:
filtered_data['Element'].unique()

array(['Value US$, 2015 prices', 'Share of Total Credit US$, 2015 prices',
       'Agriculture orientation index US$, 2015 prices'], dtype=object)

### Credit in terms of 2015 US$ prices

In [7]:
filtered_data_1 = filtered_data.loc[
    filtered_data['Element']=='Value US$, 2015 prices'
]
filtered_data_1['Item'].value_counts()

Item
Total Credit                                   4966
Credit to Agriculture, Forestry and Fishing    4023
Credit to Agriculture                          1690
Credit to Agriculture and Fishery               822
Credit to Agriculture and Forestry              626
Name: count, dtype: int64

In [8]:
# Restructuring data from long format to wide format
pivoted_data_1 = filtered_data_1.pivot_table(
    index = ['Area Code', 'Area', 'Year Code', 'Year'],
    columns = 'Item',
    values = 'Value'
)

# Resetting row index
pivoted_data_1.reset_index(inplace=True)

# Setting column index to None
pivoted_data_1.columns.name = None

pivoted_data_1.head()

Unnamed: 0,Area Code,Area,Year Code,Year,Credit to Agriculture,Credit to Agriculture and Fishery,Credit to Agriculture and Forestry,"Credit to Agriculture, Forestry and Fishing",Total Credit
0,1,Armenia,1992,1992,,,,,704.0
1,1,Armenia,1993,1993,,,,,133.0
2,1,Armenia,1994,1994,,,,,185.0
3,1,Armenia,1995,1995,,,,,130.0
4,1,Armenia,1996,1996,,,,,106.0


In [17]:
# Renaming the columns
cleaned_data_1 = pivoted_data_1.rename(
    columns = {
        'Area Code': 'area_code',
        'Area': 'area',
        'Year Code': 'year_code',
        'Year': 'year',
        'Credit to Agriculture': 'credit_to_agriculture',
        'Credit to Agriculture and Fishery': 'credit_to_ag_and_fish',
        'Credit to Agriculture and Forestry': 'credit_to_ag_and_forest',
        'Credit to Agriculture, Forestry and Fishing': 'credit_to_ag_forest_fish',
        'Total Credit': 'total_credit'
    }
)
cleaned_data_1.head(10)

Unnamed: 0,area_code,area,year_code,year,credit_to_agriculture,credit_to_ag_and_fish,credit_to_ag_and_forest,credit_to_ag_forest_fish,total_credit
0,1,Armenia,1992,1992,,,,,704.0
1,1,Armenia,1993,1993,,,,,133.0
2,1,Armenia,1994,1994,,,,,185.0
3,1,Armenia,1995,1995,,,,,130.0
4,1,Armenia,1996,1996,,,,,106.0
5,1,Armenia,1997,1997,,,,,118.0
6,1,Armenia,1998,1998,,,,,155.0
7,1,Armenia,1999,1999,,,,,189.0
8,1,Armenia,2000,2000,36.830029,,,36.830029,228.0
9,1,Armenia,2001,2001,32.00798,,,32.00798,201.0


In [10]:
# Impute missing values in the 'credit_to_ag_forest_fish' column using a priority order:
# Step 1: Fill NaNs with values from 'credit_to_ag_and_forest' (highest priority)
# Step 2: If still NaN, fill with values from 'credit_to_ag_and_fish'
# Step 3: If still NaN, fill with values from 'credit_to_agriculture' (lowest priority)

cleaned_data_1['credit_to_ag_forest_fish'] = (
    cleaned_data_1['credit_to_ag_forest_fish']
    .fillna(cleaned_data_1['credit_to_ag_and_forest'])
    .fillna(cleaned_data_1['credit_to_ag_and_fish'])
    .fillna(cleaned_data_1['credit_to_agriculture'])
)

cleaned_data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4966 entries, 0 to 4965
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   area_code                 4966 non-null   int64  
 1   area                      4966 non-null   object 
 2   year_code                 4966 non-null   int64  
 3   year                      4966 non-null   int64  
 4   credit_to_agriculture     1690 non-null   float64
 5   credit_to_ag_and_fish     822 non-null    float64
 6   credit_to_ag_and_forest   626 non-null    float64
 7   credit_to_ag_forest_fish  4031 non-null   float64
 8   total_credit              4966 non-null   float64
dtypes: float64(5), int64(3), object(1)
memory usage: 349.3+ KB


In [22]:
cleaned_data_1 = cleaned_data_1.drop(
    ['credit_to_agriculture', 'credit_to_ag_and_fish', 'credit_to_ag_and_forest'],
    axis = 1
)
cleaned_data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4966 entries, 0 to 4965
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   area_code                 4966 non-null   int64  
 1   area                      4966 non-null   object 
 2   year_code                 4966 non-null   int64  
 3   year                      4966 non-null   int64  
 4   credit_to_ag_forest_fish  4023 non-null   float64
 5   total_credit              4966 non-null   float64
dtypes: float64(2), int64(3), object(1)
memory usage: 232.9+ KB


### Credit in terms of share in total

In [23]:
filtered_data_2 = filtered_data.loc[
    filtered_data['Element']=='Share of Total Credit US$, 2015 prices'
]
filtered_data_2['Item'].value_counts()

Item
Credit to Agriculture, Forestry and Fishing    4023
Name: count, dtype: int64

In [24]:
# Restructuring data from long format to wide format
pivoted_data_2 = filtered_data_2.pivot_table(
    index = ['Area Code', 'Area', 'Year Code', 'Year'],
    columns = 'Item',
    values = 'Value'
)

# Resetting row index
pivoted_data_2.reset_index(inplace=True)

# Setting column index to None
pivoted_data_2.columns.name = None

pivoted_data_2.head()

Unnamed: 0,Area Code,Area,Year Code,Year,"Credit to Agriculture, Forestry and Fishing"
0,1,Armenia,2000,2000,16.170246
1,1,Armenia,2001,2001,15.957212
2,1,Armenia,2002,2002,13.038528
3,1,Armenia,2003,2003,12.554479
4,1,Armenia,2004,2004,6.574773


In [25]:
# Renaming the columns
cleaned_data_2 = pivoted_data_2.rename(
    columns = {
        'Area Code': 'area_code',
        'Area': 'area',
        'Year Code': 'year_code',
        'Year': 'year',
        'Credit to Agriculture, Forestry and Fishing': 'credit_to_ag_forest_fish_share',
    }
)
cleaned_data_2.head()

Unnamed: 0,area_code,area,year_code,year,credit_to_ag_forest_fish_share
0,1,Armenia,2000,2000,16.170246
1,1,Armenia,2001,2001,15.957212
2,1,Armenia,2002,2002,13.038528
3,1,Armenia,2003,2003,12.554479
4,1,Armenia,2004,2004,6.574773


In [26]:
# merging datasets
merged_data_1 = pd.merge(
    cleaned_data_1, cleaned_data_2, 
    on = ['area_code', 'area', 'year_code', 'year'],
    how = 'left'
)

merged_data_1.head(10)

Unnamed: 0,area_code,area,year_code,year,credit_to_ag_forest_fish,total_credit,credit_to_ag_forest_fish_share
0,1,Armenia,1992,1992,,704.0,
1,1,Armenia,1993,1993,,133.0,
2,1,Armenia,1994,1994,,185.0,
3,1,Armenia,1995,1995,,130.0,
4,1,Armenia,1996,1996,,106.0,
5,1,Armenia,1997,1997,,118.0,
6,1,Armenia,1998,1998,,155.0,
7,1,Armenia,1999,1999,,189.0,
8,1,Armenia,2000,2000,36.830029,228.0,16.170246
9,1,Armenia,2001,2001,32.00798,201.0,15.957212


### Credit in terms of AOI

In [27]:
filtered_data_3 = filtered_data.loc[
    filtered_data['Element']=='Agriculture orientation index US$, 2015 prices'
]
filtered_data_3['Item'].value_counts()

Item
Credit to Agriculture, Forestry and Fishing    3030
Name: count, dtype: int64

In [28]:
# Restructuring data from long format to wide format
pivoted_data_3 = filtered_data_3.pivot_table(
    index = ['Area Code', 'Area', 'Year Code', 'Year'],
    columns = 'Item',
    values = 'Value'
)

# Resetting row index
pivoted_data_3.reset_index(inplace=True)

# Setting column index to None
pivoted_data_3.columns.name = None

pivoted_data_3.head()

Unnamed: 0,Area Code,Area,Year Code,Year,"Credit to Agriculture, Forestry and Fishing"
0,1,Armenia,2000,2000,1.0
1,1,Armenia,2001,2001,1.0
2,1,Armenia,2002,2002,1.0
3,1,Armenia,2003,2003,1.0
4,1,Armenia,2004,2004,0.0


In [29]:
# Renaming the columns
cleaned_data_3 = pivoted_data_3.rename(
    columns = {
        'Area Code': 'area_code',
        'Area': 'area',
        'Year Code': 'year_code',
        'Year': 'year',
        'Credit to Agriculture, Forestry and Fishing': 'AOI_credit_to_ag_forest_fish',
    }
)
cleaned_data_3.head()

Unnamed: 0,area_code,area,year_code,year,AOI_credit_to_ag_forest_fish
0,1,Armenia,2000,2000,1.0
1,1,Armenia,2001,2001,1.0
2,1,Armenia,2002,2002,1.0
3,1,Armenia,2003,2003,1.0
4,1,Armenia,2004,2004,0.0


In [30]:
# merging datasets
merged_data_2 = pd.merge(
    merged_data_1, cleaned_data_3, 
    on = ['area_code', 'area', 'year_code', 'year'],
    how = 'left'
)

merged_data_2.head(10)

Unnamed: 0,area_code,area,year_code,year,credit_to_ag_forest_fish,total_credit,credit_to_ag_forest_fish_share,AOI_credit_to_ag_forest_fish
0,1,Armenia,1992,1992,,704.0,,
1,1,Armenia,1993,1993,,133.0,,
2,1,Armenia,1994,1994,,185.0,,
3,1,Armenia,1995,1995,,130.0,,
4,1,Armenia,1996,1996,,106.0,,
5,1,Armenia,1997,1997,,118.0,,
6,1,Armenia,1998,1998,,155.0,,
7,1,Armenia,1999,1999,,189.0,,
8,1,Armenia,2000,2000,36.830029,228.0,16.170246,1.0
9,1,Armenia,2001,2001,32.00798,201.0,15.957212,1.0


In [36]:
# Exporting cleaned data to a csv file 
merged_data_2.to_csv('/Users/gurjitsingh/Desktop/MS Data Science/MS_Project_Python/cleaned_datasets/credit_to_agri_forestry_fishery_cleaned.csv', index=False, encoding='utf-8')