In [103]:
import pandas as pd
import numpy as np
Indonesia=pd.read_csv('https://github.com/PeishanLi/G5055_Practicum_Project2/raw/main/Data/Indonesia.csv')
Guatemala=pd.read_csv('https://github.com/PeishanLi/G5055_Practicum_Project2/raw/main/Data/Guatemala.csv')

In [80]:
def preprocess_for_correlation(df): 
    df_copy  = df.copy()

    # If a column with unique identifying info has nothing, we then will have blanks for those fields - otherwise concatenating the text will be NanNan etc. 
    df_copy = df_copy.fillna('')

    # Concatenated Column with a number of identifiers 
    df_copy['UniqueID'] = df_copy[['SeriesCode','[Sex]',\
                                   '[Deviation Level]', '[Mountain Elevation]',\
                                   '[Parliamentary committees]', '[Mode of transportation]',\
                                   '[Type of speed]', '[Policy instruments]', '[Type of skill]',\
                                   '[Education level]', '[Location]', '[Food Waste Sector]',\
                                   '[Freq]', '[Type of product]', '[Observation Status]',\
                                   '[Type of occupation]','[Name of non-communicable disease]', '[Level/Status]',\
                                   '[Age]', '[Disability status]','[Frequency of Chlorophyll-a concentration]',\
                                   '[Activity]', '[Level of requirement]', '[Quantile]',\
                                   '[IHR Capacity]','[Name of international institution]'\
                                  ]].apply(lambda x: ' '.join(x), axis=1)
    duplicates = df_copy[df_copy.duplicated(subset=['UniqueID','Goal', 'Target', 'Indicator',\
                                      'SeriesCode', 'SeriesDescription','Source','TimePeriod'])]
    duplicates # if you print duplicates (outside of this function) you will find a small number of duplicate columns; 
    # however when I looked at them, the values were largely consistent - the only differing column I could find was Value. 
    # Let me know if you think this should be done differently. 
    
    # For the time being I will remove columns that are duplicates across all columns (less that of 'Values')
    df_copy = df_copy[['UniqueID','Source','Goal', 'Target', 'Indicator', 'SeriesCode', 'SeriesDescription','[Units]','[Nature]',
       'GeoAreaCode', 'GeoAreaName', 'Time_Detail', 'Value','[Reporting Type]','TimePeriod']].drop_duplicates(subset = ['UniqueID','Source','Goal', 'Target', 'Indicator', 'SeriesCode', 'SeriesDescription','[Units]','[Nature]',
       'GeoAreaCode', 'GeoAreaName', 'Time_Detail','[Reporting Type]','TimePeriod'])
    df_copy = df_copy[['UniqueID','Goal', 'Target', 'Indicator', 'SeriesCode','TimePeriod','Value']].drop_duplicates(subset = ['UniqueID', 'Goal', 'Target', 'Indicator', 'SeriesCode', 'TimePeriod'])   
    df_copy = df_copy.dropna()
    df_copy['Value']=pd.to_numeric(df_copy['Value'])
    return df_copy

In [91]:
processedIndo=preprocess_for_correlation(Indonesia)
print(processedIndo)
processedIndo=processedIndo.drop([4232])
print(processedIndo)
#processedIndo.to_csv('/content/drive/MyDrive/Colab Notebooks/Practicum SDG Networks/Data/processedIndo.csv')

                                   UniqueID Goal  ... TimePeriod       Value
0      SI_POV_DAY1                             1  ...       2012     9.50000
1      SI_POV_DAY1                             1  ...       2013     7.30000
2      SI_POV_DAY1                             1  ...       2014     6.20000
3      SI_POV_DAY1                             1  ...       2015     5.80000
4      SI_POV_DAY1                             1  ...       2016     5.20000
...                                     ...  ...  ...        ...         ...
4228  DC_FTA_TOTAL                            17  ...       2016  1053.97416
4229  DC_FTA_TOTAL                            17  ...       2017  1168.49150
4230  DC_FTA_TOTAL                            17  ...       2018  2042.80551
4231  DC_FTA_TOTAL                            17  ...       2019   886.47354
4232                                              ...                    NaN

[4231 rows x 7 columns]
                                   UniqueID Goal  .

In [108]:
def calculate_correlation(df):
  Allindicators=df['Indicator'].drop_duplicates()
  for indicator in Allindicators:
    AllID=list(df['UniqueID'].drop_duplicates())
    if len(AllID)>1:
      group=df[df['Indicator']==indicator]
      group=group.drop(columns=['Goal','Target','Indicator','SeriesCode'],axis=1)
      group_pivot=group.pivot(index=['TimePeriod'],columns=['UniqueID'],values='Value').reset_index()
      group_pivot=group_pivot.apply(lambda x:x.astype(float))
      correlation=group_pivot.corr()
      path='/content/drive/MyDrive/Colab Notebooks/Practicum SDG Networks/Data/Indonesia Correlation among measurements/Indonesia Correlation among measurements of Indicator '+indicator+'.csv'
      correlation.to_csv(path)
      print('Indicator '+indicator+' completed')

In [109]:
calculate_correlation(processedIndo)

Indicator 1.1.1 completed
Indicator 1.2.1 completed
Indicator 1.3.1 completed
Indicator 1.4.1 completed
Indicator 1.4.2 completed
Indicator 1.5.2 completed
Indicator 1.5.1 completed
Indicator 1.5.3 completed
Indicator 1.5.4 completed
Indicator 1.a.2 completed
Indicator 1.a.1 completed
Indicator 2.1.1 completed
Indicator 2.1.2 completed
Indicator 2.2.1 completed
Indicator 2.2.2 completed
Indicator 2.2.3 completed
Indicator 2.3.1 completed
Indicator 2.3.2 completed
Indicator 2.5.1 completed
Indicator 2.5.2 completed
Indicator 2.a.1 completed
Indicator 2.a.2 completed
Indicator 2.b.1 completed
Indicator 2.c.1 completed
Indicator 3.1.2 completed
Indicator 3.1.1 completed
Indicator 3.2.1 completed
Indicator 3.2.2 completed
Indicator 3.3.3 completed
Indicator 3.3.5 completed
Indicator 3.3.2 completed
Indicator 3.3.4 completed
Indicator 3.4.1 completed
Indicator 3.4.2 completed
Indicator 3.5.2 completed
Indicator 3.5.1 completed
Indicator 3.6.1 completed
Indicator 3.7.1 completed
Indicator 3.

In [106]:
processedGuate=preprocess_for_correlation(Guatemala)
print(processedGuate)
processedGuate=processedGuate.drop([4794])
print(processedGuate)
#processedGuate.to_csv('/content/drive/MyDrive/Colab Notebooks/Practicum SDG Networks/Data/processedGuate.csv')

                                            UniqueID Goal  ... TimePeriod      Value
0               SI_POV_DAY1                             1  ...       2014    8.80000
1      SI_POV_EMP1 FEMALE                 15+           1  ...       2012    7.90000
2      SI_POV_EMP1 FEMALE                 25+           1  ...       2012    7.00000
3     SI_POV_EMP1 BOTHSEX                 25+           1  ...       2012    4.30000
4     SI_POV_EMP1 BOTHSEX                 15+           1  ...       2012    6.30000
...                                              ...  ...  ...        ...        ...
4790           DC_FTA_TOTAL                            17  ...       2016   77.71732
4791           DC_FTA_TOTAL                            17  ...       2017   94.32348
4792           DC_FTA_TOTAL                            17  ...       2018  289.73249
4793           DC_FTA_TOTAL                            17  ...       2019   61.08202
4794                                                       ...   

In [111]:
def calculate_correlation(df):
  Allindicators=df['Indicator'].drop_duplicates()
  for indicator in Allindicators:
    AllID=list(df['UniqueID'].drop_duplicates())
    if len(AllID)>1:
      group=df[df['Indicator']==indicator]
      group=group.drop(columns=['Goal','Target','Indicator','SeriesCode'],axis=1)
      group_pivot=group.pivot(index=['TimePeriod'],columns=['UniqueID'],values='Value').reset_index()
      group_pivot=group_pivot.apply(lambda x:x.astype(float))
      correlation=group_pivot.corr()
      path='/content/drive/MyDrive/Colab Notebooks/Practicum SDG Networks/Data/Guatemala Correlation among measurements/Guatemala Correlation among measurements of Indicator '+indicator+'.csv'
      correlation.to_csv(path)
      print('Indicator '+indicator+' completed')

In [113]:
calculate_correlation(processedGuate)

Indicator 1.1.1 completed
Indicator 1.2.1 completed
Indicator 1.2.2 completed
Indicator 1.3.1 completed
Indicator 1.4.1 completed
Indicator 1.5.2 completed
Indicator 1.5.1 completed
Indicator 1.5.3 completed
Indicator 1.5.4 completed
Indicator 1.a.2 completed
Indicator 1.a.1 completed
Indicator 2.1.1 completed
Indicator 2.1.2 completed
Indicator 2.2.1 completed
Indicator 2.2.2 completed
Indicator 2.2.3 completed
Indicator 2.3.1 completed
Indicator 2.3.2 completed
Indicator 2.5.1 completed
Indicator 2.5.2 completed
Indicator 2.a.1 completed
Indicator 2.a.2 completed
Indicator 2.c.1 completed
Indicator 3.1.2 completed
Indicator 3.1.1 completed
Indicator 3.2.1 completed
Indicator 3.2.2 completed
Indicator 3.3.1 completed
Indicator 3.3.3 completed
Indicator 3.3.5 completed
Indicator 3.3.2 completed
Indicator 3.3.4 completed
Indicator 3.4.1 completed
Indicator 3.4.2 completed
Indicator 3.5.2 completed
Indicator 3.5.1 completed
Indicator 3.6.1 completed
Indicator 3.7.1 completed
Indicator 3.