In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats as stats

import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this also hides pandas deprecation/dtype warnings — presumably
# done to keep cell output tidy; confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw OECD extracts, one DataFrame per CSV file.
# Naming: the `dl_` prefix stands for "download".

## data source: OECD
# Paths are relative to the notebook's working directory.
# NOTE(review): "Compsite" in the last file name is kept exactly as written —
# presumably it matches the file on disk; verify before correcting.
hours_worked_csv = '../data/OECD_Avg_annual_hours_worked_per_worker_original_file.csv'
annual_wages_csv = '../data/OECD_Average_annual_wages_original_file.csv'
mortality_causes_csv = '../data/OECD_Causes_of_mortality_original_file.csv'
leading_indicators_csv = '../data/OECD_Compsite_Leading_Indicators_original_file.csv'

dl_avg_annual_hrs_worked_df = pd.read_csv(hours_worked_csv)
dl_avg_annual_wages_df = pd.read_csv(annual_wages_csv)
dl_causes_of_mortality_df = pd.read_csv(mortality_causes_csv)
dl_composite_leading_indicators_df = pd.read_csv(leading_indicators_csv)


In [3]:
# Dataset labels, one per source dataframe.
# Position k in this list pairs with the dataframe stored under key k.
value_cols = [
    'Avg. Work Hours (Annual)',
    'Avg. Wages (Annual)',
    'Mortality Causes',
    'CLI Values (Monthly)',
]
# Dataframes keyed by their position (0, 1, 2, 3) in load order.
oecd_dfs = dict(enumerate([
    dl_avg_annual_hrs_worked_df,
    dl_avg_annual_wages_df,
    dl_causes_of_mortality_df,
    dl_composite_leading_indicators_df,
]))

## Column headers to remove from the dataframes during clean-up.
## Not every dataframe is expected to contain every name listed here.
list_of_cols_to_drop = [
    "TIME", "YEA", "Unit Code", "EMPSTAT",
    "Frequency", "FREQUENCY", "Measure",
    "PowerCode Code", "PowerCode", "SERIES",
    "Reference Period Code", "Reference Period",
    "Flag Codes", "SUBJECT", "Reference", "VAR", "Flags",
]


In [4]:
# Harmonise every dataframe in `oecd_dfs` to one shared schema so they can be
# stacked later. For each dataframe, in order:
#   1. rename source-specific headers to the shared names,
#   2. tag every row with the dataset label from `value_cols`,
#   3. drop the unused columns listed in `list_of_cols_to_drop`,
#   4. select the final columns in a fixed order.
# The cleaned dataframe replaces the original under the same key in `oecd_dfs`.

# Source header -> shared header. Several source headers map to one target.
# NOTE(review): if a single file carried two headers that map to the same
# target, the result would contain duplicate column labels — the recorded
# output shows none for the current files; re-check when adding a new source.
column_renames = {"COUNTRY": "COU",
                  "LOCATION": "COU",
                  "UNIT": "Unit",
                  "Currency": "Unit",
                  "Year": "Time",
                  "Variable": "Description",
                  "Subject": "Description",
                  "Employment status": "Description",
                  "Series": "Description"
                  }

# Final column layout shared by every dataframe. Selecting with this list
# raises KeyError if a dataframe is missing any of these columns.
final_column_order = ["Dataset", "COU", "Country", "Time", "Description", "Value", "Unit"]

# Index of the last dataframe, used only in the progress message.
last_index = len(oecd_dfs) - 1

# iterate through dataframes (keys are the consecutive integers 0..last_index)
for i in range(len(oecd_dfs)):
    frame = (
        oecd_dfs[i]
        .rename(columns=column_renames)
        .assign(Dataset=value_cols[i])
        # errors="ignore" skips names that are absent from this dataframe
        .drop(columns=list_of_cols_to_drop, errors="ignore")
    )

    # reposition columns in dataframe
    # show the columns that remain before the final selection
    print(f"before columns {list(frame.columns)}")

    # store the cleaned dataframe only once every step has succeeded
    oecd_dfs[i] = frame[final_column_order]

    print(f"Completed df {i} of {last_index}")
    print(f"final columns {list(oecd_dfs[i].columns)}")
    print("=========================")

before columns ['COU', 'Country', 'Description', 'Time', 'Unit', 'Value', 'Dataset']
Completed df 0 of 3
final columns ['Dataset', 'COU', 'Country', 'Time', 'Description', 'Value', 'Unit']
before columns ['COU', 'Country', 'Description', 'Time', 'Unit', 'Value', 'Dataset']
Completed df 1 of 3
final columns ['Dataset', 'COU', 'Country', 'Time', 'Description', 'Value', 'Unit']
before columns ['Description', 'Unit', 'COU', 'Country', 'Time', 'Value', 'Dataset']
Completed df 2 of 3
final columns ['Dataset', 'COU', 'Country', 'Time', 'Description', 'Value', 'Unit']
before columns ['Description', 'COU', 'Country', 'Time', 'Unit', 'Value', 'Dataset']
Completed df 3 of 3
final columns ['Dataset', 'COU', 'Country', 'Time', 'Description', 'Value', 'Unit']


In [5]:
# Stack the four cleaned dataframes (keys 0-3) into one table; the original
# row labels are discarded in favour of a fresh consecutive index.
frames_in_order = [oecd_dfs[key] for key in (0, 1, 2, 3)]
oecd_df = pd.concat(frames_in_order, ignore_index=True)
print("Combined all df")
print("=========================")

Combined all df


In [6]:
# Preview the first five rows of the combined table.
oecd_df.head(n=5)

Unnamed: 0,Dataset,COU,Country,Time,Description,Value,Unit
0,Avg. Work Hours (Annual),AUS,Australia,2000,Total employment,1779.63,Hours
1,Avg. Work Hours (Annual),AUS,Australia,2001,Total employment,1746.37,Hours
2,Avg. Work Hours (Annual),AUS,Australia,2002,Total employment,1741.82,Hours
3,Avg. Work Hours (Annual),AUS,Australia,2003,Total employment,1735.1,Hours
4,Avg. Work Hours (Annual),AUS,Australia,2004,Total employment,1733.74,Hours


In [7]:
# Preview the last five rows of the combined table.
oecd_df.tail(n=5)

Unnamed: 0,Dataset,COU,Country,Time,Description,Value,Unit
318601,CLI Values (Monthly),RUS,Russia,19-Jan,"OECD Standardised CCI, Amplitude adjusted (Lon...",98.70093,Index
318602,CLI Values (Monthly),RUS,Russia,19-Feb,"OECD Standardised CCI, Amplitude adjusted (Lon...",98.81326,Index
318603,CLI Values (Monthly),RUS,Russia,19-Mar,"OECD Standardised CCI, Amplitude adjusted (Lon...",98.92828,Index
318604,CLI Values (Monthly),RUS,Russia,19-Apr,"OECD Standardised CCI, Amplitude adjusted (Lon...",99.03839,Index
318605,CLI Values (Monthly),RUS,Russia,19-May,"OECD Standardised CCI, Amplitude adjusted (Lon...",99.14531,Index


In [8]:
# Keep only the rows in which every column holds a value.
oecd_df = oecd_df.dropna(axis="index", how="any")

In [10]:
# Re-inspect the last five rows now that incomplete rows have been removed.
oecd_df.tail(n=5)

Unnamed: 0,Dataset,COU,Country,Time,Description,Value,Unit
318601,CLI Values (Monthly),RUS,Russia,19-Jan,"OECD Standardised CCI, Amplitude adjusted (Lon...",98.70093,Index
318602,CLI Values (Monthly),RUS,Russia,19-Feb,"OECD Standardised CCI, Amplitude adjusted (Lon...",98.81326,Index
318603,CLI Values (Monthly),RUS,Russia,19-Mar,"OECD Standardised CCI, Amplitude adjusted (Lon...",98.92828,Index
318604,CLI Values (Monthly),RUS,Russia,19-Apr,"OECD Standardised CCI, Amplitude adjusted (Lon...",99.03839,Index
318605,CLI Values (Monthly),RUS,Russia,19-May,"OECD Standardised CCI, Amplitude adjusted (Lon...",99.14531,Index


In [12]:
# Write the combined table to disk as comma-separated UTF-8 text,
# leaving the row index out of the file.
oecd_df.to_csv(
    '../resources/OECD_Dataframes.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)