# QUANDL

Reading the pickle file yields a dictionary of dictionaries: one inner dictionary per dataset.

Use the metadata in `../data/MacrodataFiles11Oct2022/SGE_TradingEconomics_Metadata.csv` as a reference when investigating how each dataset is used.

Each key has the form `SGE/<dataset_code> - Value`.

In [1]:
import pandas as pd
import numpy as np
import unicodedata
from collections import OrderedDict

# Relative path to the pickled QUANDL export.
macro_econo_file= "../data/MacrodataFiles11Oct2022/quandl_datasets.pkl"
# NOTE(security): unpickling can execute arbitrary code; only load this file from a trusted source.
# Despite the `_df` suffix, the loaded object is iterated with `.items()` further down,
# i.e. it is used as a mapping of dataset key -> observations rather than as a DataFrame.
macro_econo_df = pd.read_pickle(macro_econo_file)

In [2]:
manifest_df = pd.read_csv("../data/MacrodataFiles11Oct2022/SGE_TradingEconomics_Metadata.csv")

# Characters rewritten (or dropped) when turning a dataset title into an identifier.
char_to_replace = {
    "-": "_",
    " ": "_",
    "(": "",
    ")": "",
}
name_translation = str.maketrans(char_to_replace)

# Map every QUANDL code to [clean name, *remaining manifest columns].
# Rows are read positionally: the first column is taken as the code, the second as the name.
nice_names = {}
for code, raw_name, *other_fields in manifest_df.to_numpy().tolist():
    if code in ("ALBRATING", "DZARATING"):
        continue
    normalized = unicodedata.normalize("NFKD", raw_name).lower()
    slug = normalized.replace(" - ", "-").replace(" -", "").translate(name_translation)
    nice_names[code] = [f"quandl_{slug}"] + other_fields

nice_names

{'ANDCUR': ['quandl_andorra',
  'National currency of COUNTRYUnits: LCU ',
  nan,
  nan,
  nan],
 'ASMCUR': ['quandl_american_samoa',
  'National currency of COUNTRYUnits: LCU ',
  nan,
  nan,
  nan],
 'EURSNTX': ['quandl_euro_area_sentix_investors_sentiment',
  'This index published by Sentix tracks investor confidence in the Eurozone.Units:  Source: Sentix',
  nan,
  nan,
  nan],
 'MACDINV': ['quandl_macau_changes_in_inventories',
  'The quarterly change in inventory levels in Macau.Units: MOP THO Constant Prices, NSASource: Statistics and Census Service, Government of Macao SAR',
  nan,
  nan,
  nan],
 'MACEMP': ['quandl_macau_employed_persons',
  'The total number of employed (including self-employed) persons in Macau.Units: Thousand Volume, NSASource: Statistics and Census Service, Government of Macao SAR',
  nan,
  nan,
  nan],
 'MACGBVL': ['quandl_macau_government_budget_value',
  'The balance of the federal budget of Macau. It consists of tax revenues and expenses on government

In [11]:
def _slugify_dataset_name(raw_name):
    """Return `raw_name` NFKD-normalized, lower-cased and rewritten via `char_to_replace`."""
    lowered = unicodedata.normalize("NFKD", raw_name).lower()
    without_separators = lowered.replace(" - ", "-").replace(" -", "")
    return without_separators.translate(str.maketrans(char_to_replace))


manifest_df["new_name"] = manifest_df["name"].apply(_slugify_dataset_name)
manifest_df["new_name"].head()

0    untitled_dataset_2018_05_30_19:16:22
1                                 andorra
2                          american_samoa
3    untitled_dataset_2018_05_30_19:21:08
4    euro_area_sentix_investors_sentiment
Name: new_name, dtype: object

In [2]:
# Maps the three-letter country prefix of a QUANDL dataset code (left) to the
# two-letter code written to the `ed_country` column (right).
# NOTE(review): a few values are not the country's own ISO alpha-2 code; they are
# flagged inline below — confirm they are intended before relying on them.
COUNTRY_MAPPER = OrderedDict(
    {
        "AND": "AD",
        "ALB": "AL",
        "AUT": "AT",
        "BIH": "BA",
        "BEL": "BE",
        "BGR": "BG",
        "BLR": "BY",
        "CHE": "CH",
        "SRB": "CS",  # NOTE(review): same value as MNE; presumably the former "Serbia and Montenegro" code — confirm
        "MNE": "CS",  # NOTE(review): same value as SRB — confirm
        "CYP": "CY",
        "CZE": "CZ",
        "DEU": "DE",
        "DNK": "FO",  # NOTE(review): FO is the Faroe Islands; Denmark itself is DK — confirm
        "EST": "EE",
        "ESP": "ES",
        "FIN": "FI",
        "FRA": "FR",
        "GBR": "UK",  # NOTE(review): ISO alpha-2 for the United Kingdom is GB; presumably the target scheme uses UK — confirm
        "GRC": "GR",
        "HRV": "HR",
        "HUN": "HU",
        "IRL": "IE",
        "ISL": "IS",
        "ITA": "VA",  # NOTE(review): VA is Vatican City; Italy itself is IT — confirm
        "LIE": "LI",
        "LTU": "LT",
        "LUX": "LU",
        "LVA": "LV",
        "MCO": "MC",
        "MDA": "MD",
        "MKD": "MK",
        "MLT": "MT",
        "NLD": "NL",
        "NOR": "SJ",  # NOTE(review): SJ is Svalbard and Jan Mayen; Norway itself is NO — confirm
        "POL": "PL",
        "PRT": "PT",
        "ROU": "RO",
        "RUS": "RU",
        "SWE": "SE",
        "SVN": "SI",
        "SVK": "SK",
        "TUR": "TR",
        "UKR": "UA",
        "EUR": "XC",  # EUR prefixes euro-area datasets (e.g. EURSNTX); XC is presumably a custom aggregate code — confirm
    }
)

# Load the raw manifest CSV from disk.
manifest_df = pd.read_csv("../data/MacrodataFiles11Oct2022/SGE_TradingEconomics_Metadata.csv")

def prepare_manifest(manifest_df, country_mapper=None):
    """
    Create a nice table with the mapping between QUANDL code and its description.

    The input frame is left untouched; a cleaned copy is returned, so calling this
    function repeatedly on the same raw manifest always gives the same result.

    :param manifest_df: raw manifest data associated to QUANDL data. Must contain the
        ``code`` and ``name`` columns.
    :param country_mapper: optional mapping from the three-letter prefix of ``code`` to
        the target country code. Defaults to the module-level ``COUNTRY_MAPPER``.
        Prefixes missing from the mapping are kept unchanged.
    :return df: final dataframe to be stored: ``name`` is cleaned and the columns
        ``quandl_country`` and ``ed_country`` are appended.
    """
    if country_mapper is None:
        country_mapper = COUNTRY_MAPPER

    # Build the translation table once instead of once per row.
    translation = str.maketrans(
        {
            "-": "_",
            " ": "_",
            "(": "",
            ")": "",
        }
    )

    def clean_name(raw_name):
        # Missing names (NaN/None) cannot be normalized; pass them through untouched.
        if not isinstance(raw_name, str):
            return raw_name
        return (
            unicodedata.normalize("NFKD", raw_name)
            .lower()
            .replace(" - ", "-")
            .replace(" -", "")
            .translate(translation)
        )

    # The first three characters of the code are used as the country prefix.
    country_prefix = manifest_df["code"].str[:3]
    return manifest_df.assign(
        name=manifest_df["name"].apply(clean_name),
        quandl_country=country_prefix,
        ed_country=country_prefix.map(
            lambda prefix: country_mapper.get(prefix, prefix)
        ),
    )

# Build the cleaned manifest and preview its first rows.
new_df = prepare_manifest(manifest_df)
new_df.head()

Unnamed: 0,code,name,description,refreshed_at,from_date,to_date,quandl_country,ed_country
0,ALBRATING,untitled_dataset_2018_05_30_19:16:22,This dataset has no description.,,,,ALB,AL
1,ANDCUR,andorra,National currency of COUNTRYUnits: LCU,,,,AND,AD
2,ASMCUR,american_samoa,National currency of COUNTRYUnits: LCU,,,,ASM,ASM
3,DZARATING,untitled_dataset_2018_05_30_19:21:08,This dataset has no description.,,,,DZA,DZA
4,EURSNTX,euro_area_sentix_investors_sentiment,This index published by Sentix tracks investor...,,,,EUR,XC


In [13]:
# Look for a manifest row whose code is exactly "SRB&MNEFINF".
new_df[(new_df.code == "SRB&MNEFINF")]

Unnamed: 0,code,name,description,refreshed_at,from_date,to_date,quandl_country,ed_country


In [22]:
def prepare_dataset(ds_code, data):
    """
    Turn one QUANDL series into a long-format dataframe ready for storage.

    :param ds_code: QUANDL code of the dataset; copied into the ``name`` column.
    :param data: mapping of timestamp -> observed value; every key must expose
        ``to_pydatetime()``.
    :return df: dataframe with the columns ``name``, ``date_time`` and ``value``,
        or ``None`` when ``data`` is empty.
    """
    if len(data) == 0:
        return None
    records = [
        {
            "name": ds_code,
            # Keep only the calendar date of each timestamp.
            "date_time": timestamp.to_pydatetime().date(),
            "value": observation,
        }
        for timestamp, observation in data.items()
    ]
    return pd.DataFrame(records)

list_dfs = []
for dataset_key, observations in macro_econo_df.items():
    # TODO: The SGE label could not be always applicable. Need to re-engineer this part when/if new data from QUANDL is retrieved.
    code = dataset_key.replace("SGE/", "").replace(" - Value", "")
    # These two datasets have no description, so they are left out.
    if code in ("ALBRATING", "DZARATING"):
        continue
    dataset_df = prepare_dataset(code, observations)
    # Empty datasets come back as None and contribute nothing.
    if dataset_df is not None:
        list_dfs.append(dataset_df)

df = pd.concat(list_dfs)
df.head()

Unnamed: 0,name,date_time,value
0,ANDGPC,2009-12-31,41860.24
1,ANDGPC,2010-12-31,39627.66
2,ANDGPC,2011-12-31,38038.88
3,ANDGPC,2012-12-31,37967.8
4,ANDGPC,2013-12-31,38715.36


In [23]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session with the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)

# Hand the concatenated pandas frame over to Spark.
spark_df = spark.createDataFrame(df)

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [25]:
# Print the distinct dataset codes held in the `name` column (at most 300 rows).
spark_df.select("name").distinct().show(300)

+---------+
|     name|
+---------+
|   ANDGPC|
|    ALBCF|
|  ALBPITR|
|  AUTCNCN|
|   AUTFDI|
|    BGRLC|
|   CYPGGR|
|   CZEREM|
| ESTCCONF|
|   GRCUNR|
|   IRLGNP|
|    MKDGD|
|  NLDCPIC|
|    SVKLC|
|   ALBDIR|
|   BGRSTR|
|    CHEM0|
|    CHECI|
|  CYPGOLD|
|   CYPEMP|
|   DNKFER|
|  ESTJVAC|
|    FRACF|
|  FRAUNRY|
|   GBRFDI|
| GRCCPICM|
| MLTIMVOL|
|   SWERSM|
|   SWERSY|
|  SWEGBVL|
|  SWEEXPX|
|   TURGCP|
| BELIMVOL|
| CHECPICM|
|    ESTIP|
|   ESTFDI|
|  ESPFACT|
|   FRACPI|
|  FRASSRE|
|   HRVFDI|
|   HUNEMP|
|  HUNGPCP|
|  LTUPSAV|
|  LUXWAGE|
|   SWEGSP|
|  SVKIMPX|
| TURBCONF|
|  UKRGAGR|
|  AUTSSRC|
|  CYPDINV|
| FRAIMVOL|
|   FRAPPI|
|  GBRLUNR|
|   GBRGDG|
|   HUNBLR|
|  IRLGPCP|
| NORCPICM|
|   SWECPI|
| SWEIMVOL|
|  SVNGBVL|
| ALBBCONF|
|  CYPCARS|
|    CYPCF|
| CZEEXVOL|
| DEUDPINC|
|  DNKPITR|
|   ESTMKT|
|  ESPLUNR|
|  GBRFACT|
|  HRVCNCN|
|  HRVWAGE|
|  MLTSSRC|
|    ROUIR|
|  ROUSSRE|
|    SWENO|
|    SVKM3|
|   TURGDG|
|  AUTCARS|
|  BGRSSRC|
|  BLRWAGE|
|   