<a href="https://colab.research.google.com/github/wgova/time_series_trade/blob/master/data_prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [102]:
from google.colab import drive
drive.mount('/content/drive')
!pip install oec
import oec
import pandas as pd
import numpy as np
import os, os.path, csv, requests, pathlib
import math
from datetime import datetime

#%matplotlib inline

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [103]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## List of industries to add to the data

In [0]:
def build_call(*args):
    call_url = 'http://atlas.media.mit.edu/'
    for val in args:
        call_url += str(val) + '/'
    return call_url


def request_data(call_url):
    r = requests.get(call_url)
    response_dict = r.json()
    json_list = response_dict['data']  # list of dicts containing data
    return json_list


def get_countries(filename=None):
    call = build_call('attr', 'country')
    json_list = request_data(call)
    if filename is not None:
        data_to_csv(json_list, filename)
    return json_list


def get_products(classification, filename=None):
    call = build_call('attr', classification)
    json_list = request_data(call)
    if filename is not None:
        data_to_csv(json_list, filename)
    return json_list


def get_trade(classification, trade_flow, year, origin, destination,product, filename=None):
    call = build_call(classification, trade_flow, year, origin, destination,
                      product)
    json_list = request_data(call)
    if filename is not None:
        data_to_csv(json_list, filename)
    return json_list


def trade_params(classification, trade_flow, year, origin, destination,product):
    parameters = {'classification': classification,
                  'trade_flow': trade_flow,
                  'year': year,
                  'origin': origin,
                  'destination': destination,
                  'product': product}
    return parameters


def get_header(json_list):
    header = set()
    for dict in json_list:
        header.update(dict.keys())
    header = list(header)
    header.sort()
    return header


def create_csv(json_list, filename):
    with open(filename, 'w') as csvfile:
        header = get_header(json_list)
        cw = csv.writer(csvfile)
        cw.writerow(header)
        for dict in json_list:
            row = dict_to_list(dict, header)
            cw.writerow(row)


def dict_to_list(dict, header):
    row = []
    for field in header:
        if field in dict:
            row.append(str(dict[field]))
        else:
            row.append(None)
    return row

## Generate dataset

In [0]:
trade_codes = ['6519',
               '6531',
               '8471','6571','7849','7810','7842','7764','7723','7522','2924','5419','5417','7763','7711','7188']
PATH = '/content/drive/My Drive/Stellenbosch/Webster'
os.chdir(PATH)
trade_classification = 'sitc'
def countries_and_products():
  countries = oec.get_countries()
  products = oec.get_products(trade_classification)
  create_csv(countries,f'{PATH}/list_countries.csv')
  create_csv(products,f'{PATH}/list_products.csv')

In [0]:
assert os.getcwd() == PATH

In [0]:
countries_and_products()

In [0]:
import os.path
from os import path
def download_data(list_of_codes):
  for i in trade_codes:
    csv_name = f"{trade_classification}-{i}"
    # Set parameters to extract top exporters from the API documentation
    params = {'classification': trade_classification,
          'trade_flow': 'export',
          'year': 'all',
          'origin': 'show', #Set origin to all for comparison
          'destination': 'all',
          'product': i} #Set code for product of interest
    print(f'Data for {i} processed.....')
    oec_data = get_trade(**params)
    # Save the results in CSV file
    if path.exists(PATH):
      create_csv(oec_data,f'{csv_name}.csv')#,locals(),globals())
    else:
      print('Choose destination directory')

In [109]:
download_data(trade_codes)

Data for 6519 processed.....
Data for 6531 processed.....
Data for 8471 processed.....
Data for 6571 processed.....
Data for 7849 processed.....
Data for 7810 processed.....
Data for 7842 processed.....
Data for 7764 processed.....
Data for 7723 processed.....
Data for 7522 processed.....
Data for 2924 processed.....
Data for 5419 processed.....
Data for 5417 processed.....
Data for 7763 processed.....
Data for 7711 processed.....
Data for 7188 processed.....


In [0]:
def create_df():
  trade_data = [i for i in os.listdir(os.getcwd()) if i.endswith('.csv') and 'sitc' in i]
  trade_history = []
  for file in trade_data:
    file_name = pd.read_csv(file)
    trade_history.append(file_name)
  products_dataframe = pd.concat(trade_history)
  return products_dataframe

In [0]:
trade_dframe = create_df()
trade_dframe.columns
droplist = ['export_val_growth_pct',
       'export_val_growth_pct_5', 'export_val_growth_val',
       'export_val_growth_val_5','import_val_growth_pct', 'import_val_growth_pct_5',
       'import_val_growth_val', 'import_val_growth_val_5','sitc_id_len']
trade_dframe.drop(droplist,axis=1,inplace=True)

In [112]:
trade_dframe.fillna(0,inplace=True)
trade_dframe["year"] = pd.to_datetime(trade_dframe['year'], format='%Y')
trade_dframe.head()

Unnamed: 0,export_rca,export_val,import_rca,import_val,origin_id,sitc_id,year
0,0.020916,3000.0,0.0,24000.0,afciv,606519,1962-01-01
1,0.025602,6000.0,0.0,21000.0,afcod,606519,1962-01-01
2,0.001952,1000.0,0.0,207000.0,afdza,606519,1962-01-01
3,0.078182,11000.0,0.0,36000.0,afegy,606519,1962-01-01
4,0.141766,17000.0,0.0,129000.0,afken,606519,1962-01-01


In [0]:
country_list = pd.read_csv('list_countries.csv',usecols=['id','name'],
                        na_values='',na_filter=True)
country_list.rename(columns=lambda x: x.replace('name','country'), inplace=True)
country_list.rename(columns=lambda x: x.replace('id','origin_id'), inplace=True)
trade_dframe = trade_dframe.merge(country_list, on=['origin_id'],how='left')

In [114]:
trade_dframe = trade_dframe.drop(['origin_id'],axis=1)
# trade_dframe = trade_dframe[trade_dframe['export_val'].notna()]
trade_dframe.isnull().sum()

export_rca    0
export_val    0
import_rca    0
import_val    0
sitc_id       0
year          0
country       0
dtype: int64

In [115]:
trade_dframe.head()

Unnamed: 0,export_rca,export_val,import_rca,import_val,sitc_id,year,country
0,0.020916,3000.0,0.0,24000.0,606519,1962-01-01,Cote d'Ivoire
1,0.025602,6000.0,0.0,21000.0,606519,1962-01-01,Democratic Republic of the Congo
2,0.001952,1000.0,0.0,207000.0,606519,1962-01-01,Algeria
3,0.078182,11000.0,0.0,36000.0,606519,1962-01-01,Egypt
4,0.141766,17000.0,0.0,129000.0,606519,1962-01-01,Kenya


# Extract comprehensive features

### Packages

In [116]:
!pip install tsfresh 
import os
import logging
import warnings

import pandas as pd
import matplotlib.pyplot as plt
from pandas_datareader.data import DataReader as pdr
import fix_yahoo_finance as yf
yf.pdr_override()

from tsfresh import extract_features, select_features
from tsfresh import defaults
from tsfresh.feature_extraction import feature_calculators
from tsfresh.feature_extraction.settings import ComprehensiveFCParameters
from tsfresh.utilities import dataframe_functions, profiling
from tsfresh.utilities.distribution import MapDistributor, MultiprocessingDistributor,DistributorBaseClass
from tsfresh.utilities.string_manipulation import convert_to_output_format

import tsfresh
from tsfresh.feature_extraction.settings import EfficientFCParameters
from tsfresh.utilities.dataframe_functions import roll_time_series



### Extract features

In [117]:
trade_dframe.head()

Unnamed: 0,export_rca,export_val,import_rca,import_val,sitc_id,year,country
0,0.020916,3000.0,0.0,24000.0,606519,1962-01-01,Cote d'Ivoire
1,0.025602,6000.0,0.0,21000.0,606519,1962-01-01,Democratic Republic of the Congo
2,0.001952,1000.0,0.0,207000.0,606519,1962-01-01,Algeria
3,0.078182,11000.0,0.0,36000.0,606519,1962-01-01,Egypt
4,0.141766,17000.0,0.0,129000.0,606519,1962-01-01,Kenya


In [118]:
extracted_features = extract_features(
    trade_dframe[['export_rca',"export_val","import_rca","import_val","year","country"]], 
    column_id="country", 
    column_sort="year",
    column_value=None,column_kind=None,
    chunksize=None,
    default_fc_parameters=ComprehensiveFCParameters(),
    # default_fc_parameters=EfficientFCParameters(),
    )
extracted_features.to_csv("tfresh_comprehensive_nositcnames.csv")
feature_names = extracted_features.columns


Feature Extraction:   0%|          | 0/5 [00:00<?, ?it/s][A
Feature Extraction:  20%|██        | 1/5 [00:53<03:34, 53.53s/it][A
Feature Extraction:  40%|████      | 2/5 [01:46<02:39, 53.26s/it][A
Feature Extraction:  60%|██████    | 3/5 [02:45<01:50, 55.03s/it][A
Feature Extraction:  80%|████████  | 4/5 [03:36<00:53, 53.80s/it][A
Feature Extraction: 100%|██████████| 5/5 [04:32<00:00, 54.54s/it]


In [0]:
extracted_features = extract_features(
    trade_dframe[['export_rca',"export_val","import_rca","import_val","year","country"]], 
    column_id="country", 
    column_sort="year",
    column_value=None,column_kind=None,
    chunksize=None,
    default_fc_parameters=EfficientFCParameters(),
    )
extracted_features.to_csv("tfresh_efficientfc_nositcnames.csv")
feature_names_efc = extracted_features.columns


Feature Extraction:   0%|          | 0/5 [00:00<?, ?it/s][A
Feature Extraction:  20%|██        | 1/5 [00:23<01:35, 23.87s/it][A
Feature Extraction:  40%|████      | 2/5 [00:46<01:10, 23.59s/it][A
Feature Extraction:  60%|██████    | 3/5 [01:11<00:47, 24.00s/it][A