In [1]:
import pandas as pd
from pandas.io.json import json_normalize
import json
from datetime import datetime, timedelta
from dateutil import relativedelta
import calendar
import numpy as np

In [3]:
# Load the raw JSON payload and the previously retrieved table.
# The context manager closes the JSON file handle as soon as it has been read.
with open('./data/data.json', 'r') as data_file:
    dict_data = dict(json.loads(data_file.read()))
df = pd.read_csv('./data/retrieved_data.csv')

In [14]:
# The unnamed first CSV column holds the dates: give it a name and index by it.
df = df.rename(columns={"Unnamed: 0": "Date"}).set_index('Date')

In [None]:
# Move every index label forward by one day, then turn the labels into datetimes.
# strptime only accepts strings, so this cell expects the index to still hold
# 'YYYY-MM-DD' text when it runs.
shifted_labels = []
for day_label in df.index:
    next_day = datetime.strptime(day_label, '%Y-%m-%d') + timedelta(1)
    shifted_labels.append(str(next_day).split()[0])
df.index = pd.to_datetime(shifted_labels)

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Copy the prices from the JSON payload into df['OilFuture'] for every
# price whose calendar date is already present in the index, then drop all
# rows dated before 2013-11-17.
raw_prices = dict_data['prices']
for row in raw_prices:
    # NOTE(review): fromtimestamp() without a tz argument converts to the
    # machine's local time, so the resulting date can depend on where the
    # notebook runs -- confirm this is acceptable.
    str_date = str(datetime.fromtimestamp(row['time'])).split()[0]
    if str_date in df.index:
        # Round to two decimals but store a number, not the formatted text,
        # so the column stays numeric.
        df.at[str_date, 'OilFuture'] = float("%.2f" % row['price'])
df = df[(df.index >= datetime(2013, 11, 17))]

In [None]:
# Preview the first rows of df after the date filter.
df.head()

In [None]:
def get_last_week(month):
    """Return the date of the last Monday of the given month.

    Parameters
    ----------
    month : int or str
        Period whose string form parses with the '%Y%m' format.

    Returns
    -------
    datetime.date
        The last Monday that falls on or before the final day of the month.
    """
    month_start = datetime.strptime(str(month), '%Y%m')
    _, month_length = calendar.monthrange(month_start.year, month_start.month)
    # Jump to the last day of the month, then step back to the nearest Monday
    # (the day itself when it already is a Monday).
    to_last_monday = relativedelta.relativedelta(
        day=month_length,
        weekday=relativedelta.MO(-1),
    )
    return (month_start + to_last_monday).date()

In [None]:
def fill_i_e(ie_df):
    """Write monthly trade totals from one trade table into the global ``df``.

    Rows of ``ie_df`` are split by their 'Trade Flow' value and grouped by
    'Period'. The summed 'Trade Value (US$)' of every group is stored in
    ``df`` on the row keyed by ``get_last_week(period)``. Periods whose target
    date falls in 2013 before November are skipped.

    NOTE(review): rows labelled 'Exports' feed 'PPImport' and rows labelled
    'Imports' feed 'PPExport'. Presumably the files are reported from the
    trading partners' point of view -- confirm that the crossing is intended.

    Parameters
    ----------
    ie_df : pandas.DataFrame
        Table with 'Trade Flow', 'Period' and 'Trade Value (US$)' columns.

    Returns
    -------
    pandas.DataFrame
        The module-level ``df``, which is modified in place.
    """
    # (flow label in the file, df column that receives it), in the order the
    # two flows are processed.
    flow_to_column = (('Exports', 'PPImport'), ('Imports', 'PPExport'))
    for flow, column in flow_to_column:
        flow_rows = ie_df[ie_df['Trade Flow'] == flow]
        for month, group in flow_rows.groupby('Period'):
            last_week = get_last_week(month)
            if last_week.year == 2013 and last_week.month < 11:
                continue
            total = np.sum(group['Trade Value (US$)'])
            # Key with a Timestamp so the label matches a datetime index, and
            # store the two-decimal value as a number instead of as text.
            df.at[pd.Timestamp(last_week), column] = float("%.2f" % total)
    return df

In [None]:
# Numeric suffixes of the comtrade CSV files to merge into df.
comtrade_suffixes = (3, 9, 10, 11, 12, 13)
filenames = ["comtrade-{0}.csv".format(suffix) for suffix in comtrade_suffixes]

# fill_i_e updates the global df and returns it; files are applied in order.
for filename in filenames:
    df = fill_i_e(pd.read_csv('./data/' + filename))

In [None]:
# Inspect the first 20 rows of df after the trade totals were written.
df.head(20)

In [None]:
# Inspect the last 20 rows of df.
df.tail(20)

In [None]:
# Save df as CSV. This is the same path df was read from at the top of the
# notebook, so the input file is overwritten with the updated table.
df.to_csv('./data/retrieved_data.csv')

In [15]:
# Preview the first rows of df.
df.head()

Unnamed: 0_level_0,PPSpotAvgPrice,OilAvgPrice,PPImport,PPExport,ExchangeRate,PPFuture,OilFuture
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-11-18,,,,,6.1351,,101.94
2013-11-25,,,375013373.0,24344624.0,6.138,,102.14
2013-12-02,,,,,6.1325,,103.53
2013-12-09,,,,,6.1232,,104.01
2013-12-16,,,,,6.1148,,101.57


In [23]:
# Load the futures table, name its unnamed first column 'Date' and index by it.
sereja_df = (
    pd.read_csv('./data/futures.csv')
    .rename(columns={"Unnamed: 0": "Date"})
    .set_index('Date')
)
sereja_df.head()

Unnamed: 0_level_0,PPSpotAvgPrice,OilAvgPrice,PPImport,PPExport,ExchangeRate,PPFuture,OilFuture
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"(0, 2008-11-16)",,,,,6.8289,,
"(1, 2008-11-23)",,,,,6.8317,,
"(2, 2008-11-30)",,,,,6.8349,,
"(3, 2008-12-07)",,,,,6.8482,,
"(4, 2008-12-14)",,,,,6.8451,,


In [26]:
# Copy PPFuture from sereja_df into df, reading the value stored six days
# before each df date; dates without a matching entry are left untouched.
# NOTE(review): the preview of sereja_df shows index labels such as
# "(0, 2008-11-16)"; if that is the real index, no date lookup can match
# and this loop changes nothing -- confirm.
for i in df.index:
    d = pd.to_datetime(i).date()
    try:
        df.at[d, 'PPFuture'] = sereja_df.at[d - timedelta(6), 'PPFuture']
    except KeyError:
        # A missing label is the expected miss; any other error should surface
        # instead of being silently swallowed.
        continue

In [27]:
# Preview the first rows of df to check the PPFuture column.
df.head()

Unnamed: 0_level_0,PPSpotAvgPrice,OilAvgPrice,PPImport,PPExport,ExchangeRate,PPFuture,OilFuture
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-11-18,,,,,6.1351,,101.94
2013-11-25,,,375013373.0,24344624.0,6.138,,102.14
2013-12-02,,,,,6.1325,,103.53
2013-12-09,,,,,6.1232,,104.01
2013-12-16,,,,,6.1148,,101.57


In [30]:
# Load the test table, rename its "Unnamed: 0" column to 'Date' and index by it.
slava_df = (
    pd.read_csv('./data/test.csv')
    .rename(columns={"Unnamed: 0": "Date"})
    .set_index('Date')
)
slava_df.head()

Unnamed: 0.1,Unnamed: 0,Date,PPSpotAvgPrice,OilAvgPrice,PPImport,PPExport,ExchangeRate,PPFuture,OilFuture
0,0,2008-11-16,,,,,6.8289,,
1,1,2008-11-23,,,,,6.8317,,
2,2,2008-11-30,,,,,6.8349,,
3,3,2008-12-07,,,,,6.8482,,
4,4,2008-12-14,,,,,6.8451,,


In [None]:
# Copy PPFuture into df, reading the value stored six days before each df
# date; dates without a matching entry are left untouched.
# NOTE(review): the preceding cell loads slava_df, yet this loop still reads
# sereja_df -- confirm which table is meant.
for i in df.index:
    d = pd.to_datetime(i).date()
    try:
        df.at[d, 'PPFuture'] = sereja_df.at[d - timedelta(6), 'PPFuture']
    except KeyError:
        # A missing label is the expected miss; any other error should surface
        # instead of being silently swallowed.
        continue