In [1]:
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import graph_objs as go
from scipy.stats import pearsonr
from statsmodels.tsa.arima_model import ARIMA
from fbprophet import Prophet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Render matplotlib figures directly in the notebook output cells.
%matplotlib inline
# Set up plotly's offline notebook mode before any iplot() call is made.
# NOTE(review): connected=True presumably loads plotly.js from a CDN rather
# than embedding it in the notebook -- confirm against the plotly docs.
init_notebook_mode(connected = True)

In [2]:
# Load the dataset and use its 'Date' column as a DatetimeIndex, which the
# 'time' interpolation method below requires.
df = pd.read_csv('./data/retrieved_data.csv', index_col='Date')
df.index = pd.to_datetime(df.index)

# Fill gaps in every column with time-weighted linear interpolation.
# The result is assigned back to `df` on purpose: calling
# `df[col].interpolate(..., inplace=True)` operates on a temporary Series and,
# with pandas Copy-on-Write, leaves `df` itself unchanged.
# Values before a column's first observation are not filled, so leading NaNs
# remain (visible in the first rows displayed below).
df = df.interpolate(method='time')
df.head()

Unnamed: 0_level_0,PPSpotAvgPrice,OilAvgPrice,PPImport,PPExport,ExchangeRate,PPFuture,OilFuture
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-11-18,12016.67,4565.0,,,6.1351,,101.94
2013-11-25,12050.0,4562.5,375013373.0,24344624.0,6.138,,102.14
2013-12-02,12158.33,4567.5,384366709.8,24078257.2,6.1325,,103.53
2013-12-09,12391.67,4580.0,393720046.6,23811890.4,6.1232,,104.01
2013-12-16,12391.67,4582.5,403073383.4,23545523.6,6.1148,,101.57


In [3]:
def plotly_line(series, title=''):
    """Show one series as an interactive plotly line chart.

    Parameters
    ----------
    series : object with ``index`` and ``name`` attributes (e.g. a pandas Series)
        Its index is used for the x axis, its values for the y axis and its
        name as the legend entry.
    title : str, optional
        Chart title; empty by default.
    """
    line = go.Scatter(x=series.index, y=series, mode='lines', name=series.name)
    figure = {'data': [line], 'layout': {'title': title}}
    iplot(figure, show_link=False)

def plotly_series(series_arr, title=''):
    """Show several series together on one interactive plotly line chart.

    Parameters
    ----------
    series_arr : iterable of objects with ``index`` and ``name`` attributes
        Each element becomes one line, labelled with its ``name``.
    title : str, optional
        Chart title; empty by default.
    """
    lines = [
        go.Scatter(x=item.index, y=item, mode='lines', name=item.name)
        for item in series_arr
    ]
    figure = {'data': lines, 'layout': {'title': title}}
    iplot(figure, show_link=False)
    
def plotly_df(df, title=''):
    """Show every column of a DataFrame as a line on one interactive plotly chart.

    Parameters
    ----------
    df : DataFrame-like with ``index`` and ``columns``
        The index is shared as the x axis; each column becomes one line
        labelled with the column name.
    title : str, optional
        Chart title; empty by default.
    """
    lines = [
        go.Scatter(x=df.index, y=df[column], mode='lines', name=column)
        for column in df.columns
    ]
    figure = {'data': lines, 'layout': {'title': title}}
    iplot(figure, show_link=False)

In [4]:
def correlation_matrix(df, output_path='corr_plot.png'):
    """Draw the pairwise correlation heat-map of ``df`` and save it to disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose column-wise correlations (``df.corr()``) are plotted.
    output_path : str, optional
        Where the image is written. Defaults to ``'corr_plot.png'``.

    Returns
    -------
    None
    """
    from matplotlib import pyplot as plt

    corr = df.corr()
    # Take the labels from the correlation result so they always match the
    # rows/columns actually drawn.
    labels = [str(column) for column in corr.columns]
    positions = list(range(len(labels)))

    fig = plt.figure(figsize=(10, 8))
    ax1 = fig.add_subplot(111)
    # plt.get_cmap replaces matplotlib.cm.get_cmap, which is deprecated and
    # removed in recent matplotlib releases.
    cmap = plt.get_cmap('jet', 30)
    cax = ax1.imshow(corr, interpolation="nearest", cmap=cmap)
    ax1.grid(True)
    ax1.set_title('Feature Correlation')

    # Pin one tick per cell before labelling. Setting tick labels without
    # fixing tick positions relies on whatever ticks the auto-locator picks,
    # which can shift the labels relative to the cells.
    ax1.set_xticks(positions)
    ax1.set_yticks(positions)
    ax1.set_xticklabels(labels, fontsize=12, rotation=30)
    ax1.set_yticklabels(labels, fontsize=12)

    # Colour scale legend for the correlation values.
    fig.colorbar(cax)
    fig.savefig(output_path)

In [5]:
# Columns are selected by position, following the CSV layout shown by df.head():
# 0 PPSpotAvgPrice, 1 OilAvgPrice, 2 PPImport, 3 PPExport,
# 4 ExchangeRate, 5 PPFuture, 6 OilFuture.
plotly_df(df.iloc[:, :2], 'Average Price')
plotly_df(df.iloc[:, 2:4], 'PP Import/Export')
plotly_line(df.iloc[:, 4], 'USD/CNY')
plotly_df(df.iloc[:, 5:], 'Futures')

# The two oil columns again, each on its own chart.
plotly_line(df.iloc[:, 1], 'Oil Average Price')
plotly_line(df.iloc[:, 6], 'Oil Future')

In [6]:
def calc_df_pvalue(df):
    """Compute the matrix of Pearson-correlation p-values between all columns.

    Each pair of columns is tested on the rows where both values are present
    (pairwise-complete observations), matching how ``DataFrame.corr`` treats
    missing data. Passing NaNs straight to ``pearsonr`` would make the
    p-value NaN for any column containing a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to test against each other. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by ``df.columns``; entry ``[a, b]``
        is the two-sided p-value of the Pearson correlation between columns
        ``a`` and ``b``. Entries stay NaN when fewer than two rows have both
        values present.
    """
    columns = df.columns
    n_cols = len(columns)
    pvalmat = np.full((n_cols, n_cols), np.nan)

    for i in range(n_cols):
        left = df.iloc[:, i]
        # The test is symmetric, so only the upper triangle is computed and
        # each result is mirrored.
        for j in range(i, n_cols):
            right = df.iloc[:, j]
            both_present = (left.notna() & right.notna()).to_numpy()
            if both_present.sum() < 2:
                # pearsonr needs at least two paired observations.
                continue
            corrtest = pearsonr(left.to_numpy()[both_present],
                                right.to_numpy()[both_present])
            pvalmat[i, j] = pvalmat[j, i] = corrtest[1]

    return pd.DataFrame(pvalmat, columns=columns, index=columns)
# Compute the p-value matrix for the loaded data and keep it available to
# later cells as `pval_df`.
pval_df = calc_df_pvalue(df)
# Export the matrix to an Excel workbook next to the input data.
pval_df.to_excel(excel_writer='./data/pvalue_data.xlsx')