In [1]:
# import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

from pyramid.arima import auto_arima

from Sloth.Sloth import Sloth
Sloth = Sloth()

    The 'pyramid' package will be migrating to a new namespace beginning in 
    version 1.0.0: 'pmdarima'. This is due to a package name collision with the
    Pyramid web framework. For more information, see Issue #34:
    
        https://github.com/tgsmith61591/pyramid/issues/34
        
    The package will subsequently be installable via the name 'pmdarima'; the
    only functional change to the user will be the import name. All imports
    from 'pyramid' will change to 'pmdarima'.
    
Using TensorFlow backend.


In [2]:
# Load the cascade index and the raw retweet records, then bin the retweets of
# one cascade into hourly counts.
data_index = pd.read_csv("datasets/index.csv")
# Force the first cascade to start at row 0. A single .loc assignment is used
# because chained indexing (frame[col][row] = value) may write to a temporary
# copy and leave data_index unchanged.
data_index.loc[0, 'start_ind'] = 0
data = pd.read_csv("datasets/data.csv")

# which cascade (row of data_index) to analyse
tweet_index = 0

# time constants, in seconds
s_per_min = 60
s_per_hr = 60 * s_per_min
s_per_10_min = 10 * s_per_min   # 600; the previous literal 360 was six minutes

# candidate training fractions: 0.01, 0.02, ..., 0.10
train_splits = [x / 100 for x in range(1, 11)]

# rows of `data` that belong to the selected cascade
# NOTE(review): the +1 presumably skips the original tweet at start_ind - confirm
first_row = data_index['start_ind'][tweet_index] + 1
last_row = data_index['end_ind'][tweet_index]
retweets = data['relative_time_second'][first_row:last_row]

# bin tweets by hour: edges run from the first timestamp in steps of one hour,
# extended far enough to cover the last timestamp
first_second = int(retweets.values[0])
last_second = int(retweets.values[-1])
hourly = pd.cut(retweets, range(first_second, last_second + s_per_hr, s_per_hr))
bin_hourly = retweets.groupby(hourly).agg('count')

# replace the interval index with a plain 0..n-1 index
bin_hourly = pd.DataFrame(data=bin_hourly.values, index=range(len(bin_hourly)), columns=['tweet_count'])
print(bin_hourly.shape)
print(bin_hourly.head())

FileNotFoundError: File b'datasets/index.csv' does not exist

In [3]:
def plot_data(input_data, title):
    """Plot a time series as a solid black line on a new figure.

    Parameters:
        input_data: data frame to plot; its index supplies the x values and
                    its values supply the y values
        title:      string describing the series, used as both the plot title
                    and the y-axis label
    """
    plt.figure()
    axes = plt.subplot(1, 1, 1)
    axes.plot(input_data.index, input_data.values, "k-")
    # NOTE(review): the x-axis label is fixed to "Year" whatever unit the index is in
    axes.set_xlabel("Year")
    axes.set_ylabel(title)
    axes.set_title(title)
 
def plot_seasonal(input_data, *frequency):
    """Plot the seasonal decomposition of a time series and show the figure.

    Parameters:
        input_data: data frame holding the series; its index and values are
                    handed to Sloth.DecomposeSeriesSeasonal
        frequency:  optional; when given, the first extra positional argument
                    is forwarded as the frequency (periodicity) of the series,
                    otherwise Sloth is called without one
    """
    # at most one optional argument is forwarded; any further ones are ignored
    optional_frequency = frequency[:1]
    decomposition = Sloth.DecomposeSeriesSeasonal(
        input_data.index, input_data.values, *optional_frequency
    )
    decomposition.plot()
    plt.xlabel("Hour")
    plt.show()

def future_forecast(train, test, seasonal, *seasonal_differencing):
    """Fit an automatically selected ARIMA model on `train` and forecast one
    value per row of `test`.

    Parameters:
        train:                 training data frame the model is fitted on
        test:                  testing data frame; only its length and index
                               are used (forecast horizon and result index)
        seasonal:              boolean passed to auto_arima as `seasonal`
        seasonal_differencing: optional; when given, the first extra positional
                               argument is passed to auto_arima as `m`,
                               otherwise `m` is 1

    Returns:
        data frame with a single "Prediction" column indexed like `test`
    """
    print("DEBUG::the size of test is:")
    print(test.shape)

    # the only difference between the seasonal-period and default cases is `m`
    period = seasonal_differencing[0] if seasonal_differencing else 1
    search_options = dict(
        start_p=1, start_q=1,
        max_p=5, max_q=5,
        m=period,
        start_P=1, start_Q=1,
        seasonal=seasonal,
        d=None, D=0,
        trace=True,
        error_action='ignore',
        suppress_warnings=False,
        stepwise=True,
    )
    stepwise_model = auto_arima(train, **search_options)
    # NOTE(review): auto_arima presumably returns an already fitted model, so this
    # explicit fit may be redundant; kept to preserve behaviour - TODO confirm
    stepwise_model.fit(train)

    horizon = test.shape[0]
    predicted = stepwise_model.predict(n_periods=horizon)
    print("DEBUG::Future forecast:")
    print(predicted)

    return pd.DataFrame(predicted, index=test.index, columns=["Prediction"])


def plot_future_forecast(input_data, test, future_forecast, title):
    """Plot a forecast against the test data (top panel) and against the
    full series (bottom panel) on the current figure.

    Parameters:
        input_data:      data frame holding the full series
        test:            data frame holding the test portion of the series
        future_forecast: data frame holding the forecast
        title:           string describing the series, used as the title of the
                         top panel and the y-axis label of both panels
    """
    # top panel: test data side by side with the forecast
    test_and_forecast = pd.concat([test, future_forecast], axis=1)
    top = plt.subplot(2, 1, 1)
    top.plot(test_and_forecast.index, test_and_forecast.values)
    top.set_xlabel("data point index")
    top.set_ylabel(title)
    top.set_title(title)

    # bottom panel: the whole series side by side with the forecast
    series_and_forecast = pd.concat([input_data, future_forecast], axis=1)
    bottom = plt.subplot(2, 1, 2)
    bottom.plot(series_and_forecast.index, series_and_forecast.values)
    bottom.set_xlabel("Hour")
    bottom.set_ylabel(title)

In [4]:
# plots and forecasting for the hourly retweet counts
# (`stats` is not used in this cell; the exponential decay cell below calls stats.boxcox)
from scipy import stats

# rebuild the hourly counts with a plain 0..n-1 index
bin_hourly = retweets.groupby(hourly).count()
bin_hourly = pd.DataFrame({'tweet_count': bin_hourly.values}, index=range(len(bin_hourly)))

plot_data(bin_hourly, "Retweet Cascade")
plot_seasonal(bin_hourly)

# use the first 50% of the data as training data and the rest as test data
train_split = int(0.5 * len(bin_hourly))
train = bin_hourly.iloc[:train_split]
test = bin_hourly.iloc[train_split:]

# non-seasonal model; the trailing 24 is still passed through as the period `m`
future_forecast_tweets = future_forecast(train, test, False, 24)
plot_future_forecast(bin_hourly, test, future_forecast_tweets, 'Retweet Cascade')

NameError: name 'retweets' is not defined

In [None]:
# test simple exponential decay example:
# ten halving values starting at 1000, followed by thirty zeros
import math

# imported here so the cell does not depend on an earlier cell having run
from scipy import stats

decay = [1000 * 0.5 ** i for i in range(10)]
flat_tail = [0] * 30
# shift every value up by one so the Box-Cox input contains no zeros
data = [value + 1 for value in decay + flat_tail]
print(data)

# Box-Cox transform; lam is the lambda parameter estimated by scipy
data_t, lam = stats.boxcox(data)
print(data_t)
print(lam)
data = pd.DataFrame(data_t)

# A single split point keeps train and test contiguous. The previous
# data[:29] / data[30:] split dropped row 29 and then relabelled the test rows
# to 29..38, shifting the forecast by one step against `data` in the plots.
split_at = 30
X_train = data[:split_at]
X_test = data[split_at:]   # keeps its original labels 30..39
assert len(X_train) + len(X_test) == len(data)

plot_data(data, "Exponential Decay Example")
plt.show()
plot_seasonal(data)

future_forecast_exp = future_forecast(X_train, X_test, False)
plot_future_forecast(data, X_test, future_forecast_exp, 'Exponential Decay Example')
