In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.fftpack import fft
# Current ARIMA implementation; aliased so it does not clash with the legacy import below.
from statsmodels.tsa.arima.model import ARIMA as ARIMAModel
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import acf
from statsmodels.tsa.stattools import pacf

In [2]:
# Load the training table and the submission key from the local data/ directory.
train = pd.read_csv('data/train_1.csv')
test = pd.read_csv('data/key_1.csv')

In [3]:
# Preview the raw training table: one row per page, one column per day.
train.head()

Unnamed: 0,Page,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,...,2016-12-22,2016-12-23,2016-12-24,2016-12-25,2016-12-26,2016-12-27,2016-12-28,2016-12-29,2016-12-30,2016-12-31
0,2NE1_zh.wikipedia.org_all-access_spider,18.0,11.0,5.0,13.0,14.0,9.0,9.0,22.0,26.0,...,32.0,63.0,15.0,26.0,14.0,20.0,22.0,19.0,18.0,20.0
1,2PM_zh.wikipedia.org_all-access_spider,11.0,14.0,15.0,18.0,11.0,13.0,22.0,11.0,10.0,...,17.0,42.0,28.0,15.0,9.0,30.0,52.0,45.0,26.0,20.0
2,3C_zh.wikipedia.org_all-access_spider,1.0,0.0,1.0,1.0,0.0,4.0,0.0,3.0,4.0,...,3.0,1.0,1.0,7.0,4.0,4.0,6.0,3.0,4.0,17.0
3,4minute_zh.wikipedia.org_all-access_spider,35.0,13.0,10.0,94.0,4.0,26.0,14.0,9.0,11.0,...,32.0,10.0,26.0,27.0,16.0,11.0,17.0,19.0,10.0,11.0
4,52_Hz_I_Love_You_zh.wikipedia.org_all-access_s...,,,,,,,,,,...,48.0,9.0,25.0,13.0,3.0,11.0,27.0,13.0,36.0,10.0


In [4]:
# Inspect dtypes and memory usage before any conversion.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145063 entries, 0 to 145062
Columns: 551 entries, Page to 2016-12-31
dtypes: float64(550), object(1)
memory usage: 609.8+ MB


In [5]:
# Every column except the page title holds a daily view count.
visit_columns = [name for name in train.columns if name != "Page"]
for col in visit_columns:
    # Missing counts become 0 so the column can be stored as a compact int32.
    train[col] = train[col].fillna(0).astype(np.int32)

In [6]:
# Re-check dtypes and memory usage after the int32 conversion.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145063 entries, 0 to 145062
Columns: 551 entries, Page to 2016-12-31
dtypes: int32(550), object(1)
memory usage: 305.5+ MB


In [7]:
# Reshape from wide (one column per day) to long: one row per (Page, date) pair.
train = pd.melt(
    train,
    id_vars=["Page"],
    var_name='date',
    value_name='Visits',
)

In [8]:
# Preview the long-format table produced by melt.
train.head()

Unnamed: 0,Page,date,Visits
0,2NE1_zh.wikipedia.org_all-access_spider,2015-07-01,18
1,2PM_zh.wikipedia.org_all-access_spider,2015-07-01,11
2,3C_zh.wikipedia.org_all-access_spider,2015-07-01,1
3,4minute_zh.wikipedia.org_all-access_spider,2015-07-01,35
4,52_Hz_I_Love_You_zh.wikipedia.org_all-access_s...,2015-07-01,0


In [9]:
# Preview the submission key (page-plus-date string and its Id).
test.head()

Unnamed: 0,Page,Id
0,!vote_en.wikipedia.org_all-access_all-agents_2...,bf4edcf969af
1,!vote_en.wikipedia.org_all-access_all-agents_2...,929ed2bf52b9
2,!vote_en.wikipedia.org_all-access_all-agents_2...,ff29d0f51d5c
3,!vote_en.wikipedia.org_all-access_all-agents_2...,e98873359be6
4,!vote_en.wikipedia.org_all-access_all-agents_2...,fa012434263a


In [10]:
# Each key ends in "_YYYY-MM-DD": the last 10 characters are the date, and
# everything before the separating underscore is the page title.
test = test.assign(
    date=test["Page"].str[-10:],
    Page=test["Page"].str[:-11],
)

In [11]:
# Confirm that the date suffix was split out of "Page".
test.head()

Unnamed: 0,Page,Id,date
0,!vote_en.wikipedia.org_all-access_all-agents,bf4edcf969af,2017-01-01
1,!vote_en.wikipedia.org_all-access_all-agents,929ed2bf52b9,2017-01-02
2,!vote_en.wikipedia.org_all-access_all-agents,ff29d0f51d5c,2017-01-03
3,!vote_en.wikipedia.org_all-access_all-agents,e98873359be6,2017-01-04
4,!vote_en.wikipedia.org_all-access_all-agents,fa012434263a,2017-01-05


In [12]:
def separate_page(df):
    """Derive Agent, Access, Web and Language columns from the "Page" column.

    The page string is split on underscores; the last three tokens are read as
    agent, access type and site.  The site is then split on dots: its first
    label becomes Language and the remaining labels, re-joined, become Web.

    The columns are added to ``df`` in place and the same object is returned.
    """
    def token_from_end(position):
        # Pick one underscore-separated token, counted from the end of the title.
        return df["Page"].apply(lambda page: page.split('_')[position])

    df["Agent"] = token_from_end(-1)
    df["Access"] = token_from_end(-2)

    site_labels = token_from_end(-3).apply(lambda site: site.split('.'))
    # Web is written before Language so the resulting column order stays
    # Agent, Access, Web, Language.
    df["Web"] = site_labels.apply(lambda labels_: ".".join(labels_[1:]))
    df["Language"] = site_labels.apply(lambda labels_: labels_[0])
    return df

In [13]:
# Add the Agent / Access / Web / Language columns to both frames.
train = separate_page(train)
test = separate_page(test)

In [14]:
# Preview the derived Agent / Access / Web / Language columns.
train.head()

Unnamed: 0,Page,date,Visits,Agent,Access,Web,Language
0,2NE1_zh.wikipedia.org_all-access_spider,2015-07-01,18,spider,all-access,wikipedia.org,zh
1,2PM_zh.wikipedia.org_all-access_spider,2015-07-01,11,spider,all-access,wikipedia.org,zh
2,3C_zh.wikipedia.org_all-access_spider,2015-07-01,1,spider,all-access,wikipedia.org,zh
3,4minute_zh.wikipedia.org_all-access_spider,2015-07-01,35,spider,all-access,wikipedia.org,zh
4,52_Hz_I_Love_You_zh.wikipedia.org_all-access_s...,2015-07-01,0,spider,all-access,wikipedia.org,zh


In [15]:
# Row counts per agent type.
train["Agent"].value_counts()

Agent
all-agents    60582500
spider        19202150
Name: count, dtype: int64

In [16]:
# Row counts per access type.
train["Access"].value_counts()

Access
all-access    40873250
mobile-web    19766450
desktop       19144950
Name: count, dtype: int64

In [17]:
# Row counts per site domain.
train["Web"].value_counts()

Web
wikipedia.org    69964400
wikimedia.org     5805250
mediawiki.org     4015000
Name: count, dtype: int64

In [18]:
# Row counts per first domain label; "commons" and "www" are not language codes.
train["Language"].value_counts()

Language
en         13259400
ja         11237050
de         10200850
fr          9791100
zh          9475950
ru          8262100
es          7737950
commons     5805250
www         4015000
Name: count, dtype: int64

In [19]:
# "commons" and "www" are subdomains, not language codes: fold both into "na".
# The same mapping is applied to the test frame so that the one-hot Language
# columns built later have identical names in train and test.
for frame in (train, test):
    frame["Language"] = frame["Language"].replace({"commons": "na", "www": "na"})

In [20]:
# Index by (Page, date) so each row is one page on one day.
train = train.set_index(["Page", "date"])

In [21]:
# Preview the frame after moving Page and date into the index.
train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Visits,Agent,Access,Web,Language
Page,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2NE1_zh.wikipedia.org_all-access_spider,2015-07-01,18,spider,all-access,wikipedia.org,zh
2PM_zh.wikipedia.org_all-access_spider,2015-07-01,11,spider,all-access,wikipedia.org,zh
3C_zh.wikipedia.org_all-access_spider,2015-07-01,1,spider,all-access,wikipedia.org,zh
4minute_zh.wikipedia.org_all-access_spider,2015-07-01,35,spider,all-access,wikipedia.org,zh
52_Hz_I_Love_You_zh.wikipedia.org_all-access_spider,2015-07-01,0,spider,all-access,wikipedia.org,zh


In [22]:
# One long-format sub-frame per language code, without the (now constant)
# Language column.  The tuple order fixes the plotting / legend order downstream.
lang_sets = {
    lang: train[train["Language"] == lang].drop(columns="Language")
    for lang in ('en', 'ja', 'de', 'na', 'fr', 'zh', 'ru', 'es')
}

# Average views per page for every day.  `train` is long-format and indexed by
# (Page, date), so the daily figure is the mean of Visits within each date.
# Summing the frame column-wise would instead add up the string columns
# Agent / Access / Web.
sums = {}
for key in lang_sets:
    sums[key] = lang_sets[key].groupby(level="date")["Visits"].mean()

In [None]:
# Day offsets (0, 1, 2, ...) shared by the time-series plots below.
days = list(range(sums['en'].shape[0]))

# Human-readable legend label for each language key.
labels = {
    'en': 'English', 'ja': 'Japanese', 'de': 'German',
    'na': 'Media', 'fr': 'French', 'zh': 'Chinese',
    'ru': 'Russian', 'es': 'Spanish',
}

fig = plt.figure(1, figsize=[10, 10])
plt.title('Pages in Different Languages')
plt.xlabel('Day')
plt.ylabel('Views per Page')

# One line per language on the shared axes.
for key, views_per_page in sums.items():
    plt.plot(days, views_per_page, label=labels[key])

plt.legend()
plt.show()

In [None]:
def fft_spectrum(values):
    """Return ``(frequencies, magnitudes)`` for the non-negative half of the FFT.

    ``values`` is treated as one sample per day.  Bin ``k`` of an ``N``-point
    transform lies at ``k / N`` cycles per day, so the frequency axis is built
    from the sample count rather than from the last day index (``N - 1``).
    """
    samples = np.asarray(values, dtype=float)
    n_samples = len(samples)
    npts = n_samples // 2 + 1
    magnitudes = np.abs(fft(samples))[:npts]
    frequencies = np.arange(npts) / n_samples
    return frequencies, magnitudes


def plot_with_fft(key):
    """Plot the daily series stored in ``sums[key]`` and its FFT magnitude.

    Figure 1 shows the series against the day offset.  Figure 2 shows the
    magnitude spectrum without the zero-frequency bin, with red guide lines at
    1/7, 2/7 and 3/7 cycles per day.
    """
    series = sums[key]

    fig = plt.figure(1, figsize=[15, 5])
    plt.ylabel('Views per Page')
    plt.xlabel('Day')
    plt.title(labels[key])
    plt.plot(np.arange(len(series)), series, label=labels[key])

    fig = plt.figure(2, figsize=[15, 5])
    fft_xvals, fft_mag = fft_spectrum(series)

    plt.ylabel('FFT Magnitude')
    plt.xlabel(r"Frequency [days]$^{-1}$")
    plt.title('Fourier Transform')
    # Skip bin 0 (the mean of the series), which would dwarf every other peak.
    plt.plot(fft_xvals[1:], fft_mag[1:], label=labels[key])
    # Draw lines at 1, 1/2, and 1/3 week periods
    for harmonic in (1, 2, 3):
        plt.axvline(x=harmonic / 7, color='red', alpha=0.3)

    plt.show()

# Draw the time series and its spectrum for every language in sums.
for key in sums:
    plot_with_fft(key)

In [None]:
def plot_entry(key, idx):
    """Plot the daily views of the ``idx``-th page of language ``key``.

    ``lang_sets[key]`` is long-format and indexed by (Page, date), so the page
    is located by its position among the distinct page titles (in order of
    first appearance) and its Visits rows are pulled out as one series.

    Raises IndexError when ``idx`` is beyond the number of pages.
    """
    subset = lang_sets[key]
    page = subset.index.unique(level="Page")[idx]
    # All rows of this page, in stored (date) order.
    views = subset["Visits"].xs(page, level="Page")

    fig = plt.figure(1, figsize=(10, 5))
    plt.plot(range(len(views)), views.to_numpy())
    plt.xlabel('day')
    plt.ylabel('views')
    plt.title(page)

    plt.show()

In [None]:
# Positions passed to plot_entry for the English subset.
idx = [1, 5, 10, 50, 100, 250,500, 750,1000,1500,2000,3000,4000,5000]
for i in idx:
    plot_entry('en',i)

In [None]:
# For each language, list the most-viewed pages and remember the top one.
npages = 5
top_pages = {}  # language key -> title of the page with the highest total views
for key in lang_sets:
    print(key)
    # Page is an index level of the long-format frame, so totals are obtained by
    # grouping on it.  Visits is cast to int64 first because it is stored as
    # int32 and per-page totals can exceed that range.
    totals = (
        lang_sets[key]["Visits"]
        .astype(np.int64)
        .groupby(level="Page")
        .sum()
        .sort_values(ascending=False)
    )
    print(totals.head(npages))
    top_pages[key] = totals.index[0]
    print('\n\n')

In [None]:
# Autocorrelation and partial autocorrelation of the day-to-day change in views
# for the selected page of each language.
for key in top_pages:
    # top_pages maps a language to a page title, i.e. a label of the first
    # index level of `train`.
    page = top_pages[key]
    # `train` is indexed by (Page, date): take this page's Visits as one series.
    data = train["Visits"].xs(page, level="Page").to_numpy(dtype=float)
    # First difference of the series.
    data_diff = np.diff(data)
    autocorr = acf(data_diff)
    pac = pacf(data_diff)

    fig = plt.figure(1, figsize=[10, 5])
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)

    # Lag 0 is omitted from both panels.
    ax1.plot(np.arange(1, len(autocorr)), autocorr[1:])
    ax2.plot(np.arange(1, len(pac)), pac[1:])

    ax1.set_xlabel('Lag')
    ax1.set_ylabel('Autocorrelation')
    ax1.set_title(page)

    ax2.set_xlabel('Lag')
    ax2.set_ylabel('Partial Autocorrelation')
    plt.show()

In [None]:
# Fit an ARIMA model to the selected page of each language and plot the
# prediction (including an out-of-sample tail) against the data.
horizon = 600                          # number of days covered by the prediction
candidate_orders = ((2, 1, 4), (2, 1, 2))  # tried in this order until one fits

for key in top_pages:
    # top_pages maps a language to a page title (first index level of `train`).
    page = top_pages[key]
    data = train["Visits"].xs(page, level="Page").to_numpy(dtype=float)

    result = None
    with warnings.catch_warnings():
        # Silence warnings emitted while fitting.
        warnings.filterwarnings('ignore')
        for order in candidate_orders:
            try:
                result = ARIMAModel(data, order=order).fit()
                break
            except Exception:
                # This order could not be fitted; fall back to the next one.
                continue

    if result is None:
        # Nothing to predict from: report the page and move on instead of
        # calling .predict on None.
        print(page)
        print('\tARIMA failed')
        continue

    # Predictions start at day 2, matching the slice of `data` plotted below.
    pred = result.predict(start=2, end=horizon - 1)
    x = np.arange(horizon)

    plt.plot(x[2:len(data)], data[2:], label='Data')
    plt.plot(x[2:], pred, label='ARIMA Model')
    plt.title(page)
    plt.xlabel('Days')
    plt.ylabel('Views')
    plt.legend()
    plt.show()

In [None]:
# One-hot encode the four categorical descriptors.  Each source column is
# replaced by indicator columns named "<column>_<value>", appended after the
# remaining columns in the order listed here.
categorical_columns = ["Agent", "Access", "Web", "Language"]
train = pd.get_dummies(train, columns=categorical_columns)
test = pd.get_dummies(test, columns=categorical_columns)

In [None]:
# Turn the indicator columns into 0/1 integers.  Only boolean columns are
# converted: DataFrame.map applies a callable to every cell, so passing a dict
# fails, and a cell-wise False/True mapping would also hit Visits, Page, Id and
# date.  (Assumes the indicators are boolean, as the original mapping implied.)
train = train.astype({name: np.int8 for name in train.columns[train.dtypes == bool]})
test = test.astype({name: np.int8 for name in test.columns[test.dtypes == bool]})

In [None]:
# Preview the frame after one-hot encoding.
train.head()

In [None]:
def make_lags(df, num_lags, num_leads=1):
    """Add ``num_lags`` lagged copies of "Visits" to ``df``, computed per page.

    Columns are named ``lag_<i>`` for ``i`` from ``num_leads`` up to
    ``num_leads + num_lags - 1``; ``lag_<i>`` holds the value found ``i`` rows
    earlier within the same "Page" group (NaN where no such row exists).

    ``df`` is modified in place and returned.
    """
    if num_lags <= 0:
        return df
    # The grouping does not change between iterations, so build it once.
    visits_by_page = df.groupby(["Page"])["Visits"]
    for i in range(num_leads, num_lags + num_leads):
        df["lag_{}".format(i)] = visits_by_page.shift(i)
    return df

In [None]:
def make_steps(df, num_steps):
    """Add ``num_steps`` forward-looking copies of "Visits" to ``df``, per page.

    Column ``step_<i>`` (``i`` from 1 to ``num_steps``) holds the value found
    ``i`` rows later within the same "Page" group (NaN where no such row
    exists).

    ``df`` is modified in place and returned.
    """
    if num_steps <= 0:
        return df
    # The grouping does not change between iterations, so build it once.
    visits_by_page = df.groupby(["Page"])["Visits"]
    for i in range(1, num_steps + 1):
        df["step_{}".format(i)] = visits_by_page.shift(-i)
    return df

In [None]:
# Add lag_1..lag_7, then drop every row containing a NaN (this includes rows
# without a full lag history).
train = make_lags(train, 7).dropna()
# Add step_1..step_60, then drop every row containing a NaN (this includes rows
# without 60 following rows for the same page).
train = make_steps(train, 60).dropna()

In [None]:
# Preview the frame with the lag and step columns added.
train.head()

In [None]:
# Display the final training frame.
train