In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the two input tables from the local data/ directory.
# NOTE(review): the file names look like the Kaggle "Web Traffic Time Series
# Forecasting" inputs — confirm the data source.
train = pd.read_csv('data/train_1.csv')
# key_1.csv is kept under the name `test`; its "Page" column is later split
# into a page name and a trailing date.
test = pd.read_csv('data/key_1.csv')

In [None]:
# Preview the first rows of the training table as loaded.
train.head()

In [None]:
# Print the frame summary (dtypes, memory usage) before any cleaning.
train.info()

In [None]:
# Every column except "Page" has its missing values replaced by 0 and is then
# stored as int32; "Page" itself is left untouched.
count_columns = train.drop("Page", axis=1).columns
train[count_columns] = train[count_columns].fillna(0).astype(np.int32)

In [None]:
# Print the frame summary again to check the dtypes after the fill/cast step.
train.info()

In [None]:
# Reshape from wide (one column per non-"Page" header) to long: one row per
# (Page, date) pair, with the former column header in "date" and the cell
# value in "Visits".
train = pd.melt(
    train,
    id_vars=["Page"],
    var_name='date',
    value_name='Visits',
)

In [None]:
# Preview the long-format table produced by the melt.
train.head()

In [None]:
# Preview the key table as loaded.
test.head()

In [None]:
# Each key in test["Page"] is treated as "<page name>" + one separator
# character + a 10-character date suffix.
# NOTE(review): assumes the suffix is a YYYY-MM-DD date preceded by "_" — confirm.
DATE_WIDTH = 10
page_key = test["Page"]
test['date'] = page_key.apply(lambda key: key[-DATE_WIDTH:])
# Drop the date and the single separator character in front of it.
test['Page'] = page_key.apply(lambda key: key[:-(DATE_WIDTH + 1)])

In [None]:
# Preview the key table after splitting "Page" into page name and date.
test.head()

In [None]:
def separate_page(df):
    """Derive Agent, Access, Web and Language columns from ``df["Page"]``.

    The page string is split on "_" and read from the right: the last token
    becomes "Agent", the one before it "Access", and the third from the end is
    the site. The site is then split on ".": its first component becomes
    "Language" and the remaining components, re-joined with ".", become "Web".

    The columns are written onto ``df`` itself, and that same frame is returned.
    """
    # Split every page name once and reuse the token lists for each field.
    tokens = df["Page"].apply(lambda page: page.split('_'))
    df["Agent"] = tokens.apply(lambda parts: parts[-1])
    df["Access"] = tokens.apply(lambda parts: parts[-2])

    site_parts = tokens.apply(lambda parts: parts[-3].split('.'))
    # "Web" is assigned before "Language" so the resulting column order is
    # Agent, Access, Web, Language.
    df["Web"] = site_parts.apply(lambda pieces: ".".join(pieces[1:]))
    df["Language"] = site_parts.apply(lambda pieces: pieces[0])
    return df

In [None]:
# Add the Agent / Access / Web / Language columns to both frames.
# separate_page writes the columns onto the frame it receives and returns
# that same frame, so the re-assignment keeps the names bound to it.
train = separate_page(train)
test = separate_page(test)

In [None]:
# Preview the training table with the columns derived from "Page".
train.head()

In [None]:
# Count the rows for each distinct "Agent" value.
train["Agent"].value_counts()

In [None]:
# Count the rows for each distinct "Access" value.
train["Access"].value_counts()

In [None]:
# Count the rows for each distinct "Web" value.
train["Web"].value_counts()

In [None]:
# Count the rows for each distinct "Language" value.
train["Language"].value_counts()

In [None]:
# "commons" and "www" are the first dotted component of the site for some
# pages (presumably commons.wikimedia.org / www.mediawiki.org — confirm) and
# are not language codes, so both are folded into the placeholder "na".
#
# The rule is applied to `test` as well as `train`: both frames are one-hot
# encoded on "Language" later, and normalizing only one of them would give
# the two frames different Language_* columns.
NON_LANGUAGE_PREFIXES = ("commons", "www")
for frame in (train, test):
    frame["Language"] = frame["Language"].apply(
        lambda prefix: "na" if prefix in NON_LANGUAGE_PREFIXES else prefix
    )

In [None]:
# Index the long table by (Page, date). Only `train` is re-indexed here;
# `test` keeps "Page" and "date" as ordinary columns.
train = train.set_index(["Page", "date"])

In [None]:
# Preview the training table after setting the (Page, date) index.
train.head()

In [None]:
def _one_hot(frame, column):
    """Return `frame` with `column` replaced by its dummy columns, appended on the right."""
    remainder = frame.drop(column, axis=1)
    indicators = pd.get_dummies(frame[column], prefix=column)
    return pd.concat([remainder, indicators], axis=1)


# One-hot encode the categorical columns, one at a time and in this order,
# first on train and then on test.
# NOTE(review): the two frames are encoded independently, so their dummy
# columns only match when both contain the same set of category values.
for categorical in ("Agent", "Access", "Web", "Language"):
    train = _one_hot(train, categorical)
for categorical in ("Agent", "Access", "Web", "Language"):
    test = _one_hot(test, categorical)

In [None]:
def _bools_to_uint8(frame):
    """Return a copy of `frame` whose bool columns are cast to uint8 (False -> 0, True -> 1).

    Columns of any other dtype are left as they are. If the frame has no bool
    columns, an unchanged copy is returned.
    """
    bool_columns = frame.select_dtypes(include="bool").columns
    return frame.astype({name: np.uint8 for name in bool_columns})


# Turn the True/False indicator columns into 0/1 integers.
# DataFrame.map expects a callable applied to every element of the frame, so
# passing a {False: 0, True: 1} dict does not work; and an element-wise
# translation would also hit the non-indicator columns (Visits in train;
# Page, date and the other key columns in test). Only bool columns are cast.
train = _bools_to_uint8(train)
test = _bools_to_uint8(test)

In [None]:
# Preview the training table after one-hot encoding.
train.head()

In [None]:
def make_lags(df, num_lags, num_leads=1):
    """Add per-page lagged copies of "Visits" to ``df``.

    Creates ``num_lags`` columns named ``lag_<k>`` for
    ``k = num_leads, ..., num_leads + num_lags - 1``. Each holds "Visits"
    shifted down by ``k`` rows within its "Page" group, so the first ``k``
    rows of every page are NaN.

    The columns are written onto ``df`` itself, and that same frame is returned.
    """
    for position in range(num_lags):
        offset = num_leads + position
        visits_by_page = df.groupby(["Page"])["Visits"]
        df["lag_{}".format(offset)] = visits_by_page.shift(offset)
    return df

In [None]:
def make_steps(df, num_steps):
    """Add per-page look-ahead copies of "Visits" to ``df``.

    Creates ``num_steps`` columns named ``step_1`` ... ``step_<num_steps>``.
    ``step_<k>`` holds "Visits" shifted up by ``k`` rows within its "Page"
    group, so the last ``k`` rows of every page are NaN.

    The columns are written onto ``df`` itself, and that same frame is returned.
    """
    for position in range(num_steps):
        horizon = position + 1
        visits_by_page = df.groupby(["Page"])["Visits"]
        df["step_{}".format(horizon)] = visits_by_page.shift(-horizon)
    return df

In [None]:
# Window sizes: how many lagged columns and how many look-ahead columns to add.
NUM_LAGS = 7
NUM_STEPS = 60

# Add the lag columns, then discard every row that has a NaN in any column.
train = make_lags(train, NUM_LAGS)
train = train.dropna()

# Add the step columns on the remaining rows, then discard NaN rows again.
train = make_steps(train, NUM_STEPS)
train = train.dropna()

In [None]:
# Preview the training table with the lag_* and step_* columns added.
train.head()

In [None]:
# Display the final training frame.
# NOTE(review): train.shape or train.sample(5) may be a lighter check than
# rendering the whole frame.
train