In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import re

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10000)

In [2]:
df_flat = pd.read_csv('input/train_2.csv')
df_flat = df_flat.melt(id_vars='Page', var_name='date', value_name='Visits')

In [3]:
df_flat['date'] =  pd.to_datetime(df_flat['date'])
first_day = df_flat['date'].max() - pd.Timedelta(days=356+62-1)

In [4]:

df_flat = df_flat[df_flat['date'] >= first_day]

In [4]:
print(df_flat['date'].min())
print(df_flat['date'].max())
print(df_flat.shape)

2015-07-01 00:00:00
2017-09-10 00:00:00
(116485589, 3)


In [6]:
def remove_leading_zero_Visits(df):    
    df['visited'] = df['Visits'].notna().apply(lambda x: 1 if x else 0)
    df['cum_count'] = df.groupby(['Page'])['visited'].cumsum()
    df = df.drop(columns=['visited'])
    df = df.drop(df[df['cum_count']==0].index)
    return df.drop(columns=['cum_count'])

# df_flat = remove_leading_zero_Visits(df_flat)

def get_page(s):
    return re.sub('[0-9!"\?\.)(,\+\*\[\]/:;\-\'&_]', ' ', s[:s[:s.rfind('.')].rfind('.')])

def get_lang(s):
    page = get_page(s)
    return page[page.rfind('_')-1:]


In [7]:
max_day = df_flat['date'].max()
# days_range = [max_day - pd.Timedelta(days=i) for i in range(356+62)]
days_range = pd.date_range(start='1/07/2015', end='10/09/2017')
grid = pd.DataFrame(df_flat['Page'].unique().reshape(-1,1), columns=['Page'])

# grid['Page'] = grid['Page'].str.replace('_www', '')
grid['lang'] = grid['Page'].apply(get_lang)
print(grid['lang'].unique())
grid.loc[grid['lang'] == 'ww', 'lang'] = 'en'
grid['lang'] = LabelEncoder().fit_transform(grid['lang'])

['zh' 'fr' 'en' 'ns' 'ru' 'ww' 'de' 'ja' 'es']


In [None]:
grid = grid.merge(pd.DataFrame(days_range, columns=['date']), how='cross')
df_flat = grid.merge(df_flat, on=['Page', 'date'], how='left')
df_flat['day_of_week'] = df_flat['date'].dt.dayofweek
del grid
df_flat

In [9]:
df_flat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60636334 entries, 0 to 60636333
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   Page         object        
 1   lang         int32         
 2   date         datetime64[ns]
 3   Visits       float64       
 4   day_of_week  int64         
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1), object(1)
memory usage: 2.5+ GB


In [10]:

def get_encoders():
    df = pd.read_csv('input/train_2.csv')['Page']
    split = df.str.split('.')
    
    return LabelEncoder().fit(split.str[-1]),\
           LabelEncoder().fit(split.str[-2]),\
            CountVectorizer(ngram_range=(1, 1), max_features=100).fit(df.apply(get_page))
#            TfidfVectorizer(max_features=20, use_idf=False, binary=True, norm=False)\
#                 .fit(df.apply(get_page))

In [11]:
df_flat_train = df_flat[df_flat['date'] <= (df_flat['date'].max() - pd.Timedelta(days=62))]
df_flat_test = df_flat[df_flat['date'] > (df_flat['date'].max() - pd.Timedelta(days=62))]
#df_flat_train = remove_leading_zero_Visits(df_flat_train)
#df_flat_test = remove_leading_zero_Visits(df_flat_test)

lb = LabelEncoder().fit(df_flat_train['Page'])

df_flat_train['Page_cat'] = lb.transform(df_flat_train['Page'])
df_flat_test['Page_cat'] = lb.transform(df_flat_test['Page'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [11]:
e1,e2,e3 = get_encoders()

In [13]:


del df_flat

In [12]:
def to_jlines_tf(df):
    def to_cats(row):
        #print(row)
        page = row.values[0]
        split = page.split('.')
        row['cat'] = np.append([
                          row.values[2],#lang
                          #  row.values[1],#Page_cat
                          e1.transform([split[-1]]),
                          e2.transform([split[-2]])],[]
#                         e3.transform([get_page(page)] unigrams
                                    ).astype(int)
        return row
    df = df.sort_values(by=['date'])
    df['Visits'] = df['Visits'].fillna('NaN')
    print("df = df.groupby(['Page'], as_index=False)")
    df = df.groupby(['Page', 'Page_cat', 'lang'], as_index=False)\
        .agg({'date':np.min, 
              'Visits':lambda x: x.tolist(),
              'day_of_week':lambda days: [[d for d in days]]})
    df['dynamic_feat'] = df['day_of_week']
    
    print(df.shape)
    import time
    start = time.time()
    df = df.apply(to_cats, axis='columns')
    print(time.time()-start)
    df['target'] = df['Visits']
    df['start'] = df['date']
    
    def f(row):
        while (len(row['target'])>0 and row['target'][0] == "NaN"):
            row['target'].pop(0)
            row['dynamic_feat'][0].pop(0)
            row['start'] = row['start'] + pd.Timedelta(days=1)
        return row

    df.apply(f, axis=1)
    df['start'] = df['start'].dt.strftime('%Y-%m-%d %M:%M:%M')
    df = df.drop(columns=['date', 'Visits', 
                          'Page', 
                          'Page_cat',
                         'day_of_week',
                         'lang'])
    return df.sample(frac=1)


In [15]:
df_flat['Page_cat'] = LabelEncoder().fit_transform(df_flat['Page'])
with open('./data/train_full.json', 'w') as f:
    f.write(to_jlines_tf(df_flat).to_json(orient='records', lines=True))

df = df.groupby(['Page'], as_index=False)
(145063, 7)


  return array(a, dtype, copy=False, order=order, subok=True)


121.50325870513916


In [15]:

with open('./data/test_1y.1.json', 'w') as f:
    f.write(to_jlines_tf(df_flat_test).to_json(orient='records', lines=True))


df = df.groupby(['Page'], as_index=False)
(145063, 7)


  return array(a, dtype, copy=False, order=order, subok=True)


109.66551971435547


In [16]:
with open('data/train_1y.1.json', 'w') as f:
    f.write(to_jlines_tf(df_flat_train).to_json(orient='records', lines=True))

df = df.groupby(['Page'], as_index=False)
(145063, 7)
125.64628887176514


In [None]:
# !aws s3 cp data s3://wiki-ts/ --recursive

In [None]:
df_flat_train['Page'].unique().shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from jupyterthemes import jtplot

# jtplot.style()



sns.set_style("dark")
for page in df_flat_test['Page'].unique()[:15]:
    plt.figure(figsize=(16,6))
    col = df_flat_test[df_flat_test['Page']==page]['Visits']
    max_train_date = df_flat_train[df_flat_train['Page']==page]['date'].max()
    mean = df_flat_train[(df_flat_train['date'] == max_train_date)\
                             & (df_flat_train['Page']==page) ]['Visits_mean'].iloc[0]
    data = pd.concat([col], axis=1)
    data.columns=['Visits']
    data['mean'] = mean
    sns.lineplot(data=data.reset_index(drop=True)).set_title(page)
    plt.show()
    print(mean)

In [None]:
d = pd.DataFrame({'a':[1, 1, 1, 2, 2], 'b':[1,2,3,np.nan, np.nan]})
d.groupby('a', as_index=False).agg({'b':lambda x: x.tolist()})

In [5]:
pd.DataFrame(pd.date_range(start='1/07/2015', end='10/09/2017'), columns=['Page'])

Unnamed: 0,Page
0,2015-01-07
1,2015-01-08
2,2015-01-09
3,2015-01-10
4,2015-01-11
5,2015-01-12
6,2015-01-13
7,2015-01-14
8,2015-01-15
9,2015-01-16
