In [6]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm_notebook

In [7]:
def prepare_data(path, target):
    raw_df = pd.read_csv(path)
    
    site_names = ['site{}'.format(i) for i in range(1, 11)]
    time_names = ['time{}'.format(i) for i in range(1, 11)]
    feature_names = [None] * (2 * len(site_names))
    feature_names[::2] = time_names
    feature_names[1::2] = site_names
    
    # prepare 30-min steps
    raw_df['timestamp'] = pd.to_datetime(raw_df['timestamp'])
    time_diff = raw_df['timestamp'].diff().astype(int)
    time_diff = np.where(time_diff < 0, np.nan, time_diff)
    raw_df['min_diff'] = time_diff/(1e9*60)
    raw_df['min_diff'].fillna(0, inplace = True)
    raw_df['min_cumsum'] = raw_df['min_diff'].cumsum()
    raw_df['step'] = (raw_df['min_cumsum']//30).astype(int)
    
    step_list = raw_df['step'].unique()
    
    stacking_list = []
    for step in step_list:
        temp_part = raw_df[raw_df['step'] == step][['timestamp', 'site']].to_numpy()
        
        # infill matrix by NaN`s
        temp_padding = np.full((10 - temp_part.shape[0]%10, 2), np.nan)
        temp_part = np.vstack([temp_part, temp_padding])

        # https://stackoverflow.com/questions/3678869/pythonic-way-to-combine-two-lists-in-an-alternating-fashion
        temp_combine = [None] * (2 * len(temp_part))
        temp_combine[::2] = temp_part[:, 0]
        temp_combine[1::2] = temp_part[:, 1]

        temp_result = np.array(temp_combine).reshape((-1, 20))
        stacking_list.append(temp_result)

    data = np.vstack(stacking_list)
    
    df = pd.DataFrame(data, columns = feature_names)
    df['target'] = np.full(df.shape[0], target)
    
    return df  

In [8]:
alice_path = 'Data/raw_train/Alice_log.csv'

In [9]:
%%time
alice_df = prepare_data(alice_path, target = 1)

CPU times: user 219 ms, sys: 5.41 ms, total: 224 ms
Wall time: 223 ms


In [10]:
alice_df.head()

Unnamed: 0,time1,site1,time2,site2,time3,site3,time4,site4,time5,site5,...,site6,time7,site7,time8,site8,time9,site9,time10,site10,target
0,2013-02-12 16:25:10,api.bing.com,2013-02-12 16:25:11,api.bing.com,2013-02-12 16:32:10,api.bing.com,2013-02-12 16:32:11,www.google.fr,2013-02-12 16:32:24,www.google.fr,...,www.info-jeunes.net,2013-02-12 16:32:25,www.google.fr,2013-02-12 16:32:26,www.info-jeunes.net,2013-02-12 16:32:27,platform.twitter.com,2013-02-12 16:32:27,www.info-jeunes.net,1
1,2013-02-12 16:32:27,www.facebook.com,2013-02-12 16:32:28,www.info-jeunes.net,2013-02-12 16:32:29,twitter.com,2013-02-12 16:32:34,www.info-jeunes.net,2013-02-12 16:32:35,www.info-jeunes.net,...,www.facebook.com,2013-02-12 16:32:42,www.info-jeunes.net,2013-02-12 16:32:42,www.facebook.com,2013-02-12 16:32:51,www.info-jeunes.net,2013-02-12 16:32:53,www.info-jeunes.net,1
2,2013-02-12 16:32:53,www.facebook.com,2013-02-12 16:33:11,www.info-jeunes.net,2013-02-12 16:33:12,www.info-jeunes.net,2013-02-12 16:33:13,www.facebook.com,2013-02-12 16:33:15,twitter.com,...,www.info-jeunes.net,2013-02-12 16:33:24,www.facebook.com,2013-02-12 16:33:33,www.info-jeunes.net,2013-02-12 16:33:34,www.facebook.com,2013-02-12 16:33:46,api.bing.com,1
3,2013-02-12 16:33:50,www.bing.com,2013-02-12 16:33:51,www.bing.com,2013-02-12 16:33:52,www.leboncoin.fr,2013-02-12 16:33:52,www.bing.com,2013-02-12 16:33:52,twitter.com,...,static.leboncoin.fr,2013-02-12 16:33:52,deliv.leboncoin.fr,2013-02-12 16:33:53,www.leboncoin.fr,2013-02-12 16:33:53,deliv.leboncoin.fr,2013-02-12 16:33:53,static.leboncoin.fr,1
4,2013-02-12 16:33:55,193.164.197.30,2013-02-12 16:33:55,www.leboncoin.fr,2013-02-12 16:33:55,static.leboncoin.fr,2013-02-12 16:33:55,193.164.196.60,2013-02-12 16:33:55,193.164.197.40,...,193.164.196.30,2013-02-12 16:33:55,193.164.196.40,2013-02-12 16:33:55,193.164.197.60,2013-02-12 16:33:55,193.164.197.50,2013-02-12 16:33:55,deliv.leboncoin.fr,1


In [16]:
alice_df.shape

(2334, 21)

In [11]:
other_user_path = 'Data/raw_train/other_user_logs'

In [14]:
files_list = sorted([file for file in os.listdir(other_user_path) if 'csv' in file])
other_users_df = pd.DataFrame(columns = alice_df.columns)

for file_name in tqdm_notebook(files_list):
    temp_df = prepare_data(os.path.join(other_user_path, file_name), target = 0)
    other_users_df = pd.concat([other_users_df, temp_df])
    
other_users_df.reset_index(drop = True, inplace = True)
    

HBox(children=(IntProgress(value=0, max=1557), HTML(value='')))




In [17]:
other_users_df.head()

Unnamed: 0,time1,site1,time2,site2,time3,site3,time4,site4,time5,site5,...,site6,time7,site7,time8,site8,time9,site9,time10,site10,target
0,2013-11-29 08:14:18,fpdownload2.macromedia.com,2013-11-29 08:14:26,hotmail.fr,2013-11-29 08:14:38,login.live.com,2013-11-29 08:14:57,login.live.com,2013-11-29 08:15:17,login.live.com,...,login.live.com,2013-11-29 08:15:23,mail.live.com,2013-11-29 08:15:29,dub122.mail.live.com,2013-11-29 08:15:30,people.directory.live.com,2013-11-29 08:15:30,dub122.mail.live.com,0
1,2013-11-29 08:15:34,dub122.mail.live.com,2013-11-29 08:15:35,secure.shared.live.com,2013-11-29 08:15:36,windowslive.tt.omtrdc.net,2013-11-29 08:15:36,dub122.mail.live.com,2013-11-29 08:15:37,secure.shared.live.com,...,cid-1bed360223325286.users.storage.live.com,2013-11-29 08:15:37,js.live.net,2013-11-29 08:15:37,people.directory.live.com,2013-11-29 08:15:38,login.live.com,2013-11-29 08:15:39,go.trouter.io,0
2,2013-11-29 08:15:39,storage.live.com,2013-11-29 08:15:39,blufiles.storage.msn.com,2013-11-29 08:15:39,h.live.com,2013-11-29 08:15:39,windowslive.tt.omtrdc.net,2013-11-29 08:15:40,prod.registrar.skype.com,...,api.skype.com,2013-11-29 08:15:41,h.live.com,2013-11-29 08:15:42,h.live.com,2013-11-29 08:15:49,secure.shared.live.com,2013-11-29 08:16:47,h.live.com,0
3,2013-11-29 08:17:32,h.live.com,2013-11-29 08:17:36,dub122.mail.live.com,2013-11-29 08:18:26,dub122.mail.live.com,2013-11-29 08:18:31,cid-1bed360223325286.users.storage.live.com,2013-11-29 08:21:45,proxy-bay-people.directory.live.com,...,p.sfx.ms,2013-11-29 08:40:48,proxy-bay-people.directory.live.com,2013-11-29 08:40:49,dub122.mail.live.com,2013-11-29 08:40:50,h.live.com,NaT,,0
4,2013-11-29 08:45:53,proxy-bay-people.directory.live.com,2013-11-29 08:46:22,dub122.mail.live.com,2013-11-29 08:47:26,h.live.com,2013-11-29 08:47:27,dub122.mail.live.com,2013-11-29 08:50:57,proxy-bay-people.directory.live.com,...,proxy-bay-people.directory.live.com,2013-11-29 09:00:19,prod.registrar.skype.com,2013-11-29 09:00:57,h.live.com,2013-11-29 09:00:58,prod.registrar.skype.com,2013-11-29 09:01:09,h.live.com,0


In [15]:
other_users_df.shape

(264824, 21)

In [18]:
result_df = pd.concat([alice_df, other_users_df]).sort_values('time1')
result_df.head()

Unnamed: 0,time1,site1,time2,site2,time3,site3,time4,site4,time5,site5,...,site6,time7,site7,time8,site8,time9,site9,time10,site10,target
126002,2013-01-12 08:05:57,safebrowsing.clients.google.com,2013-01-12 08:05:57,safebrowsing-cache.google.com,NaT,,NaT,,NaT,,...,,NaT,,NaT,,NaT,,NaT,,0
126003,2013-01-12 08:37:23,safebrowsing.clients.google.com,2013-01-12 08:37:23,safebrowsing-cache.google.com,NaT,,NaT,,NaT,,...,,NaT,,NaT,,NaT,,NaT,,0
234644,2013-01-12 08:50:13,www.apache.org,2013-01-12 08:50:14,www.apache.org,2013-01-12 08:50:15,download.eclipse.org,2013-01-12 08:50:15,www.apache.org,2013-01-12 08:50:16,www.apache.org,...,www.webtide.com,2013-01-12 08:50:16,download.oracle.com,2013-01-12 08:50:16,javadl-esd-secure.oracle.com,2013-01-12 08:50:17,www.caucho.com,2013-01-12 08:50:17,www.apache.org,0
234645,2013-01-12 08:50:17,www.webtide.com,2013-01-12 08:50:17,download.oracle.com,2013-01-12 08:50:18,www.caucho.com,2013-01-12 08:50:18,download.oracle.com,2013-01-12 08:50:18,www.webtide.com,...,www.apache.org,2013-01-12 08:50:19,public.dhe.ibm.com,2013-01-12 08:50:19,www.webtide.com,2013-01-12 08:50:19,www.apache.org,2013-01-12 08:50:20,www.apache.org,0
234646,2013-01-12 08:50:20,public.dhe.ibm.com,2013-01-12 08:50:20,jope.ow2.org,2013-01-12 08:50:20,download.oracle.com,2013-01-12 08:50:21,public.dhe.ibm.com,2013-01-12 08:50:21,jope.ow2.org,...,master.dl.sourceforge.net,2013-01-12 08:50:21,www.apache.org,2013-01-12 08:50:22,download.eclipse.org,2013-01-12 08:50:22,www.apache.org,2013-01-12 08:50:22,public.dhe.ibm.com,0


In [22]:
result_df.reset_index(drop = True).to_csv('Data/additional_train_data.scv')