### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Loading and describing data

In [2]:
raw_sample_df = pd.read_csv('raw_sample.csv')

In [3]:
raw_sample_df.describe()

Unnamed: 0,user,time_stamp,adgroup_id,nonclk,clk
count,26557960.0,26557960.0,26557960.0,26557960.0,26557960.0
mean,568205.5,1494355000.0,513017.5,0.9485632,0.05143678
std,329750.2,198755.3,218378.2,0.2208869,0.2208869
min,1.0,1494000000.0,1.0,0.0,0.0
25%,282919.0,1494171000.0,360678.0,1.0,0.0
50%,564952.0,1494345000.0,563833.0,1.0,0.0
75%,852191.0,1494516000.0,686705.0,1.0,0.0
max,1141729.0,1494691000.0,846811.0,1.0,1.0


In [4]:
raw_sample_df.head()

Unnamed: 0,user,time_stamp,adgroup_id,pid,nonclk,clk
0,581738,1494137644,1,430548_1007,1,0
1,449818,1494638778,3,430548_1007,1,0
2,914836,1494650879,4,430548_1007,1,0
3,914836,1494651029,5,430548_1007,1,0
4,399907,1494302958,8,430548_1007,1,0


In [5]:
raw_sample_df['clicked'] = raw_sample_df['clk']
raw_sample_df.drop(['clk','nonclk'], axis=1, inplace=True)

In [6]:
raw_sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26557961 entries, 0 to 26557960
Data columns (total 5 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   user        int64 
 1   time_stamp  int64 
 2   adgroup_id  int64 
 3   pid         object
 4   clicked     int64 
dtypes: int64(4), object(1)
memory usage: 1013.1+ MB


In [7]:
raw_sample_df['datetime'] = pd.to_datetime(raw_sample_df['time_stamp'], unit='s')
raw_sample_df.drop('time_stamp', axis=1, inplace=True)

In [8]:
raw_sample_df['hour'] = raw_sample_df['datetime'].dt.hour
raw_sample_df['weekday'] = raw_sample_df['datetime'].dt.weekday

In [9]:
raw_sample_df.head()

Unnamed: 0,user,adgroup_id,pid,clicked,datetime,hour,weekday
0,581738,1,430548_1007,0,2017-05-07 06:14:04,6,6
1,449818,3,430548_1007,0,2017-05-13 01:26:18,1,5
2,914836,4,430548_1007,0,2017-05-13 04:47:59,4,5
3,914836,5,430548_1007,0,2017-05-13 04:50:29,4,5
4,399907,8,430548_1007,0,2017-05-09 04:09:18,4,1


In [10]:
raw_sample_df.isnull().sum()

user          0
adgroup_id    0
pid           0
clicked       0
datetime      0
hour          0
weekday       0
dtype: int64

In [11]:
for col in raw_sample_df.select_dtypes(include=['int64']).columns:
    raw_sample_df[col] = pd.to_numeric(raw_sample_df[col], downcast='unsigned')

for col in raw_sample_df.select_dtypes(include=['float64']).columns:
    raw_sample_df[col] = pd.to_numeric(raw_sample_df[col], downcast='float')

raw_sample_df["pid"] = raw_sample_df["pid"].astype("category")

raw_sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26557961 entries, 0 to 26557960
Data columns (total 7 columns):
 #   Column      Dtype         
---  ------      -----         
 0   user        uint32        
 1   adgroup_id  uint32        
 2   pid         category      
 3   clicked     uint8         
 4   datetime    datetime64[ns]
 5   hour        int32         
 6   weekday     int32         
dtypes: category(1), datetime64[ns](1), int32(2), uint32(2), uint8(1)
memory usage: 658.5 MB


In [12]:
import os
os.makedirs('preprocessed', exist_ok=True)
raw_sample_df.to_csv('preprocessed/raw_sample_final.csv', index=False)