# Prepare data for prediction algorithm 
## 1. Preprocessing Train Data

In [1]:
import pandas as pd
from matplotlib import pyplot
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
%matplotlib inline

In [2]:
# Read the raw training set from disk.
df_train = pd.read_csv(filepath_or_buffer="train.csv")

# Cache the loaded frame as a pickle; the next section reloads it from this
# file instead of parsing the CSV again.
df_train.to_pickle(path='df_train')

# Preview the first five rows.
df_train.head(n=5)

Unnamed: 0,click,weekday,hour,bidid,userid,useragent,IP,region,city,adexchange,...,slotheight,slotvisibility,slotformat,slotprice,creative,bidprice,payprice,keypage,advertiser,usertag
0,0,5,22,b7bea80521fdecd95d2d761a38c91c3f09618066,2e880fb7d690cf7377b2e42e701728e3f3c0e4c1,windows_ie,125.37.175.*,2,2,2.0,...,200,2,0,5,a4f763f78ef3eedfe614263b94a8924e,238,5,0f951a030abdaedd733ee8d114ce2944,3427,NaN
1,0,1,20,4f51205475678f5a124bc76b2c54163bf8eaa7eb,3a1fe01360ff8100e7d006b83b77a3e4c01d928c,windows_chrome,171.36.92.*,238,239,1.0,...,250,FourthView,Na,0,10722,294,23,,2821,NaN
2,0,3,13,b604e3fd054a658ab7ced4285ebf2ef54d2bd890,801d18a056b6fe6b06a794aef17fb0d6daff2414,windows_ie,59.46.106.*,40,41,2.0,...,250,2,0,5,798b2d49952d77f1eace9f23c210d0b5,238,24,0f951a030abdaedd733ee8d114ce2944,3427,10052100061386610110
3,0,6,23,0348beeae93e561584c3b50fc9e7746a33048ad7,0d6eaf2259699990e38a1fc5116f112070b9ecdc,windows_ie,114.250.226.*,1,1,1.0,...,600,2,1,0,cb7c76e7784031272e37af8e7e9b062c,300,25,bebefa5efe83beee17a3d245e7c5085b,1458,138661006310111
4,0,5,6,268149c1789bce2bc9798ffd97ec431219bafeb3,a239d9bb642460d974ba67f85e63b8d3e214da0e,windows_ie,183.63.192.*,216,233,2.0,...,90,OtherView,Na,133,7330,277,133,,2259,NaN


### Check each column for null / NaN values

In [3]:
# Reload the cached training frame and discard the columns that are not used:
#   - urlid: noted by the author as NaN in every row
#   - bidid, userid: noted by the author as not needed for prediction
df_train_raw = pd.read_pickle('df_train').drop(columns=['urlid', 'bidid', 'userid'])

In [4]:
# Report, column by column, whether the training frame contains any missing value.

df_train_cols = list(df_train_raw.columns)
for col_name in df_train_cols:
    has_missing = df_train_raw[col_name].isna().values.any()
    print(col_name, has_missing)

click False
weekday False
hour False
useragent False
IP False
region False
city False
adexchange True
domain True
url True
slotid False
slotwidth False
slotheight False
slotvisibility False
slotformat False
slotprice False
creative False
bidprice False
payprice False
keypage True
advertiser False
usertag True


In [5]:
# Fill missing text values with an explicit 'unknown' category.
# The result is assigned back to the column rather than calling
# `df[col].fillna(..., inplace=True)`: that form operates on a selection of the
# frame (chained assignment), which newer pandas versions warn about and which
# does not modify the frame at all once Copy-on-Write is enabled.
# 'domain' is included so that missing domains get the same 'unknown'
# placeholder the validation section uses (previously they became the
# string 'nan' via astype(str)).
for text_col in ['domain', 'url', 'keypage', 'usertag']:
    df_train_raw[text_col] = df_train_raw[text_col].fillna('unknown')

# adexchange: use 0 as the placeholder for missing values, then store as int.
df_train_raw['adexchange'] = df_train_raw['adexchange'].fillna(0).astype(int)

# Store domain as strings.
df_train_raw['domain'] = df_train_raw['domain'].astype(str)

# Check again: every column should now report False.
col_names = list(df_train_raw)
for column in col_names:
    print(column, df_train_raw[column].isnull().values.any())


click False
weekday False
hour False
useragent False
IP False
region False
city False
adexchange False
domain False
url False
slotid False
slotwidth False
slotheight False
slotvisibility False
slotformat False
slotprice False
creative False
bidprice False
payprice False
keypage False
advertiser False
usertag False


### Encode categorical data as numeric values for use with scikit-learn

In [6]:
# A single encoder instance is shared; it is re-fitted for every column below.
lb_encode = LabelEncoder()

train_cat_cols = [
    'useragent', 'IP', 'url', 'domain', 'slotid',
    'slotvisibility', 'slotformat', 'creative', 'keypage', 'usertag',
]
for cat_name in train_cat_cols:
    print('endcoding', cat_name, '.....')
    # The integer codes are stored next to the original column as '<name>_encoded'.
    encoded_codes = lb_encode.fit_transform(df_train_raw[cat_name])
    df_train_raw[cat_name + '_encoded'] = encoded_codes

print()
print('Encoding done!')

endcoding useragent .....
endcoding IP .....
endcoding url .....
endcoding domain .....
endcoding slotid .....
endcoding slotvisibility .....
endcoding slotformat .....
endcoding creative .....
endcoding keypage .....
endcoding usertag .....

Encoding done!


In [10]:
# Build the prepared training frame by removing the original categorical
# columns; their '<name>_encoded' counterparts remain in the frame.
df_train_prepared = df_train_raw.drop(
    labels=[
        'useragent', 'IP', 'domain', 'url', 'slotid',
        'slotvisibility', 'slotformat', 'creative', 'keypage', 'usertag',
    ],
    axis=1,
)

# Persist the prepared frame to disk.
df_train_prepared.to_pickle(path='df_train_prepared_updated')

# Preview the first five rows.
df_train_prepared.head(n=5)

Unnamed: 0,click,weekday,hour,region,city,adexchange,slotwidth,slotheight,slotprice,bidprice,...,useragent_encoded,IP_encoded,url_encoded,domain_encoded,slotid_encoded,slotvisibility_encoded,slotformat_encoded,creative_encoded,keypage_encoded,usertag_encoded
0,0,5,22,2,2,2,200,200,5,238,...,31,253587,268016,21783,12845,2,0,104,0,744035
1,0,1,20,238,239,1,300,250,0,294,...,29,282720,145905,728,50771,6,3,9,18,744035
2,0,3,13,40,41,2,250,250,5,238,...,31,469429,262017,18426,9136,2,0,92,0,188248
3,0,6,23,1,1,1,160,600,0,300,...,31,107833,658015,4770,50450,2,1,111,12,706361
4,0,5,6,216,233,2,728,90,133,277,...,31,342191,157670,153,1517,8,3,84,18,744035


## 2. Preprocessing Validation Data 

In [11]:
# Read the validation set and remove the columns that are not needed.
df_validation = pd.read_csv('validation.csv').drop(['urlid', 'userid', 'bidid'], axis=1)


#### Check each column for null / NaN values

In [12]:
# Report, column by column, whether the validation frame contains any missing value.
# The column list is kept in `df_validation_cols` because the re-check after
# filling iterates over it again.
df_validation_cols = list(df_validation.columns)

for col_name in df_validation_cols:
    has_missing = df_validation[col_name].isna().values.any()
    print(col_name, has_missing)

click False
weekday False
hour False
useragent False
IP False
region False
city False
adexchange True
domain True
url True
slotid False
slotwidth False
slotheight False
slotvisibility False
slotformat False
slotprice False
creative False
bidprice False
payprice False
keypage True
advertiser False
usertag True


In [13]:
# Replace missing text values with an explicit 'unknown' category.
# The result is assigned back to the column rather than calling
# `df[col].fillna(..., inplace=True)`: that form operates on a selection of the
# frame (chained assignment), which newer pandas versions warn about and which
# does not modify the frame at all once Copy-on-Write is enabled.
for text_col in ['domain', 'url', 'keypage', 'usertag']:
    df_validation[text_col] = df_validation[text_col].fillna('unknown')

# adexchange: use 0 as the placeholder for missing values, then store as int.
df_validation['adexchange'] = df_validation['adexchange'].fillna(0).astype(int)

# Store domain as strings.
df_validation['domain'] = df_validation['domain'].astype(str)

# Recheck for null: every column should now report False.
for col_name in list(df_validation):
    print(col_name, df_validation[col_name].isnull().values.any())

click False
weekday False
hour False
useragent False
IP False
region False
city False
adexchange False
domain False
url False
slotid False
slotwidth False
slotheight False
slotvisibility False
slotformat False
slotprice False
creative False
bidprice False
payprice False
keypage False
advertiser False
usertag False


In [14]:
# Encode category data

validation_cat_cols = ['useragent','IP','url','domain','slotid','slotvisibility','slotformat','creative','keypage','usertag']
for cat_name in validation_cat_cols:
    print('endcoding', cat_name, '.....')
    df_validation[cat_name + '_encoded'] = lb_encode.fit_transform(df_validation[cat_name])

print()
print('Encoding done!')

endcoding useragent .....
endcoding IP .....
endcoding url .....
endcoding domain .....
endcoding slotid .....
endcoding slotvisibility .....
endcoding slotformat .....
endcoding creative .....
endcoding keypage .....
endcoding usertag .....

Encoding done!


In [15]:
# Build the prepared validation frame by removing the original categorical
# columns (their '<name>_encoded' counterparts remain), then persist it.
df_val_prepared = df_validation.drop(
    labels=[
        'useragent', 'IP', 'domain', 'url', 'slotid',
        'slotvisibility', 'slotformat', 'creative', 'keypage', 'usertag',
    ],
    axis=1,
)
df_val_prepared.to_pickle(path='df_val_prepared_updated')

In [16]:
# Preview the first five rows of the prepared validation frame.
df_val_prepared.head(n=5)

Unnamed: 0,click,weekday,hour,region,city,adexchange,slotwidth,slotheight,slotprice,bidprice,...,useragent_encoded,IP_encoded,url_encoded,domain_encoded,slotid_encoded,slotvisibility_encoded,slotformat_encoded,creative_encoded,keypage_encoded,usertag_encoded
0,0,4,20,79,79,1,160,600,0,300,...,28,137474,45239,8595,14275,2,1,109,12,110919
1,0,1,21,79,79,1,950,90,0,238,...,26,178167,109524,7201,14355,0,1,108,7,69995
2,0,4,8,2,2,2,300,250,5,238,...,28,52834,75356,8455,7129,2,0,99,14,108448
3,0,5,15,201,205,2,336,280,5,238,...,26,192739,27901,8931,8936,2,0,116,14,109043
4,0,1,18,134,135,2,200,200,5,249,...,26,55070,135142,6582,423,1,0,42,5,6344


# Train and Validation df prepared! 

- Now you can run the ClickPrediction notebooks.
- After that, you can run the model comparison notebook.
- If you save your predicted bid prices as a list or DataFrame, we can test those too.