In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from collections import Counter
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
%matplotlib inline


In [2]:
traindata = pd.read_csv("train_NA17Sgz/train.csv")
testdata = pd.read_csv("test_aq1FGdB/test.csv")
submission = pd.read_csv("sample_submission_IPsBlCT/sample_submission.csv")
viewlog = pd.read_csv("train_NA17Sgz/view_log.csv")
itemdata = pd.read_csv("train_NA17Sgz/item_data.csv")
print(viewlog.info())
print(itemdata.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3118622 entries, 0 to 3118621
Data columns (total 5 columns):
server_time    object
device_type    object
session_id     int64
user_id        int64
item_id        int64
dtypes: int64(3), object(2)
memory usage: 119.0+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132761 entries, 0 to 132760
Data columns (total 6 columns):
item_id         132761 non-null int64
item_price      132761 non-null int64
category_1      132761 non-null int64
category_2      132761 non-null int64
category_3      132761 non-null int64
product_type    132761 non-null int64
dtypes: int64(6)
memory usage: 6.1 MB
None


In [3]:
print(traindata.info())
print(testdata.info())
print(submission.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237609 entries, 0 to 237608
Data columns (total 7 columns):
impression_id      237609 non-null object
impression_time    237609 non-null object
user_id            237609 non-null int64
app_code           237609 non-null int64
os_version         237609 non-null object
is_4G              237609 non-null int64
is_click           237609 non-null int64
dtypes: int64(4), object(3)
memory usage: 12.7+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90675 entries, 0 to 90674
Data columns (total 6 columns):
impression_id      90675 non-null object
impression_time    90675 non-null object
user_id            90675 non-null int64
app_code           90675 non-null int64
os_version         90675 non-null object
is_4G              90675 non-null int64
dtypes: int64(3), object(3)
memory usage: 4.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90675 entries, 0 to 90674
Data columns (total 2 columns):
impression_id    90675 non-nul

In [4]:
traindata.head()

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,is_click
0,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,old,0,0
1,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,latest,1,1
2,70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,71748,259,intermediate,1,0
3,8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,69209,244,latest,1,0
4,182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,62873,473,latest,0,0


In [5]:
traindata.is_click.value_counts()/traindata.shape[0]*100

0    95.428624
1     4.571376
Name: is_click, dtype: float64

In [6]:
train_view = traindata.join(viewlog[["device_type","user_id"]],on="user_id",rsuffix="_r_")
train_view.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237609 entries, 0 to 237608
Data columns (total 9 columns):
impression_id      237609 non-null object
impression_time    237609 non-null object
user_id            237609 non-null int64
app_code           237609 non-null int64
os_version         237609 non-null object
is_4G              237609 non-null int64
is_click           237609 non-null int64
device_type        237609 non-null object
user_id_r_         237609 non-null int64
dtypes: int64(5), object(4)
memory usage: 16.3+ MB


In [7]:
test_view = testdata.join(viewlog[["device_type","user_id"]],on="user_id",rsuffix="_r_")
test_view.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90675 entries, 0 to 90674
Data columns (total 8 columns):
impression_id      90675 non-null object
impression_time    90675 non-null object
user_id            90675 non-null int64
app_code           90675 non-null int64
os_version         90675 non-null object
is_4G              90675 non-null int64
device_type        90675 non-null object
user_id_r_         90675 non-null int64
dtypes: int64(4), object(4)
memory usage: 5.5+ MB


## Dummy variables

In [8]:
batch_1_encoded = pd.get_dummies(train_view['os_version'], prefix='os_version_')
batch_2_encoded = pd.get_dummies(train_view['device_type'], prefix='device_type_')
exclude = ['is_click','os_version','impression_id','impression_time','user_id','user_id_r_','device_type','server_time','session_id','item_id']
traindata2 = train_view.loc[:,train_view.columns.difference(exclude)]
# # Row-bind (append) Encoded Data Back Together
final_traindata = pd.concat([batch_1_encoded,batch_2_encoded, traindata2], axis=1)

Xtrain = final_traindata.loc[:,]
ytrain = train_view['is_click']
print(Xtrain.shape)
print(ytrain.shape)
print(Xtrain.columns)

(237609, 8)
(237609,)
Index(['os_version__intermediate', 'os_version__latest', 'os_version__old',
       'device_type__android', 'device_type__iphone', 'device_type__web',
       'app_code', 'is_4G'],
      dtype='object')


In [9]:
batch_1_encoded = pd.get_dummies(test_view['os_version'], prefix='os_version_')
batch_2_encoded = pd.get_dummies(test_view['device_type'], prefix='device_type_')
exclude = ['is_click','os_version','impression_id','impression_time','user_id','user_id_r_','device_type','server_time','session_id','item_id']
testdata2 = test_view.loc[:,test_view.columns.difference(exclude)]
# # Row-bind (append) Encoded Data Back Together
final_testdata = pd.concat([batch_1_encoded, batch_2_encoded,testdata2], axis=1)

Xtest = final_testdata.loc[:,]
# Xtest['device_type__web'] = 0
Xtest.insert(5,"device_type__web",0)
ytest = []
print(Xtest.shape)
print(Xtest.columns)

(90675, 8)
Index(['os_version__intermediate', 'os_version__latest', 'os_version__old',
       'device_type__android', 'device_type__iphone', 'device_type__web',
       'app_code', 'is_4G'],
      dtype='object')


In [10]:
Xtest.tail()

Unnamed: 0,os_version__intermediate,os_version__latest,os_version__old,device_type__android,device_type__iphone,device_type__web,app_code,is_4G
90670,0,1,0,1,0,0,371,0
90671,0,1,0,1,0,0,151,1
90672,0,0,1,1,0,0,243,0
90673,0,1,0,1,0,0,272,1
90674,0,1,0,1,0,0,242,1


## Oversampling - Sampling is to be done on train data
### 1. Using random over sampling - ROS

In [11]:
print('Original dataset shape %s' % Counter(ytrain))
ros = RandomOverSampler(random_state=42)
ros_data_X, ros_data_y = ros.fit_resample(Xtrain, ytrain)
print('Resampled dataset shape %s' % Counter(ros_data_y))

Original dataset shape Counter({0: 226747, 1: 10862})
Resampled dataset shape Counter({0: 226747, 1: 226747})


### 2. Using random over sampling - SMOTE

In [12]:
print('Original dataset shape %s' % Counter(ytrain))
sos = SMOTE(random_state=0)
sos_data_X, sos_data_y = sos.fit_sample(Xtrain, ytrain)
print('Resampled dataset shape %s' % Counter(sos_data_y))


Original dataset shape Counter({0: 226747, 1: 10862})
Resampled dataset shape Counter({0: 226747, 1: 226747})


## Logistic Regression

In [24]:
#create an instance and fit the model 
logmodel = LogisticRegression(random_state=0)
# logmodel.fit(os_data_X, os_data_y.values.ravel())
logmodel.fit(sos_data_X, sos_data_y)
#predictions
predictions = logmodel.predict(Xtest)
mysubmission = pd.DataFrame()
mysubmission["impression_id"] = submission["impression_id"]
mysubmission["is_click"] = predictions
mysubmission.to_csv("mysubmission.csv",index=False)

In [25]:
predictions

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

## Random Forest Classifier

In [34]:
# random forest model creation
params = {'n_estimators': 500, 'min_samples_split' : 4, 'random_state' : 42}
rfc = RandomForestClassifier(**params)
rfc.fit(ros_data_X,ros_data_y)
# predictions
rfc_predict = rfc.predict(Xtest)
mysubmission = pd.DataFrame()
mysubmission["impression_id"] = submission["impression_id"]
mysubmission["is_click"] = rfc_predict
mysubmission.to_csv("mysubmission.csv",index=False)

In [41]:
Xtrain.columns

Index(['os_version__intermediate', 'os_version__latest', 'os_version__old',
       'app_code', 'is_4G'],
      dtype='object')

In [37]:
rfc.feature_importances_

array([0.00500015, 0.01051885, 0.00277542, 0.97577632, 0.00592926])

## Gradient Boosting Classifier

In [35]:
params = {'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.01}
model = GradientBoostingClassifier(**params)
model.fit(ros_data_X,ros_data_y)
y_predicted = model.predict(Xtest)
mysubmission = pd.DataFrame()
mysubmission["impression_id"] = submission["impression_id"]
mysubmission["is_click"] = rfc_predict
mysubmission.to_csv("mysubmission.csv",index=False)

## SVM

In [None]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC(kernel='linear') 

#Train the model using the training sets
clf.fit(ros_data_X,ros_data_y)

#Predict the response for test dataset
y_pred = clf.predict(Xtest)
mysubmission = pd.DataFrame()
mysubmission["impression_id"] = submission["impression_id"]
mysubmission["is_click"] = y_pred
mysubmission.to_csv("mysubmission.csv",index=False)

## XGB

In [17]:
Xtrain1 = pd.DataFrame(ros_data_X,columns=Xtrain.columns)
ytrain1 = pd.DataFrame(ros_data_y,columns=["is_click"])
alg = XGBClassifier(learning_rate=0.1, n_estimators=700, max_depth=5,seed=27)
alg.fit(Xtrain1,ytrain1)
y_pred = alg.predict(Xtest)
mysubmission = pd.DataFrame()
mysubmission["impression_id"] = submission["impression_id"]
mysubmission["is_click"] = y_pred
mysubmission.to_csv("mysubmission.csv",index=False)

In [37]:
Xtrain1 = pd.DataFrame(ros_data_X,columns=Xtrain.columns)
ytrain1 = pd.DataFrame(ros_data_y,columns=["is_click"])

alg = XGBClassifier(learning_rate=0.1, n_estimators=500, max_depth=5,
                        min_child_weight=3, gamma=0.2, subsample=0.6, colsample_bytree=1.0,
                        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
# alg.fit(sos_data_X,sos_data_y)
alg.fit(Xtrain1,ytrain1)
y_pred = alg.predict(Xtest)
mysubmission = pd.DataFrame()
mysubmission["impression_id"] = submission["impression_id"]
mysubmission["is_click"] = y_pred
mysubmission.to_csv("mysubmission2.csv",index=False)

Geekps  ghvh for  Geeks
for
Geekps  ghvh for  Geeks
