# Libraries

In [1]:
import gc
import os

import pandas as pd

# Data

In [2]:
# Unsigned-integer dtype to use for each CSV column, keyed by column name.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

# Columns pulled from train.csv: the five click attributes plus `is_attributed`.
train_cols = 'ip app device os channel is_attributed'.split()

In [3]:
# Run a garbage-collection pass before the large read.
gc.collect()

# Load the first 100,000,000 rows of the training CSV, restricted to
# `train_cols` and parsed with the dtypes declared in `dtypes`.
train = pd.read_csv(
    '../input/train.csv',
    usecols=train_cols,
    dtype=dtypes,
    nrows=100000000,
    low_memory=True,
    engine='c',
)

# Collect again once parsing is done; its return value is the cell's output.
gc.collect()

0

# Exploration

In [4]:
# Dimensions (rows, columns) of the loaded training frame.
train.shape

(100000000, 6)

In [5]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed
0,83230,3,1,13,379,0
1,17357,3,1,19,379,0
2,35810,3,1,13,379,0
3,45745,14,1,13,478,0
4,161007,3,1,13,379,0


# Preprocessing

In [None]:
# Disabled experiment: collapse `click_time` to an integer of the form YYYYMMDD.
# NOTE(review): before re-enabling, (1) add 'click_time' to `train_cols` -- the
# column is not among those loaded from train.csv -- and (2) add `return df` to
# the function, otherwise `train = dataPreProcessTime(train)` rebinds `train`
# to None.
# def dataPreProcessTime(df):
#     df['click_time'] = pd.to_datetime(df['click_time']).dt.date
#     df['click_time'] = df['click_time'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
    
# train = dataPreProcessTime(train)

In [6]:
# Preview the frame again; the preprocessing step above is commented out,
# so the rows are the same as in the earlier preview.
train.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed
0,83230,3,1,13,379,0
1,17357,3,1,19,379,0
2,35810,3,1,13,379,0
3,45745,14,1,13,478,0
4,161007,3,1,13,379,0


# Train/Test Split

In [7]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` is provided by `sklearn.model_selection`.
from sklearn.model_selection import train_test_split



In [8]:
# Predictor columns and the label column.
features = ['ip', 'app', 'device', 'os', 'channel']
target = ['is_attributed']

# Feature matrix: a DataFrame with one column per entry in `features`.
X = train[features]

# Select the label as a 1-D Series rather than a one-column DataFrame.
# Passing the 2-D frame to `model.fit` makes scikit-learn emit the
# `column_or_1d` DataConversionWarning and flatten it anyway.
y = train[target[0]]

In [9]:
# Hold out 1% of the rows for validation; the seed is fixed at 42.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.01,
    random_state=42,
)

# Model

In [12]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import xgboost as xgb
import lightgbm as lgbm

from sklearn.metrics import log_loss, auc

In [14]:
# Run a garbage-collection pass before training.
gc.collect()

# XGBoost alternative, kept for reference but not used:
#model = XGBClassifier(n_jobs=1, n_estimators=2, learning_rate=0.05, max_depth=2)

# LightGBM classifier: 400 estimators, depth capped at 10, row subsample 0.7.
model = LGBMClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=10,
    subsample=0.7,
    n_jobs=-1,
)

# Fit on the training split; the fitted estimator's repr is the cell output.
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.05, max_depth=10, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=400,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=0.7,
        subsample_for_bin=200000, subsample_freq=1)

In [15]:
# Predicted class probabilities for the held-out validation rows.
y_prob = model.predict_proba(X_val)

# Report the log loss of those probabilities against the validation labels.
print('Log loss:', log_loss(y_true=y_val, y_pred=y_prob))

Log loss: 0.007077038479370579


# Generate Submission

In [None]:
# Release the training data before the test set is loaded.
# `X` and `y` are deleted too: `X = train[features]` holds its own copy of the
# feature columns, so leaving it bound would keep most of the training data
# alive even after `train` is deleted.
del train, X, y, X_train, X_val, y_train, y_val
gc.collect()

In [16]:
# Feature columns passed to the model (same names and order as in training).
test_cols = ['ip', 'app', 'device', 'os', 'channel']

# Only the id and the feature columns are needed downstream.
test_read_cols = ['click_id'] + test_cols

# Read the test set the same way the training set is read: restrict the
# columns with `usecols` and apply the unsigned-integer dtypes from `dtypes`
# (whose 'click_id' entry exists for this file). Without these, every column
# is parsed with default dtypes and the unused `click_time` strings are kept
# in memory.
test = pd.read_csv(
    '../input/test.csv',
    usecols=test_read_cols,
    dtype={col: dtypes[col] for col in test_read_cols},
    engine='c',
    low_memory=True,
)

X_test = test[test_cols]

In [17]:
# Submission frame: each test `click_id` paired with column 1 of the model's
# predicted class probabilities.
submission = pd.DataFrame(
    {
        'click_id': test['click_id'],
        'is_attributed': model.predict_proba(X_test)[:, 1],
    },
    columns=['click_id', 'is_attributed'],
)

In [18]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,click_id,is_attributed
0,0,0.000372
1,1,0.000466
2,2,0.000192
3,3,0.000164
4,4,0.000156


In [19]:
# Dimensions (rows, columns) of the submission frame.
submission.shape

(18790469, 2)

In [20]:
# Write the submission without the index column. The output directory is
# created first because `to_csv` does not create missing parent directories,
# and failing here would discard the result of the training run.
submission_path = '../submissions/lgb_simple_v3.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)