# Libraries

In [None]:
import gc
import pandas as pd

# Data

In [None]:
# Narrow unsigned-integer dtype for each CSV column; passed as `dtype=` to
# pd.read_csv when loading the training data.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    # click_id='uint32',  # left disabled, as in the original mapping
)

# Columns requested from train.csv (five features followed by the label).
train_cols = [
    'ip',
    'app',
    'device',
    'os',
    'channel',
    'is_attributed',
]

In [None]:
# Load the first 130,000,000 rows of the training CSV, keeping only the
# columns in `train_cols` and parsing them with the dtypes in `dtypes`.
train = pd.read_csv(
    '../input/train.csv',
    # what to read
    usecols=train_cols,
    nrows=130000000,
    # how to parse it
    dtype=dtypes,
    engine='c',
    low_memory=True,
)

# Exploration

In [None]:
# Show the (rows, columns) dimensions of the loaded training frame.
train.shape

In [None]:
# Preview the first rows of the training frame.
train.head()

# Preprocessing

In [None]:
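# NOTE: before re-enabling this step, add 'click_time' to `train_cols` (it is
# not loaded above). The helper must also return `df` (added below), otherwise
# `train = dataPreProcessTime(train)` would rebind `train` to None.
# def dataPreProcessTime(df):
#     df['click_time'] = pd.to_datetime(df['click_time']).dt.date
#     df['click_time'] = df['click_time'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
#     return df

# train = dataPreProcessTime(train)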

In [None]:
# Preview the training frame again at the end of the preprocessing section
# (the preprocessing code above is commented out).
train.head()

# Train/Test Split

In [None]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` now lives in `sklearn.model_selection`.
# The fallback keeps the notebook runnable on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

In [None]:
# Input columns fed to the model and the label column to predict.
features = ['ip', 'app', 'device', 'os', 'channel']
target = ['is_attributed']

# Feature matrix: one column per entry in `features`.
X = train[features]

# Select the label as a 1-D Series rather than a one-column DataFrame:
# indexing with the list `target` yields an (n, 1) frame, which estimators
# flatten with a "column-vector y was passed" warning at fit time.
y = train[target[0]]

In [None]:
# Hold out 1% of the rows for validation; the fixed seed makes the split
# repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.01,
    random_state=42,
)

In [None]:
# Drop the names for the full frame and the unsplit X / y now that the
# train/validation pieces exist, then ask the garbage collector to run.
del train, X, y
gc.collect()

# Model

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import lightgbm as lgbm

from sklearn.metrics import log_loss, auc

In [None]:
gc.collect()

# Alternative models are kept below as commented-out one-liners.
#model = XGBClassifier(n_jobs=1, n_estimators=2, learning_rate=0.05, max_depth=2)

# LightGBM classifier: 100 trees, depth capped at 10, using all cores.
model = LGBMClassifier(
    n_jobs=-1,
    n_estimators=100,
    learning_rate=0.05,
    max_depth=10,
    subsample=0.5,
    # Per the LightGBM parameter docs, row subsampling (bagging) is only
    # applied when the bagging frequency is non-zero; set it explicitly so
    # `subsample=0.5` is not silently ignored.
    subsample_freq=1,
    colsample_bytree=0.5,
    # `colsample_bylevel` was removed: it is an XGBoost option that LightGBM
    # does not recognise (it only produced an unknown-parameter warning).
)

#model = RandomForestClassifier()

In [None]:
# Train the classifier on the training split.
model.fit(X_train, y_train)

In [None]:
# Predicted class probabilities for the held-out validation rows.
y_prob = model.predict_proba(X_val)

# Report the log loss of those probabilities against the validation labels.
print('Log loss:', log_loss(y_val, y_prob))

# Generate Submission

In [None]:
# The train/validation splits are no longer needed once the model is fitted
# and scored; drop them and run the garbage collector before loading test data.
del X_train, X_val, y_train, y_val
gc.collect()

In [None]:
# Feature columns fed to the model (same five as used for training).
test_cols = ['ip', 'app', 'device', 'os', 'channel']

# Reuse the compact training dtypes for the feature columns and add the
# submission key, so the test frame is parsed the same way as the train frame.
test_dtypes = {col: dtypes[col] for col in test_cols}
test_dtypes['click_id'] = 'uint32'

# Read only the columns that are actually used below (`click_id` plus the
# features) instead of every column with default dtypes.
test = pd.read_csv(
    '../input/test.csv',
    usecols=['click_id'] + test_cols,
    dtype=test_dtypes,
    engine='c',
    low_memory=True,
)

X_test = test[test_cols]

In [None]:
# Assemble the submission in one step: the test-set ids alongside the
# predicted probability taken from column 1 of predict_proba.
submission = pd.DataFrame({
    'click_id': test['click_id'],
    'is_attributed': model.predict_proba(X_test)[:, 1],
})

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Show the (rows, columns) dimensions of the submission frame.
submission.shape

In [None]:
import os

# Create the output directory first: to_csv does not create missing parent
# directories, and failing here would discard the result of the whole run.
out_path = '../submissions/lgb_simple_v4.csv'
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Write the submission without the DataFrame index column.
submission.to_csv(out_path, index=False)