# Load dataset

In [1]:
import gc
import pandas as pd

# Column roles: the five raw click attributes are the model inputs and
# `is_attributed` is the label. `target` is kept as a one-element list, so
# selecting with it yields a single-column frame rather than a Series.
features = ['ip', 'app', 'device', 'os', 'channel']
target = ['is_attributed']

# NOTE(review): `nthreads` is presumably forwarded to the parquet engine;
# confirm the installed engine still accepts this keyword.
train = pd.read_parquet('../input/train.parquet', nthreads=4)

X, y = train[features], train[target]

In [2]:
# Preview the first rows of the loaded frame to check its columns.
train.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed
0,83230,3,1,13,379,0
1,17357,3,1,19,379,0
2,35810,3,1,13,379,0
3,45745,14,1,13,478,0
4,161007,3,1,13,379,0


# Playing with new features

In [3]:
# Keep only the first 100,000 rows while experimenting with new features.
# `X` and `y` from the loading cell are not re-sliced here; they are rebuilt
# from `train` in the Train/Test Split section.
train = train.head(100000)

In [34]:
# Docs: https://stackoverflow.com/questions/24678308/how-to-find-location-with-ip-address-in-python

import ipaddress
from pygeoip import GeoIP, MEMORY_CACHE

# Open the GeoLite City database once (with pygeoip's MEMORY_CACHE flag);
# `get_deoip` below reuses this handle for every lookup.
gi = GeoIP("../dbs/GeoLiteCity.dat", MEMORY_CACHE)

def get_deoip(byte_ip, item):
    """Look up a single GeoIP field for an IPv4 address.

    Args:
        byte_ip: The address in any form ``ipaddress.IPv4Address`` accepts
            (integer, dotted string, packed bytes), or an integer-like
            scalar such as ``numpy.int64``.
        item: Key of the GeoIP record to return, e.g. ``'postal_code'``.

    Returns:
        ``record[item]`` for the record found by ``gi.record_by_addr``, or
        ``0`` when the database has no record for the address.

    Raises:
        ipaddress.AddressValueError: If ``byte_ip`` is not a valid IPv4
            address.
    """
    # IPv4Address only treats real Python ints as integer addresses; a numpy
    # integer would be stringified (e.g. "83230") and rejected as a malformed
    # dotted quad. Coerce integer-like scalars first; strings and bytes have
    # no __index__ and are passed through unchanged.
    if hasattr(byte_ip, '__index__'):
        byte_ip = byte_ip.__index__()

    ip = str(ipaddress.IPv4Address(byte_ip))
    record = gi.record_by_addr(ip)
    if record is None:
        return 0
    return record[item]

# One thin wrapper per GeoIP field; each simply delegates to `get_deoip`
# with the field name fixed.

def postal_code(x):
    """Return the 'postal_code' field that `get_deoip` finds for `x`."""
    return get_deoip(x, 'postal_code')


def area_code(x):
    """Return the 'area_code' field that `get_deoip` finds for `x`."""
    return get_deoip(x, 'area_code')


def longitude(x):
    """Return the 'longitude' field that `get_deoip` finds for `x`."""
    return get_deoip(x, 'longitude')


def latitude(x):
    """Return the 'latitude' field that `get_deoip` finds for `x`."""
    return get_deoip(x, 'latitude')


def dma_code(x):
    """Return the 'dma_code' field that `get_deoip` finds for `x`."""
    return get_deoip(x, 'dma_code')

In [35]:
# Print every postal code the GeoIP database resolves for the sampled rows
# (`get_deoip` returns 0 when there is no record, so those are skipped).
# The result is stored under its own name: assigning it to `postal_code`
# would overwrite the `postal_code` helper defined in the previous cell and
# break any later call to it.
# NOTE(review): the `ip` values in the preview (e.g. 83230) are small
# integers that convert to 0.x.x.x addresses; presumably they are encoded
# ids rather than real addresses, which would explain why nothing resolves
# - confirm.
for ip in train['ip']:
    resolved_code = get_deoip(ip, "postal_code")
    if resolved_code != 0:
        print(resolved_code)

In [36]:
# Display the frame after the feature experiments.
# NOTE(review): the saved output lists `postal_code` and `longitude` columns
# that no visible cell assigns - presumably they came from a cell that was
# later edited or removed; confirm before relying on them.
train

Unnamed: 0,ip,app,device,os,channel,is_attributed,postal_code,longitude
0,83230,3,1,13,379,0,0,0
1,17357,3,1,19,379,0,0,0
2,35810,3,1,13,379,0,0,0
3,45745,14,1,13,478,0,0,0
4,161007,3,1,13,379,0,0,0
5,18787,3,1,16,379,0,0,0
6,103022,3,1,23,379,0,0,0
7,114221,3,1,19,379,0,0,0
8,165970,3,1,13,379,0,0,0
9,74544,64,1,22,459,0,0,0


# Train/Test Split

In [None]:
# `train_test_split` lives in `sklearn.model_selection`; the old
# `sklearn.cross_validation` module was deprecated and later removed from
# scikit-learn. Prefer the current location and fall back to the legacy one
# only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [None]:
# Rebuild the model inputs and the label from the current `train` frame.
# `target` is a one-element list, so `y` is a single-column frame.
features = ['ip', 'app', 'device', 'os', 'channel']
target = ['is_attributed']
X, y = train[features], train[target]

In [None]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split
# the same across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Only the split outputs are used from here on, so drop the source frames
# and run a garbage-collection pass.
del train
del X
del y
gc.collect()

# Model

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import lightgbm as lgbm

from sklearn.metrics import log_loss, auc, roc_auc_score

# Run a garbage-collection pass before training starts.
gc.collect()

In [None]:
# Alternative LightGBM configuration, kept for reference:
# model = LGBMClassifier(
#     n_jobs=-1,
#     n_estimators=500,
#     learning_rate=0.05,
#     subsample=0.6,
# )

# Active model: XGBoost using all cores, 200 trees and 75% row subsampling.
xgb_params = dict(n_jobs=-1, n_estimators=200, subsample=0.75)
model = XGBClassifier(**xgb_params)

# Fit on the training part of the split.
model.fit(X_train, y_train)

In [None]:
# Hard labels and class probabilities for the held-out rows.
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)

In [None]:
# Log loss is computed from the full class-probability matrix.
print('Log loss:', log_loss(y_val, y_prob))
# ROC-AUC is a ranking metric, so it needs a continuous score: use the
# probability of the positive class (column 1). Passing the hard 0/1 labels
# from `predict` evaluates a single threshold only and misreports the AUC.
print('ROC-AUC score:', roc_auc_score(y_val, y_prob[:, 1]))

# Generate Submission

In [None]:
# The validation metrics are done; drop the split frames before loading the
# test set and run a garbage-collection pass.
del X_train
del X_val
del y_train
del y_val
gc.collect()

In [None]:
# Same five input columns the model was trained on.
test_cols = ['ip', 'app', 'device', 'os', 'channel']

# `test` is kept whole because `click_id` is read from it when the
# submission is built.
test = pd.read_parquet('../input/test.parquet')
X_test = test[test_cols]

In [None]:
# Two-column submission: the click id from the test set and column 1 of the
# model's predicted probabilities for it.
submission = pd.DataFrame({
    'click_id': test['click_id'],
    'is_attributed': model.predict_proba(X_test)[:, 1],
})

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Show the (rows, columns) of the submission frame.
submission.shape

In [None]:
# Write the submission without the index column.
# NOTE(review): the file name says `lgb`, but the active model in this
# notebook is XGBClassifier - confirm the intended name.
submission.to_csv('../submissions/lgb_simple_v5.csv', index=False)