In [None]:
# Mount Google Drive at /content/drive; every path used later in this notebook
# lives under '/content/drive/My Drive/Colab Notebooks/'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install turicreate

In [None]:
import turicreate as tc
from turicreate import load_sframe
from scipy.sparse import hstack
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Preprocessing

In [None]:
# Read the raw training CSV from Drive into an SFrame.
data_frame = tc.SFrame.read_csv('/content/drive/My Drive/Colab Notebooks/train.csv')

In [None]:
# Show the first two rows of the training frame.
data_frame.head(2)

## Helper functions: replace the raw date-time stamp with weekday & hour

In [None]:
def get_date(date):
    """Turn a stamp whose first six characters are YYMMDD into 'YYYY-MM-DD'.

    The argument is converted with str() first; the century is always '20'.
    Characters past the sixth are ignored.
    """
    stamp = str(date)
    year, month, day = '20' + stamp[:2], stamp[2:4], stamp[4:6]
    return '-'.join((year, month, day))

In [None]:
def add_weekday_hour(data_frame):
    """Derive 'weekday' from the 'hour' column, then overwrite 'hour' in place.

    The 'hour' column is read as a string: its first six characters are
    passed through get_date() and parsed to a datetime whose .weekday()
    becomes the new 'weekday' column; everything from the seventh character
    on is cast to int and replaces 'hour'.  The frame is modified in place
    and nothing is returned.
    """
    stamps = data_frame['hour'].astype(str)
    parsed_dates = stamps.apply(lambda x: get_date(x)).str_to_datetime()
    data_frame['weekday'] = parsed_dates.apply(lambda x: x.weekday())
    print('add_weekday_hour: half completed')

    # Re-read 'hour' here: it is only overwritten by this assignment.
    hour_of_day = data_frame['hour'].astype(str).apply(lambda x: x[6:])
    data_frame['hour'] = hour_of_day.astype(int)
    print('add_weekday_hour: completed')

## Preprocess the train and test frames

In [None]:
# Read the raw test CSV from Drive into an SFrame.
test_frame = tc.SFrame.read_csv('/content/drive/My Drive/Colab Notebooks/test.csv')

In [None]:
# Adds 'weekday' and overwrites 'hour' on the test frame, in place.
# Not idempotent: the call rewrites 'hour', so run it once per freshly read frame.
add_weekday_hour(test_frame)

In [None]:
# Checkpoint the preprocessed test frame to Drive in binary SFrame format.
test_frame.save('/content/drive/My Drive/Colab Notebooks/test_sframe_with_hour_and_date', format='binary')

In [None]:
# Reload the test-frame checkpoint (same path the preceding save cell writes).
test_frame = load_sframe('/content/drive/My Drive/Colab Notebooks/test_sframe_with_hour_and_date')

In [None]:
# The test frame goes through add_weekday_hour() before it is checkpointed, but
# no cell did the same for the training frame, so on a fresh run this checkpoint
# would not contain 'weekday' despite its name. Apply the transform here.
# Not idempotent: the call overwrites 'hour', so run it once per freshly read frame.
add_weekday_hour(data_frame)
data_frame.save('/content/drive/My Drive/Colab Notebooks/train_sframe_with_hour_and_date', format='binary')

In [None]:
# Reload the training-frame checkpoint (same path the preceding save cell writes).
data_frame = load_sframe('/content/drive/My Drive/Colab Notebooks/train_sframe_with_hour_and_date')

In [None]:
# Replaces data_frame with the contents of 'train_sframe_encoded'.
# NOTE(review): no cell visible in this notebook writes that path — confirm where
# the file comes from and whether this cell should run in a top-to-bottom execution.
data_frame = load_sframe('/content/drive/My Drive/Colab Notebooks/train_sframe_encoded')

In [None]:
# Show the first two rows of the training frame as currently loaded.
data_frame.head(2)

In [None]:
# Show the first ten rows of the test frame.
test_frame.head(10)

In [None]:
# Frequency of each device_id; the recorded output shows one value ('a99f214a')
# accounting for the overwhelming majority of rows.
data_frame['device_id'].value_counts()

value,count
a99f214a,33358308
0f7c61dc,21356
c357dbff,19667
936e92fb,13712
afeffc18,9654
987552d1,4187
28dc8687,4101
d857ffbb,4004
cef4c8cc,3694
b09da1c4,3655


In [None]:
# The device_id value that dominates the value_counts() output; rows carrying it
# get a user id built from device_ip + device_model instead.
device_id_dummy = 'a99f214a'


def add_user_id(df):
  """Add a 'user_id' column to df and drop the three device columns, in place.

  For each row, user_id is device_ip concatenated with device_model when
  device_id equals device_id_dummy, otherwise it is device_id itself.
  Afterwards 'device_ip', 'device_model' and 'device_id' are removed.
  """
  def pick_user_id(row):
    if row['device_id'] == device_id_dummy:
      return row['device_ip'] + row['device_model']
    return row['device_id']

  df['user_id'] = df.apply(pick_user_id)
  source_columns = ['device_ip', 'device_model', 'device_id']
  df.remove_columns(source_columns, inplace=True)

In [None]:
# Adds 'user_id' and drops the device columns on the training frame, in place.
# Re-running fails once the device columns are gone.
add_user_id(data_frame)

In [None]:
# Same in-place transform for the test frame.
add_user_id(test_frame)

In [None]:
# Frequency of each app_id, most common first.
data_frame['app_id'].value_counts().sort('count', ascending=False)

value,count
ecad2386,25832830
92f5800b,1555283
e2fcccd2,1129016
febd1138,759098
9c13b419,757812
7358e05e,615635
a5184c22,491457
d36838b1,457572
685d1c4c,408515
54c5d545,397693


In [None]:
# Frequency of each site_id, most common first; the top value is the one used
# as dummy_site_id to separate app rows from site rows below.
data_frame['site_id'].value_counts().sort('count', ascending=False)

value,count
85f751fd,14596137
1fbe01fe,6486150
e151e245,2637747
d9750ee7,963745
5b08c53b,913325
5b4d2eda,771360
856e6d3f,765891
a7853007,461311
b7e9786d,369099
5ee41ff2,349008


In [None]:
# Most frequent site_id in the value_counts() output; rows carrying it are
# treated as "app" rows, all other rows as "site" rows.
dummy_site_id = '85f751fd'


def make_app_frame(df):
  """Return the rows of df whose site_id is dummy_site_id, without site columns.

  The input frame is left untouched; the three site_* columns are dropped
  from the filtered copy only.
  """
  site_columns = ['site_id', 'site_domain', 'site_category']
  selected = df.filter_by([dummy_site_id], column_name='site_id')
  selected.remove_columns(site_columns, inplace=True)
  return selected

def make_site_frame(df):
  """Return the rows of df whose site_id is NOT dummy_site_id, without app columns.

  Complement of make_app_frame: the filter is inverted with exclude=True and
  the three app_* columns are dropped from the filtered copy.
  """
  app_columns = ['app_id', 'app_domain', 'app_category']
  selected = df.filter_by([dummy_site_id], column_name='site_id', exclude=True)
  selected.remove_columns(app_columns, inplace=True)
  return selected

def save_frames(frames_to_names, frame_type):
  """Save every frame in the mapping to Drive in binary format.

  frames_to_names maps a frame object to its short name; each frame is
  written to '<Colab Notebooks>/<frame_type>_<name>_frame'.
  """
  for frame, name in frames_to_names.items():
    target = '/content/drive/My Drive/Colab Notebooks/{}_{}_frame'.format(frame_type, name)
    frame.save(target, format='binary')

def load_frames(frame_names, frame_type):
  """Load '<frame_type>_<name>_frame' from Drive for each name, in order.

  Returns a list with one loaded frame per entry of frame_names.
  """
  loaded = []
  for name in frame_names:
    source = '/content/drive/My Drive/Colab Notebooks/{}_{}_frame'.format(frame_type, name)
    loaded.append(load_sframe(source))
  return loaded

In [None]:
# Build the "app" subset of the training data and split it 85/15 into
# train/validation. The split is seeded so that a re-run reproduces the same
# partition (without a seed every run yields different train/valid rows).
app_train_frame = make_app_frame(data_frame)
app_train_frame, app_valid_frame = app_train_frame.random_split(0.85, seed=42)
app_test_frame = make_app_frame(test_frame)

In [None]:
# Quick look at one row of the train/valid app frames and two rows of the test frame.
print(app_train_frame.head(1),app_valid_frame.head(1),app_test_frame.head(2))

In [None]:
# (rows, columns) of each app frame; in the recorded output the test frame has
# one column fewer than the train/valid frames.
print(app_train_frame.shape, app_valid_frame.shape, app_test_frame.shape)

(12405734, 20) (2190403, 20) (1719304, 19)


In [None]:
# Checkpoint the three app frames as app_train_frame / app_valid_frame / app_test_frame.
app_frames = {
    app_train_frame: 'train',
    app_valid_frame: 'valid',
    app_test_frame: 'test'
}
# Pass the mapping defined above; the previous call referenced an undefined
# name (`frames`) and raised NameError on a fresh kernel.
save_frames(app_frames, 'app')

In [None]:
# Reload the three app-frame checkpoints; load_frames returns them in the order requested.
app_train_frame, app_valid_frame, app_test_frame = load_frames(['train', 'valid', 'test'], 'app')

In [None]:
# Build the "site" subset of the training data and split it 85/15 into
# train/validation. The split is seeded so that a re-run reproduces the same
# partition (without a seed every run yields different train/valid rows).
site_train_frame = make_site_frame(data_frame)
print('site train frame created')
site_train_frame, site_valid_frame = site_train_frame.random_split(0.85, seed=42)
print('site valid frame created')
site_test_frame = make_site_frame(test_frame)
print('site test frame created')

site train frame created
site valid frame created
site test frame created


In [None]:
# Sample rows and (rows, columns) of the three site frames.
print(site_train_frame.head(1), site_valid_frame.head(1), site_test_frame.head(2))
print(site_train_frame.shape, site_valid_frame.shape, site_test_frame.shape)

In [None]:
# Checkpoint the three site frames as site_train_frame / site_valid_frame / site_test_frame.
# The mapping is keyed by the frame objects; the values are the short names used in the paths.
site_frames = {
    site_train_frame: 'train',
    site_valid_frame: 'valid',
    site_test_frame: 'test'
}
save_frames(site_frames, 'site')

In [None]:
# Reload the three site-frame checkpoints; load_frames returns them in the order requested.
site_train_frame, site_valid_frame, site_test_frame = load_frames(['train', 'valid', 'test'], 'site')

## Convert to libffm format

In [None]:
import csv
from bisect import bisect_left


class EncodeDataFrame:
  """Integer-encode the feature columns of train/valid/test frames and export
  them as libffm-style 'label field:index:value' text files.

  Layout assumed by the code: in the training frame column 0 is the row id,
  column 1 is the label, and every column from index 2 on is a feature.  A
  feature's field id is its column index in the training frame.

  Encoding scheme (see encode_feature): training values whose count is below
  `border` share one "rare" code equal to the feature's training column
  index; every other value gets a fresh code taken from a counter that
  starts at 50 and keeps running across features.
  """

  def __init__(self, df, valid_frame, test_frame):
    """Store the three frames and derive the feature-name -> field-id map.

    The frames are kept by reference and are modified in place by
    encode_dataframe().
    """
    self.df = df
    self.valid_frame = valid_frame
    self.test_frame = test_frame
    # Field id of each feature = its column position in the training frame.
    self.dictionary_fields = {
        name: index
        for index, name in enumerate(self.df.column_names())
        if index >= 2
    }
    # Next unused code for a non-rare value. Rare/unseen values use the column
    # index as their code, so this presumably has to stay above the number of
    # columns — NOTE(review): confirm for frames with 50+ columns.
    self.cur_min = 50
    # value -> code map of the feature currently being encoded.
    self.features_dict = {}

  def encode_feature(self, column, border):
    """Encode training column number `column` in place.

    Values with count < border are mapped to `column` (the rare bucket); the
    remaining values get consecutive codes starting at self.cur_min, which is
    advanced accordingly.  The resulting value -> code map is left in
    self.features_dict for the valid/test frames.
    """
    column_name = self.df.column_names()[column]

    self.feature_frame = self.df[column_name].value_counts().sort('count', ascending=True)

    # Counts are sorted ascending, so bisect_left gives the number of values
    # whose count is strictly below `border`.
    last_ind = bisect_left(list(self.feature_frame['count']), border)
    encoding_column = [column] * last_ind
    new_min = self.cur_min + self.feature_frame.shape[0] - last_ind
    encoding_column.extend(range(self.cur_min, new_min))
    self.cur_min = new_min
    print('total unique features values: {}, rejected: {}'.format(self.feature_frame.shape[0], last_ind))
    self.feature_frame.remove_column('count', inplace=True)
    self.feature_frame.add_column(tc.SArray(encoding_column), column_name='encoding', inplace=True)
    self.features_dict = self.feature_frame.to_dataframe().set_index('value').to_dict(orient='dict')['encoding']
    # Bind the mapping to a local so the row function depends only on this
    # feature's dictionary, not on whatever self.features_dict holds later.
    mapping = self.features_dict
    self.df[column_name] = self.df[column_name].apply(lambda x: mapping[x])

  def encode_logger(self, column, x):
    """Return the code of `x` from self.features_dict, or `column` if unseen.

    An unseen value is also reported with print().
    """
    if x in self.features_dict:
      return self.features_dict[x]
    else:
      print('value {} in {} column not found'.format(x, column))
      return column

  def encode_feature_test_valid(self, df, column, rare_code=None):
    """Encode column number `column` of a valid/test frame in place.

    Uses the value -> code map currently in self.features_dict.  Values not
    in the map receive `rare_code`; when rare_code is None the column index
    itself is used, which is only correct if `column` is also the feature's
    index in the training frame.
    """
    column_name = df.column_names()[column]
    fallback = column if rare_code is None else rare_code
    df[column_name] = df[column_name].apply(lambda x: self.encode_logger(fallback, x))

  def encode_dataframe(self, border = 0):
    """Encode every feature column of the train, valid and test frames in place.

    Columns are matched across frames by name.  The test frame has no label
    column, so its column indices are shifted relative to the training
    frame; unseen values in valid/test therefore always fall back to the
    TRAINING column index, i.e. the same rare-bucket code the training data
    uses for that feature.

    Raises:
      ValueError: if a training feature column is missing from the valid or
        test frame.
    """
    for column in range(2, self.df.shape[1]):
      column_name = self.df.column_names()[column]
      self.features_dict = {}
      self.encode_feature(column, border=border)
      print('encode_dataframe:', column, 'column done, cur min =', self.cur_min)

      valid_column = self.valid_frame.column_names().index(column_name)
      self.encode_feature_test_valid(self.valid_frame, valid_column, rare_code=column)
      print('encode_dataframe valid:', valid_column, 'column done.')

      test_column = self.test_frame.column_names().index(column_name)
      self.encode_feature_test_valid(self.test_frame, test_column, rare_code=column)
      print('encode_dataframe test:', test_column, 'column done.')

  def convert_and_save_frame_libffm_format(self, frame, name, frame_type):
    """Write `frame` to 'libffm_format_<frame_type>_<name>.csv' on Drive.

    Each output row is the label followed by one 'field:code:1' token per
    feature, space separated, without a header.  The caller's frame is not
    modified.  A frame without the label column (the test frame) gets a
    constant 0 label so that every feature, including the first one, is
    exported as a feature rather than landing in the label position.
    """
    label_column = self.df.column_names()[1]
    feature_columns = sorted(self.dictionary_fields, key=self.dictionary_fields.get)

    # Selecting columns yields a new frame, so the edits below stay local.
    if label_column in frame.column_names():
      frame = frame[[label_column] + feature_columns]
    else:
      frame = frame[feature_columns]
      frame[label_column] = 0
      frame = frame[[label_column] + feature_columns]

    for column in feature_columns:
      # Bind the per-column prefix as a default argument so the row function
      # depends neither on the loop variable nor on `self`.
      prefix = str(self.dictionary_fields[column]) + ':'
      frame[column] = frame[column].apply(lambda x, prefix=prefix: prefix + str(x) + ':1')
    print(name, 'dataframe converting done')
    # The tokens are strings; export_csv's default quote level would wrap them
    # in double quotes, so quoting is disabled to emit bare libffm tokens.
    frame.export_csv('/content/drive/My Drive/Colab Notebooks/libffm_format_{}_{}.csv'.format(frame_type, name),
                     delimiter=' ', header=False, quote_level=csv.QUOTE_NONE)
    print(name, 'dataframe exported')

  def convert_to_libffm_format(self, frame_type):
    """Export the train, valid and test frames in libffm format."""
    self.convert_and_save_frame_libffm_format(self.df, 'train', frame_type)
    # 'valid' is spelled with ASCII letters only; the earlier literal contained
    # a stray Cyrillic character and produced a mis-named file.
    self.convert_and_save_frame_libffm_format(self.valid_frame, 'valid', frame_type)
    self.convert_and_save_frame_libffm_format(self.test_frame, 'test', frame_type)
      

In [None]:
# The encoder keeps references to these three frames and encodes them in place.
app_encoder = EncodeDataFrame(app_train_frame, app_valid_frame, app_test_frame)

In [None]:
# border=2: training values seen fewer than 2 times share the per-feature rare code.
# Mutates the app frames in place, so run once per freshly loaded set of frames.
app_encoder.encode_dataframe(border=2)

total features: 24, rejected features: 0
encode_dataframe: 2 column done, cur min = 74
encode_dataframe valid: 2 column done.
encode_dataframe test: 1 column done.
total features: 7, rejected features: 0
encode_dataframe: 3 column done, cur min = 81
encode_dataframe valid: 3 column done.
encode_dataframe test: 2 column done.
total features: 6, rejected features: 0
encode_dataframe: 4 column done, cur min = 87
encode_dataframe valid: 4 column done.
encode_dataframe test: 3 column done.
total features: 8291, rejected features: 1655
encode_dataframe: 5 column done, cur min = 6723
encode_dataframe valid: 5 column done.
encode_dataframe test: 4 column done.
total features: 541, rejected features: 129
encode_dataframe: 6 column done, cur min = 7135
encode_dataframe valid: 6 column done.
encode_dataframe test: 5 column done.
total features: 35, rejected features: 3
encode_dataframe: 7 column done, cur min = 7167
encode_dataframe valid: 7 column done.
encode_dataframe test: 6 column done.
tota

In [None]:
# Checkpoint the encoded app frames under the '*_encoded' names.
app_frames = {
    app_train_frame: 'train_encoded',
    app_valid_frame: 'valid_encoded',
    app_test_frame: 'test_encoded'
}
# Pass the mapping defined above; the previous call referenced an undefined
# name (`frames`) and raised NameError on a fresh kernel.
save_frames(app_frames, 'app')

In [None]:
# Export the encoder's train/valid/test frames as libffm_format_app_<name>.csv files.
app_encoder.convert_to_libffm_format('app')

train dataframe converting done
train dataframe exported
valid dataframe converting done
valid dataframe exported
test dataframe converting done
test dataframe exported


In [None]:
# The encoder keeps references to these three frames and encodes them in place.
site_encoder = EncodeDataFrame(site_train_frame, site_valid_frame, site_test_frame)

In [None]:
# border=5: training values seen fewer than 5 times share the per-feature rare code.
# Mutates the site frames in place, so run once per freshly loaded set of frames.
site_encoder.encode_dataframe(border=5)

total unique features values: 24, rejected: 0
encode_dataframe: 2 column done, cur min = 74
encode_dataframe valid: 2 column done.
encode_dataframe test: 1 column done.
total unique features values: 5, rejected: 0
encode_dataframe: 3 column done, cur min = 79
encode_dataframe valid: 3 column done.
encode_dataframe test: 2 column done.
total unique features values: 5, rejected: 0
encode_dataframe: 4 column done, cur min = 84
encode_dataframe valid: 4 column done.
encode_dataframe test: 3 column done.
total unique features values: 4652, rejected: 1185
encode_dataframe: 5 column done, cur min = 3551
encode_dataframe valid: 5 column done.
encode_dataframe test: 4 column done.
total unique features values: 7450, rejected: 3304
encode_dataframe: 6 column done, cur min = 7697
encode_dataframe valid: 6 column done.
encode_dataframe test: 5 column done.
total unique features values: 26, rejected: 3
encode_dataframe: 7 column done, cur min = 7720
encode_dataframe valid: 7 column done.
encode_dat

In [None]:
# Checkpoint the encoded site frames under the '*_encoded' names.
site_frames = {
    site_train_frame: 'train_encoded',
    site_valid_frame: 'valid_encoded',
    site_test_frame: 'test_encoded'
}
save_frames(site_frames, 'site')

In [None]:
# Reload the encoded site frames into the notebook-level names.
# This rebinds only these three names; site_encoder still holds the frame
# objects it was constructed with.
site_train_frame, site_valid_frame, site_test_frame = load_frames(['train_encoded', 'valid_encoded', 'test_encoded'], 'site')

In [None]:
# Sample rows and (rows, columns) of the frames held by the site encoder.
print(site_encoder.df.head(1), site_encoder.valid_frame.head(1), site_encoder.test_frame.head(2))
print(site_encoder.df.shape,  site_encoder.valid_frame.shape, site_encoder.test_frame.shape)

In [None]:
# Export the encoder's train/valid/test frames as libffm_format_site_<name>.csv files.
site_encoder.convert_to_libffm_format('site')

train dataframe converting done
train dataframe exported
valid dataframe converting done
valid dataframe exported
test dataframe converting done
test dataframe exported


## Compose the submission from the site and app predictions

In [None]:
# Predicted probabilities are clamped to [eps, 1 - eps] in get_frame_results.
eps = 10 ** -7
# Same site_id constant as in the frame-splitting section, restated here.
dummy_site_id = '85f751fd'

def get_frame_results(frame_type):
  """Pair the ids of the '<frame_type>_test_frame' checkpoint with predictions.

  Reads the saved test frame for `frame_type`, keeps its 'id' column as a
  pandas DataFrame, and adds a 'click' column taken from column 'X1' of the
  header-less 'result_<frame_type>.csv', with every value clamped to
  [eps, 1 - eps].  Rows are paired by position.
  NOTE(review): assumes the result file has one row per test-frame row, in
  the same order — confirm against the step that produces it.
  """
  frame_path = '/content/drive/My Drive/Colab Notebooks/{frame_type}_test_frame'.format(frame_type=frame_type)
  predictions_path = '/content/drive/My Drive/Colab Notebooks/result_{frame_type}.csv'.format(frame_type=frame_type)

  def clamp(x):
    return min(max(x, eps), 1 - eps)

  ids = load_sframe(frame_path)[['id']]
  results_frame = ids.to_dataframe()
  predictions = tc.SFrame.read_csv(predictions_path,
                                   header=False)['X1']
  results_frame['click'] = predictions.apply(clamp)
  print(frame_type, 'processed')
  return results_frame

def predict():
  """Return one DataFrame with the site results followed by the app results.

  Each part keeps its own index, exactly as DataFrame.append did by default.
  pd.concat is used because DataFrame.append was removed in pandas 2.0.
  """
  site_results = get_frame_results('site')
  app_results = get_frame_results('app')
  return pd.concat([site_results, app_results])

In [None]:
# Combine site and app predictions, then display the resulting DataFrame.
results_frame = predict()
results_frame

In [None]:
# Write the combined id/click table as a gzip-compressed CSV without the index column.
results_frame.to_csv('/content/drive/My Drive/Colab Notebooks/results_frame_ffm.csv.gz', compression='gzip', index=False)