# Intro

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import cyrtranslit

from sklearn import preprocessing, model_selection, metrics
import lightgbm as lgb
color = sns.color_palette()
%matplotlib inline

In [2]:
train = pd.read_csv("https://onedrive.live.com/download?cid=62B3CEE436FDB342&resid=62B3CEE436FDB342%21107&authkey=AEh-8Y6p9SC7FK0",
                      compression='zip', parse_dates=["activation_date"])
test = pd.read_csv("https://onedrive.live.com/download?cid=62B3CEE436FDB342&resid=62B3CEE436FDB342%21106&authkey=AAF_zwBmWjNhNGQ",
                      compression='zip', parse_dates=["activation_date"])

# Model

Since the test data has no labels, we'll validate with two subsets of train. Lastly we'll use test to generate a submission file and get a public score.

## Basic Feature Engineering

- Translate all textual features into latin.
- Create dummies from categorical features.

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503424 entries, 0 to 1503423
Data columns (total 18 columns):
item_id                 1503424 non-null object
user_id                 1503424 non-null object
region                  1503424 non-null object
city                    1503424 non-null object
parent_category_name    1503424 non-null object
category_name           1503424 non-null object
param_1                 1441848 non-null object
param_2                 848882 non-null object
param_3                 640859 non-null object
title                   1503424 non-null object
description             1387148 non-null object
price                   1418062 non-null float64
item_seq_number         1503424 non-null int64
activation_date         1503424 non-null datetime64[ns]
user_type               1503424 non-null object
image                   1390836 non-null object
image_top_1             1390836 non-null float64
deal_probability        1503424 non-null float64
dtypes: datetim

- Title and description have too many unique values, therefore we won't translate them all. 

In [4]:
print('Number of unique regions:',len(train.region.value_counts()))
print('Number of unique cities:',len(train.city.value_counts()))
print('Number of unique parent categories:',len(train.parent_category_name.value_counts()))
print('Number of unique categories:',len(train.category_name.value_counts()))
print('Number of unique descriptions:',len(train.description.value_counts()))
print('Number of unique titles:',len(train.title.value_counts()))
print('Number of unique param_1:',len(train.param_1.value_counts()))
print('Number of unique param_2:',len(train.param_2.value_counts()))
print('Number of unique param_3',len(train.param_3.value_counts()))

Number of unique regions: 28
Number of unique cities: 1733
Number of unique parent categories: 9
Number of unique categories: 47
Number of unique descriptions: 1317102
Number of unique titles: 788377
Number of unique param_1: 371
Number of unique param_2: 271
Number of unique param_3 1219


## Iterative Translation

In [5]:
# Description and title have too many unique values,
# Therefore this method would take too long.
cyr_vars = ['region','city','parent_category_name','category_name',
           'param_1','param_2','param_3']

for var in cyr_vars:
    for dataset in [train,test]:
        # Get unique cyrilic vlaues
        cyrilic_unique = np.unique(dataset[var].fillna('Blank')).tolist()
        # Get unique latin translations
        latin_unique = [cyrtranslit.to_latin(string,'ru') for string in cyrilic_unique]

        # Put lists in a dictionary
        trans_dict = {}
        for cyr, lat in zip(cyrilic_unique,latin_unique):
            trans_dict[cyr]=lat

        # Create a translated list
        en_list = []
        for key in dataset[var].fillna('Blank'):
            en_list.append(trans_dict[key])

        # Add english list as column
        dataset[str(var)+'_en'] = en_list

In [6]:
# See latin translations
print('Train Data:')
display(train.iloc[:3,-7:])
print("")
print('Test Data:')
display(test.iloc[:3,-7:])

Train Data:


Unnamed: 0,region_en,city_en,parent_category_name_en,category_name_en,param_1_en,param_2_en,param_3_en
0,Sverdlovskaja oblast',Ekaterinburg,Lichnye veszi,Tovary dlja detej i igrushki,Postel'nye prinadlezhnosti,Blank,Blank
1,Samarskaja oblast',Samara,Dlja doma i dachi,Mebel' i inter'er,Drugoe,Blank,Blank
2,Rostovskaja oblast',Rostov-na-Donu,Bytovaja ehlektronika,Audio i video,"Video, DVD i Blu-ray pleery",Blank,Blank



Test Data:


Unnamed: 0,region_en,city_en,parent_category_name_en,category_name_en,param_1_en,param_2_en,param_3_en
0,Volgogradskaja oblast',Volgograd,Lichnye veszi,Detskaja odezhda i obuv',Dlja mal'chikov,Obuv',25
1,Sverdlovskaja oblast',Nizhnjaja Tura,Hobbi i otdyh,Velosipedy,Dorozhnye,Blank,Blank
2,Novosibirskaja oblast',Berdsk,Bytovaja ehlektronika,Audio i video,Televizory i proektory,Blank,Blank


## Dummies
### City Features
- The number of unique cities in train is too large (1700) and leads to `MemoryError`. Let's pick a subset of those cities. The most useful might be those which appear in both train and test. Cities appearing only in train won't help predict anything in test that regards the city variable, and listings from cities which appear only in test can't be predicted with train information.

In [7]:
# Find which cities are in both train and test sets
test_unique = test.city_en.unique()
train_unique = train.city_en.unique()

common = [city for city in train_unique if city in test_unique]

print('Cities in both train and test:',len(common))

Cities in both train and test: 1625


- 1625 is still too many cities for feature purposes. Let's find which of those are the most popular among both sets. 

In [8]:
# Create features from the most popular cities in common set

# Get common cities in train and test
train_common = train[train.city_en.apply(lambda x: x in common)].city_en
test_common = test[test.city_en.apply(lambda x: x in common)].city_en

# Merge sets of common cities
traintest_common = train_common.append(test_common)
print('Listings in both sets from cities that appear in both sets:',traintest_common.shape[0])

# Count values among merged set
top_common = traintest_common.value_counts()[:300].keys().tolist()

Listings in both sets from cities that appear in both sets: 2011447


In [21]:
# Here we'll store our machine learning features
train_features = pd.DataFrame(index=train.index)
test_features = pd.DataFrame(index=test.index)

In [22]:
len(top_common)

300

In [23]:
for city in top_common:
    train_features[str(city)] = np.where(train.city_en == city,1,0)

MemoryError: 

### Features: Param_1, 2 & 3
- Features from these are challenging because some values repeat in more than one of these variables. Therefore when creating dummies, column names overlap. Adding prefixes helps but some values exist only in train or test set, therefore each set ends with a different number of features. The solution is to gather a custom list of values to create features from.

### Features: Region, Parent Category, Category
- These can be done without as much juggling as `city` or `param_x`

In [12]:


# Categorical variables for dummies, except city.
cat_vars = train.columns[-7:].drop('city_en')

# Get dummies in both train and test
for var in cat_vars:
    for dataset in [train,test]:
        dummies = pd.get_dummies(dataset[var],prefix=var)
        if len(dataset) == len(train):
            train_features = train_features.join(dummies)
        elif len(dataset) == len(test):
            test_features = test_features.join(dummies)

In [15]:
print('Train features:',train_features.shape[1])
print('Test features:',test_features.shape[1])

Train features: 1948
Test features: 1676


In [31]:
train_f = train_features.columns
test_f = test_features.columns

In [54]:
selection = pd.Series(train_f.isin(test_f)).apply(lambda x: not x)

In [55]:
train_f[selection]

Index(['param_1_en_Bobtejl', 'param_1_en_Francuzskaja ovcharka',
       'param_1_en_JAponskij bobtejl', 'param_1_en_La-perm laperm',
       'param_1_en_Leonberger', 'param_1_en_Lhasa Apso',
       'param_1_en_Norvich ter'er', 'param_1_en_Selkirk-reks',
       'param_1_en_Skotch-ter'er', 'param_1_en_SkyLink',
       ...
       'param_3_en_Xsara', 'param_3_en_Z3', 'param_3_en_Z4', 'param_3_en_ZX',
       'param_3_en_Zest', 'param_3_en_i10', 'param_3_en_i3', 'param_3_en_tC',
       'param_3_en_xA', 'param_3_en_xB'],
      dtype='object', length=335)

In [None]:
train_features = train[['price_fill','item_seq_number','activation_date''image_top_1','deal_probability']].copy()
test_features = test[['price_fill','item_seq_number','activation_date''image_top_1']].copy()

cat_vars = ["region", "city", "parent_category_name", "category_name", "user_type", "param_1", "param_2", "param_3"]
for col in cat_vars:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train[col].values.astype('str')) + list(test[col].values.astype('str')))
    train_features[col] = lbl.transform(list(train[col].values.astype('str')))
    test_features[col] = lbl.transform(list(test[col].values.astype('str')))

In [None]:
y_train = train["deal_probability"].values

# Label encode the categorical variables #
cat_vars = ["region", "city", "parent_category_name", "category_name", "user_type", "param_1", "param_2", "param_3"]
for col in cat_vars:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_df[col].values.astype('str')) + list(test_df[col].values.astype('str')))
    train_df[col] = lbl.transform(list(train_df[col].values.astype('str')))
    test_df[col] = lbl.transform(list(test_df[col].values.astype('str')))

cols_to_drop = ["item_id", "user_id", "title", "description", "activation_date", "image"]
train_X = train_df.drop(cols_to_drop + ["region_en", "parent_category_name_en", "category_name_en", "price_new", "deal_probability"], axis=1)
test_X = test_df.drop(cols_to_drop, axis=1)

In [None]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 30,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.7,
        "bagging_frequency" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20, evals_result=evals_result)
    
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

## Add more Features

- (Feature) has description
- (Feature) has photo
- (Feature) has param 1,2,3
- (Feature) has price
- (Feature) word count in title, description,
- (Feature) population of region
- (Feature) string value is unique. title, description, param1, etc...

In [None]:
# New variable on weekday #
#train["activation_weekday"] = train["activation_date"].dt.weekday
#test["activation_weekday"] = test["activation_date"].dt.weekday
