# Intro

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import cyrtranslit
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model
from sklearn.pipeline import Pipeline
import lightgbm as lgb
color = sns.color_palette()
%matplotlib inline

In [4]:
# Both archives are zipped CSVs that share the same reader options; the
# activation date is parsed into a datetime column on load.
_read_opts = dict(compression='zip', parse_dates=["activation_date"])
train = pd.read_csv(
    "https://onedrive.live.com/download?cid=62B3CEE436FDB342&resid=62B3CEE436FDB342%21107&authkey=AEh-8Y6p9SC7FK0",
    **_read_opts
)
test = pd.read_csv(
    "https://onedrive.live.com/download?cid=62B3CEE436FDB342&resid=62B3CEE436FDB342%21106&authkey=AAF_zwBmWjNhNGQ",
    **_read_opts
)
del _read_opts

# Model

Since the test data has no labels, we'll validate with two subsets of train. Lastly we'll use test to generate a submission file and get a public score.

## Basic Feature Engineering

- Transliterate the Cyrillic categorical text features into Latin script (title and description are left untouched, see below).
- Create dummies from categorical features.

In [5]:
# Summarise the training frame: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503424 entries, 0 to 1503423
Data columns (total 18 columns):
item_id                 1503424 non-null object
user_id                 1503424 non-null object
region                  1503424 non-null object
city                    1503424 non-null object
parent_category_name    1503424 non-null object
category_name           1503424 non-null object
param_1                 1441848 non-null object
param_2                 848882 non-null object
param_3                 640859 non-null object
title                   1503424 non-null object
description             1387148 non-null object
price                   1418062 non-null float64
item_seq_number         1503424 non-null int64
activation_date         1503424 non-null datetime64[ns]
user_type               1503424 non-null object
image                   1390836 non-null object
image_top_1             1390836 non-null float64
deal_probability        1503424 non-null float64
dtypes: datetim

- Title and description have too many unique values, therefore we won't translate them all. 

In [6]:
# Report the number of distinct non-null values of every text column.
# Each entry pairs the printed label with the column it describes.
for _label, _column in [
    ('Number of unique regions:', 'region'),
    ('Number of unique cities:', 'city'),
    ('Number of unique parent categories:', 'parent_category_name'),
    ('Number of unique categories:', 'category_name'),
    ('Number of unique descriptions:', 'description'),
    ('Number of unique titles:', 'title'),
    ('Number of unique param_1:', 'param_1'),
    ('Number of unique param_2:', 'param_2'),
    ('Number of unique param_3', 'param_3'),
]:
    print(_label, len(train[_column].value_counts()))

Number of unique regions: 28
Number of unique cities: 1733
Number of unique parent categories: 9
Number of unique categories: 47
Number of unique descriptions: 1317102
Number of unique titles: 788377
Number of unique param_1: 371
Number of unique param_2: 271
Number of unique param_3 1219


## Iterative Translation

In [7]:
# Transliterate the low-cardinality Cyrillic columns to Latin script.
# Description and title have too many unique values, so they are skipped.
cyr_vars = ['region','city','parent_category_name','category_name',
           'param_1','param_2','param_3']

for var in cyr_vars:
    for dataset in [train,test]:
        # Missing values become the literal category 'Blank'
        filled = dataset[var].fillna('Blank')

        # Transliterate every distinct value once ...
        trans_dict = {cyr: cyrtranslit.to_latin(cyr,'ru') for cyr in filled.unique()}

        # ... then map the whole column through the lookup table instead of
        # appending row by row in a Python loop.
        dataset[str(var)+'_en'] = filled.map(trans_dict)

        # The original Cyrillic column is replaced by its '_en' counterpart
        dataset.drop(var,axis=1,inplace=True)

del filled,trans_dict,dataset,cyr_vars,var

In [8]:
# Preview the first three rows of the seven transliterated columns
# (they were appended last, hence the negative column slice).
for _position, (_caption, _frame) in enumerate([('Train Data:', train), ('Test Data:', test)]):
    if _position:
        # blank line between the two previews
        print("")
    print(_caption)
    display(_frame.iloc[:3, -7:])

Train Data:


Unnamed: 0,region_en,city_en,parent_category_name_en,category_name_en,param_1_en,param_2_en,param_3_en
0,Sverdlovskaja oblast',Ekaterinburg,Lichnye veszi,Tovary dlja detej i igrushki,Postel'nye prinadlezhnosti,Blank,Blank
1,Samarskaja oblast',Samara,Dlja doma i dachi,Mebel' i inter'er,Drugoe,Blank,Blank
2,Rostovskaja oblast',Rostov-na-Donu,Bytovaja ehlektronika,Audio i video,"Video, DVD i Blu-ray pleery",Blank,Blank



Test Data:


Unnamed: 0,region_en,city_en,parent_category_name_en,category_name_en,param_1_en,param_2_en,param_3_en
0,Volgogradskaja oblast',Volgograd,Lichnye veszi,Detskaja odezhda i obuv',Dlja mal'chikov,Obuv',25
1,Sverdlovskaja oblast',Nizhnjaja Tura,Hobbi i otdyh,Velosipedy,Dorozhnye,Blank,Blank
2,Novosibirskaja oblast',Berdsk,Bytovaja ehlektronika,Audio i video,Televizory i proektory,Blank,Blank


## Dummies
### City Features
- The number of unique cities in train is too large (1700) and leads to `MemoryError`. Let's pick a subset of those cities. The most useful might be those which appear in both train and test. Cities appearing only in train won't help predict anything in test that regards the city variable, and listings from cities which appear only in test can't be predicted with train information.

In [9]:
# Find which cities are in both train and test sets.
# The test values go into a set so each membership check is O(1).
train_unique = train.city_en.unique()
test_unique = set(test.city_en.unique())

common = [city for city in train_unique if city in test_unique]
del test_unique,train_unique

print('Unique comon cities:',len(common))

# Create features from the most popular cities in common set

# Listings whose city appears in both sets. `isin` is vectorised, unlike a
# per-row `apply(lambda x: x in common)` scan of a Python list.
train_common = train.loc[train.city_en.isin(common), 'city_en']
test_common = test.loc[test.city_en.isin(common), 'city_en']

# Stack both selections (`Series.append` no longer exists in pandas 2.x)
traintest_common = pd.concat([train_common, test_common])
print('N Listings in both sets from unique common cities:',traintest_common.shape[0])

# The 150 most frequent cities of the combined set become dummy candidates
top_common = traintest_common.value_counts().iloc[:150].index.tolist()
del common,train_common,test_common,traintest_common

Unique comon cities: 1625
N Listings in both sets from unique common cities: 2011447


In [10]:
# Here we'll store our machine learning features.
# Starts as an empty frame that shares train's index; later cells add dummy columns to it.
train_features = pd.DataFrame(index=train.index)

In [11]:
# One 0/1 indicator column per candidate city, named after the city itself.
_train_city = train['city_en']
for city in top_common:
    train_features[str(city)] = np.where(_train_city.eq(city), 1, 0)
del top_common, _train_city

### Feature Selection of Cities

### Variance Threshold

In [12]:
# Count the city dummies whose variance exceeds 0.01; that count is used
# as the feature budget by the selectors compared in the next cell.
selector = feature_selection.VarianceThreshold(threshold=0.01)
selector.fit(train_features)
n_features = int(selector.get_support().sum())
print('Number of Features above 0.01 Variance:', n_features)

Number of Features above 0.01 Variance: 29


- Variance Threshold doesn't take the target variable into account. At a threshold of 0.01, it keeps only 29 of the city dummies.

### Compare Feature Selection Techniques
- Feature Importances, Lasso L1, F Regression

In [14]:
# Three candidate selectors, each limited to n_features columns
# (the number of dummies that passed the variance threshold).
estimator1 = ensemble.ExtraTreesRegressor()
selector1 = feature_selection.SelectFromModel(
    estimator1,
    threshold=-np.inf,  # no importance cut-off: only max_features limits the selection
    max_features=n_features
)

# NOTE(review): with the default alpha=1.0 Lasso may shrink every dummy
# coefficient to zero, making this selection arbitrary -- confirm.
estimator2 = linear_model.Lasso()
selector2 = feature_selection.SelectFromModel(
    estimator2,
    threshold=-np.inf,
    max_features=n_features
)

estimator3 = feature_selection.f_regression
selector3 = feature_selection.SelectKBest(
    estimator3,
    k=n_features
)

# Each selector feeds the same plain linear regression; the grid search
# swaps the 'reduce_dim' step and compares the cross-validated scores.
model = linear_model.LinearRegression()

pipe = Pipeline([('reduce_dim', None),('regression', model)])

grid = {'reduce_dim': [selector1,selector2,selector3]}

search = model_selection.GridSearchCV(pipe, cv=4, n_jobs=-1, param_grid=grid)

# Subsample 1/7 of the rows for speed. Draw WITHOUT replacement so no row is
# duplicated across CV folds, and seed the draw so the cell is reproducible.
index = np.random.RandomState(2018).choice(len(train), size=len(train) // 7, replace=False)
search.fit(train_features.iloc[index],train.iloc[index].deal_probability)

# SelectFromModel exposes `.estimator`; SelectKBest does not, so fall back
# to printing the selector itself instead of raising AttributeError.
best_step = search.best_estimator_.named_steps['reduce_dim']
print(getattr(best_step, 'estimator', best_step))
del index, best_step

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)


- Lasso returned the best features, based on the LinearRegression default score.

In [16]:
# Take the winning selector out of the fitted pipeline ...
best = search.best_estimator_.named_steps['reduce_dim']
# ... and refit it on every row rather than on the subsample
best.fit(train_features, train['deal_probability'])
# Retain only the columns flagged by the selector's support mask
train_features = train_features[train_features.columns[best.get_support()]]
# The raw city column has served its purpose in train
del train['city_en']

display(train_features.head())

Unnamed: 0,Ekaterinburg,Krasnodar,Novosibirsk,Nizhnij Novgorod,Rostov-na-Donu,CHeljabinsk,Kazan',Perm',Samara,Ufa,...,Irkutsk,Orenburg,Izhevsk,Sochi,Tol'jatti,Kemerovo,Belgorod,Tula,Naberezhnye CHelny,Stavropol'
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Mirror the selected city indicators on the test set: same column names,
# same order, computed from test's own city column.
test_features = pd.DataFrame(index=test.index)
features = list(train_features.columns)

for city in features:
    test_features[str(city)] = np.where(test['city_en'].eq(city), 1, 0)
# The raw city column is no longer needed in test either
del test['city_en']

### Features: Param_1, 2 & 3
- Features from these are challenging because some values repeat in more than one of these variables. Therefore when creating dummies, column names overlap. Adding prefixes helps but some values exist only in train or test set, therefore each set ends with a different number of features. The solution is to gather a custom list of values to create features from.

### Param_1

In [79]:
# Find popular common values, create dummies, and get n_features

# Find unique common values in both sets (set lookup keeps this O(n))
var = 'param_1_en'
train_unique = train[var].unique()
test_unique = set(test[var].unique())
common = [unique for unique in train_unique if unique in test_unique]
del test_unique,train_unique

# Find the top popular of those common values.
# `isin` replaces the per-row list scan and `pd.concat` replaces
# `Series.append`, which no longer exists in pandas 2.x.
train_common = train.loc[train[var].isin(common), var]
test_common = test.loc[test[var].isin(common), var]
traintest_common = pd.concat([train_common, test_common])
top_common = traintest_common.value_counts().iloc[:150].index.tolist()
del common,train_common,test_common,traintest_common

# Make dummies from top common values (one 0/1 column per value)
new_features = pd.DataFrame(index=train.index)
for common in top_common:
    new_features[str(common)] = np.where(train[var] == common,1,0)
del top_common

# How many dummies > 1% variance? This count is the selectors' feature budget.
selector = feature_selection.VarianceThreshold(0.01)
n_features = selector.fit_transform(new_features).shape[1]
print('Number of Features above 0.01 Variance:', n_features)

Number of Features above 0.01 Variance: 18


In [80]:
# Compare feature selection methods, each limited to n_features columns
estimator1 = ensemble.ExtraTreesRegressor()
selector1 = feature_selection.SelectFromModel(
    estimator1,
    threshold=-np.inf,  # no importance cut-off: only max_features limits the selection
    max_features=n_features
)
estimator2 = linear_model.Lasso()
selector2 = feature_selection.SelectFromModel(
    estimator2,
    threshold=-np.inf,
    max_features=n_features
)
estimator3 = feature_selection.f_regression
selector3 = feature_selection.SelectKBest(
    estimator3,
    k=n_features
)
# Based on generic linearregression, which feature-set has the best score?
model = linear_model.LinearRegression()
pipe = Pipeline([('reduce_dim', None),('regression', model)])
grid = {'reduce_dim': [selector1,selector2,selector3]}
search = model_selection.GridSearchCV(pipe, cv=4, n_jobs=-1, param_grid=grid)

# Undersample data for quick testing. Draw WITHOUT replacement so no row is
# duplicated across CV folds, and seed the draw so the cell is reproducible.
index = np.random.RandomState(2018).choice(len(train), size=len(train) // 7, replace=False)
search.fit(new_features.iloc[index],train.iloc[index].deal_probability)
best = search.best_estimator_.named_steps['reduce_dim']
print(best)
del index



SelectFromModel(estimator=ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
          oob_score=False, random_state=None, verbose=0, warm_start=False),
        max_features=18, norm_order=1, prefit=False, threshold=-inf)


- Extra trees regressor gave the best 18 features.
- In the next step, I won't fit all the data again because ensemble methods are time consuming.

In [94]:
# Restrict the candidate dummies to the columns the winning selector kept
_kept = best.get_support()
new_features = new_features[new_features.columns[_kept]]
del _kept
# Append them to the accumulated training features
train_features = train_features.join(new_features)
# The source column is finished in train
del train[var]

features = list(new_features.columns)

# Rebuild the same indicators on the test set, then retire its source column
for f in features:
    test_features[str(f)] = np.where(test[var].eq(f), 1, 0)
del test[var]

### Param_2

In [101]:
# Find popular common values, create dummies, and get n_features

# Find unique common values in both sets (set lookup keeps this O(n))
var = 'param_2_en'
train_unique = train[var].unique()
test_unique = set(test[var].unique())
common = [unique for unique in train_unique if unique in test_unique]
del test_unique,train_unique

# Find the top popular of those common values.
# `isin` replaces the per-row list scan and `pd.concat` replaces
# `Series.append`, which no longer exists in pandas 2.x.
train_common = train.loc[train[var].isin(common), var]
test_common = test.loc[test[var].isin(common), var]
traintest_common = pd.concat([train_common, test_common])
top_common = traintest_common.value_counts().iloc[:50].index.tolist()
del common,train_common,test_common,traintest_common

# Make dummies from top common values (one 0/1 column per value)
new_features = pd.DataFrame(index=train.index)
for common in top_common:
    new_features[str(common)] = np.where(train[var] == common,1,0)
del top_common

# How many dummies > 1% variance? This count is the selectors' feature budget.
selector = feature_selection.VarianceThreshold(0.01)
n_features = selector.fit_transform(new_features).shape[1]
print('Number of Features above 0.01 Variance:', n_features)

Number of Features above 0.01 Variance: 13


In [102]:
# Compare feature selection methods, each limited to n_features columns
estimator1 = ensemble.ExtraTreesRegressor()
selector1 = feature_selection.SelectFromModel(
    estimator1,
    threshold=-np.inf,  # no importance cut-off: only max_features limits the selection
    max_features=n_features
)
estimator2 = linear_model.Lasso()
selector2 = feature_selection.SelectFromModel(
    estimator2,
    threshold=-np.inf,
    max_features=n_features
)
estimator3 = feature_selection.f_regression
selector3 = feature_selection.SelectKBest(
    estimator3,
    k=n_features
)
# Based on generic linearregression, which feature-set has the best score?
model = linear_model.LinearRegression()
pipe = Pipeline([('reduce_dim', None),('regression', model)])
grid = {'reduce_dim': [selector1,selector2,selector3]}
search = model_selection.GridSearchCV(pipe, cv=4, n_jobs=-1, param_grid=grid)

# Undersample data for quick testing. Draw WITHOUT replacement so no row is
# duplicated across CV folds, and seed the draw so the cell is reproducible.
index = np.random.RandomState(2018).choice(len(train), size=len(train) // 7, replace=False)
search.fit(new_features.iloc[index],train.iloc[index].deal_probability)
best = search.best_estimator_.named_steps['reduce_dim']
print(best)
del index

SelectKBest(k=13, score_func=<function f_regression at 0x7fbd61ef6620>)


In [103]:
# Positions of the dummies that the winning selector retained
_chosen = np.flatnonzero(best.get_support())
new_features = new_features.iloc[:, _chosen]
del _chosen
# Grow the training feature matrix with the retained dummies
train_features = train_features.join(new_features)
# Source column is no longer needed in train
train.drop(columns=[var], inplace=True)

features = new_features.columns.tolist()

# Same indicator columns for the test set, then drop its source column
_test_values = test[var]
for f in features:
    test_features[str(f)] = np.where(_test_values == f, 1, 0)
del _test_values
test.drop(columns=[var], inplace=True)

### Param_3

In [107]:
# Find popular common values, create dummies, and get n_features

# Find unique common values in both sets (set lookup keeps this O(n))
var = 'param_3_en'
train_unique = train[var].unique()
test_unique = set(test[var].unique())
common = [unique for unique in train_unique if unique in test_unique]
del test_unique,train_unique

# Find the top popular of those common values.
# `isin` replaces the per-row list scan and `pd.concat` replaces
# `Series.append`, which no longer exists in pandas 2.x.
train_common = train.loc[train[var].isin(common), var]
test_common = test.loc[test[var].isin(common), var]
traintest_common = pd.concat([train_common, test_common])
top_common = traintest_common.value_counts().iloc[:50].index.tolist()
del common,train_common,test_common,traintest_common

# Make dummies from top common values (one 0/1 column per value)
new_features = pd.DataFrame(index=train.index)
for common in top_common:
    new_features[str(common)] = np.where(train[var] == common,1,0)
del top_common

# How many dummies > 1% variance? This count is the selectors' feature budget.
selector = feature_selection.VarianceThreshold(0.01)
n_features = selector.fit_transform(new_features).shape[1]
print('Number of Features above 0.01 Variance:', n_features)

Number of Features above 0.01 Variance: 12


In [108]:
# Compare feature selection methods, each limited to n_features columns
estimator1 = ensemble.ExtraTreesRegressor()
selector1 = feature_selection.SelectFromModel(
    estimator1,
    threshold=-np.inf,  # no importance cut-off: only max_features limits the selection
    max_features=n_features
)
estimator2 = linear_model.Lasso()
selector2 = feature_selection.SelectFromModel(
    estimator2,
    threshold=-np.inf,
    max_features=n_features
)
estimator3 = feature_selection.f_regression
selector3 = feature_selection.SelectKBest(
    estimator3,
    k=n_features
)
# Based on generic linearregression, which feature-set has the best score?
model = linear_model.LinearRegression()
pipe = Pipeline([('reduce_dim', None),('regression', model)])
grid = {'reduce_dim': [selector1,selector2,selector3]}
search = model_selection.GridSearchCV(pipe, cv=4, n_jobs=-1, param_grid=grid)

# Undersample data for quick testing. Draw WITHOUT replacement so no row is
# duplicated across CV folds, and seed the draw so the cell is reproducible.
index = np.random.RandomState(2018).choice(len(train), size=len(train) // 7, replace=False)
search.fit(new_features.iloc[index],train.iloc[index].deal_probability)
best = search.best_estimator_.named_steps['reduce_dim']
print(best)
del index

SelectKBest(k=12, score_func=<function f_regression at 0x7fbd61ef6620>)


In [113]:
# Keep the best features
new_features = new_features.loc[:,best.get_support()]
features = new_features.columns.tolist()

# A value such as 'Blank' can already be a column created from an earlier
# variable. Those clashing names get the variable name appended (the same
# names `join(..., rsuffix=var)` produced), and the SAME mapping is used for
# the test set below so train and test end up with identical columns.
taken = set(train_features.columns)
names = {f: (str(f) + var if f in taken else str(f)) for f in features}

# Add new features to train_features
train_features = train_features.join(new_features.rename(columns=names))
# Variable no longer needed
train.drop(var,axis=1,inplace=True)

# Create features on the test set under the matching (possibly suffixed)
# name, so an existing test column is never silently overwritten.
for f in features:
    test_features[names[f]] = np.where(test[var] == f,1,0)
test.drop(var,axis=1,inplace=True)
del taken,names

### Features: Region, Parent Category, Category
- Since these have very few unique values compared to `city` or `param_x`, I'll filter them out at once.

In [127]:
# For each remaining categorical column, print how many distinct values
# it takes in train and in test.
cat_vars = ['region_en','parent_category_name_en','category_name_en']
for var in cat_vars:
    for _header, _frame in (('\nTRAIN', train), ('TEST', test)):
        print(_header)
        print('Unique values for {}: '.format(var), len(np.unique(_frame[var])))



TRAIN
Unique values for region_en:  28
TEST
Unique values for region_en:  28

TRAIN
Unique values for parent_category_name_en:  9
TEST
Unique values for parent_category_name_en:  9

TRAIN
Unique values for category_name_en:  47
TEST
Unique values for category_name_en:  47


- Train and test have the same number of unique values for each of these variables, so we assume they share the same categories and skip the search for common values.

In [130]:
# Region, parent category and category are expanded together (city was handled earlier).
cat_vars = ['region_en','parent_category_name_en','category_name_en']

new_features = pd.DataFrame(index=train.index)

for var in cat_vars:
    _train_column = train[var]
    # The candidate values come from the test set's distinct values
    for f in np.unique(test[var]):
        new_features[str(f)] = np.where(_train_column == f, 1, 0)
del _train_column

# Count the indicators whose variance exceeds 1%
selector = feature_selection.VarianceThreshold(threshold=0.01)
n_features = selector.fit_transform(new_features).shape[1]
print('Number of Features above 0.01 Variance:', n_features)

Number of Features above 0.01 Variance: 56


- 56 out of 84 new features will be kept.

In [145]:
# Compare feature selection methods.
# Reuses selector1/2/3 from the previous comparison cell and only updates
# their feature budget to the new n_features.
selector1.max_features,selector2.max_features,selector3.k = np.repeat(n_features,3)

# Based on generic linearregression, which feature-set has the best score?
model = linear_model.LinearRegression()
pipe = Pipeline([('reduce_dim', None),('regression', model)])
grid = {'reduce_dim': [selector1,selector2,selector3]}
search = model_selection.GridSearchCV(pipe, cv=4, n_jobs=-1, param_grid=grid)

# Undersample data for quick testing. Draw WITHOUT replacement so no row is
# duplicated across CV folds, and seed the draw so the cell is reproducible.
index = np.random.RandomState(2018).choice(len(train), size=len(train) // 7, replace=False)
search.fit(new_features.iloc[index],train.iloc[index].deal_probability)
best = search.best_estimator_.named_steps['reduce_dim']
print(best)
del index

SelectKBest(k=56, score_func=<function f_regression at 0x7fbd61ef6620>)


In [146]:
# Keep the best features
new_features = new_features.loc[:,best.get_support()]
# Add new features to train_features
train_features = train_features.join(new_features)
# Variable no longer needed
for var in cat_vars:
    train.drop(var,axis=1,inplace=True)

features = new_features.columns.tolist()

# Build the test dummies one source variable at a time. Each dummy is only
# computed from the variable its value belongs to. Looping over ALL features
# for every variable would recompute each column against the wrong variable,
# leaving the region / parent-category dummies all zero after the last pass.
for var in cat_vars:
    var_values = set(test[var].unique())
    for f in features:
        if f in var_values:
            # Make dummies
            test_features[str(f)] = np.where(test[var] == f,1,0)
    test.drop(var,axis=1,inplace=True)
del var_values

## Price Feature

In [147]:
import sys

# Names that IPython itself puts into the namespace, plus this list,
# so that they are left out of the report below.
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Pair every remaining global name with sys.getsizeof(...) of its object and
# show the largest first. Skipped: names starting with "_", names that are
# also keys of sys.modules, and the IPython names listed above.
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('train_features', 1539506280),
 ('train', 1377250683),
 ('new_features', 673534056),
 ('test_features', 516573112),
 ('test', 478668977),
 ('Pipeline', 1056),
 ('features', 512),
 ('grid', 240),
 ('color', 176),
 ('estimator3', 136),
 ('city', 103),
 ('cat_vars', 88),
 ('ensemble', 80),
 ('feature_selection', 80),
 ('lgb', 80),
 ('linear_model', 80),
 ('metrics', 80),
 ('model_selection', 80),
 ('np', 80),
 ('pd', 80),
 ('plt', 80),
 ('preprocessing', 80),
 ('sns', 80),
 ('f', 67),
 ('var', 65),
 ('common', 63),
 ('best', 56),
 ('estimator1', 56),
 ('estimator2', 56),
 ('model', 56),
 ('pipe', 56),
 ('search', 56),
 ('selector', 56),
 ('selector1', 56),
 ('selector2', 56),
 ('selector3', 56),
 ('key', 54),
 ('a', 32),
 ('b', 32),
 ('c', 32),
 ('d', 28),
 ('n_features', 28)]

In [152]:
# Columns still present in train after the categorical columns were dropped.
train.columns

Index(['item_id', 'user_id', 'title', 'description', 'price',
       'item_seq_number', 'activation_date', 'user_type', 'image',
       'image_top_1', 'deal_probability'],
      dtype='object')

In [None]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    """Train a LightGBM regressor with early stopping and predict on test_X.

    Parameters
    ----------
    train_X, train_y : features and target used for fitting.
    val_X, val_y : features and target of the validation set that drives
        early stopping.
    test_X : features to predict once training has finished.

    Returns
    -------
    (pred_test_y, model, evals_result) : the predictions for test_X made at
    the best iteration, the trained booster, and the dict that LightGBM
    fills with the recorded validation metric history.
    """
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 30,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.7,
        # LightGBM's key is `bagging_freq`; the former "bagging_frequency"
        # is not a known parameter, so bagging was never actually enabled.
        "bagging_freq" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    # Up to 1000 boosting rounds; stop once the validation metric has not
    # improved for 100 rounds and log every 20 rounds.
    # NOTE(review): newer LightGBM releases expect these three keyword
    # options as callbacks -- confirm against the installed version.
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20, evals_result=evals_result)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

## Add more Features

- (Feature) has description
- (Feature) has photo
- (Feature) has param 1,2,3
- (Feature) has price
- (Feature) word count in title, description,
- (Feature) population of region
- (Feature) string value is unique. title, description, param1, etc...

In [None]:
# New variable on weekday #
#train["activation_weekday"] = train["activation_date"].dt.weekday
#test["activation_weekday"] = test["activation_date"].dt.weekday
