# Intro

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import cyrtranslit
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model
from sklearn.pipeline import Pipeline
import lightgbm as lgb
# Current seaborn colour palette (not used in the cells visible here)
color = sns.color_palette()
# Render matplotlib figures inline in the notebook
%matplotlib inline

In [2]:
# Load the train and test sets from their zipped CSV downloads, parsing the
# activation date column as datetimes.
# NOTE(review): both URLs embed a share auth key -- confirm they are meant to
# be public before distributing this notebook.
train, test = (
    pd.read_csv(url, compression='zip', parse_dates=["activation_date"])
    for url in (
        "https://onedrive.live.com/download?cid=62B3CEE436FDB342&resid=62B3CEE436FDB342%21107&authkey=AEh-8Y6p9SC7FK0",
        "https://onedrive.live.com/download?cid=62B3CEE436FDB342&resid=62B3CEE436FDB342%21106&authkey=AAF_zwBmWjNhNGQ",
    )
)

# Model

Since the test data has no labels, we'll validate with two subsets of train. Lastly we'll use test to generate a submission file and get a public score.

## Basic Feature Engineering

- Transliterate the categorical text features into Latin script (title and description are left as-is; see below).
- Create dummies from categorical features.

In [3]:
# Summarise the columns of the training frame (dtype and non-null count per column)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503424 entries, 0 to 1503423
Data columns (total 18 columns):
item_id                 1503424 non-null object
user_id                 1503424 non-null object
region                  1503424 non-null object
city                    1503424 non-null object
parent_category_name    1503424 non-null object
category_name           1503424 non-null object
param_1                 1441848 non-null object
param_2                 848882 non-null object
param_3                 640859 non-null object
title                   1503424 non-null object
description             1387148 non-null object
price                   1418062 non-null float64
item_seq_number         1503424 non-null int64
activation_date         1503424 non-null datetime64[ns]
user_type               1503424 non-null object
image                   1390836 non-null object
image_top_1             1390836 non-null float64
deal_probability        1503424 non-null float64
dtypes: datetim

- Title and description have too many unique values, therefore we won't translate them all. 

In [4]:
# Report the number of distinct non-missing values in each text column of train.
unique_count_report = [
    ('Number of unique regions:', 'region'),
    ('Number of unique cities:', 'city'),
    ('Number of unique parent categories:', 'parent_category_name'),
    ('Number of unique categories:', 'category_name'),
    ('Number of unique descriptions:', 'description'),
    ('Number of unique titles:', 'title'),
    ('Number of unique param_1:', 'param_1'),
    ('Number of unique param_2:', 'param_2'),
    ('Number of unique param_3', 'param_3'),
]
for label, column in unique_count_report:
    print(label, train[column].nunique())
del unique_count_report, label, column

Number of unique regions: 28
Number of unique cities: 1733
Number of unique parent categories: 9
Number of unique categories: 47
Number of unique descriptions: 1317102
Number of unique titles: 788377
Number of unique param_1: 371
Number of unique param_2: 271
Number of unique param_3 1219


## Iterative Translation

In [5]:
# Transliterate the categorical Cyrillic columns to Latin script.
# Description and title have too many unique values,
# therefore this method would take too long for them.
cyr_vars = ['region','city','parent_category_name','category_name',
           'param_1','param_2','param_3']

for var in cyr_vars:
    for dataset in [train,test]:
        # Missing values get the placeholder 'Blank' before transliteration
        filled = dataset[var].fillna('Blank')

        # Transliterate every distinct value exactly once ...
        trans_dict = {cyr: cyrtranslit.to_latin(cyr,'ru') for cyr in filled.unique()}

        # ... then map the lookup back onto the rows (vectorised, replaces the
        # per-row Python loop) and swap the Cyrillic column for '<name>_en'
        dataset[str(var)+'_en'] = filled.map(trans_dict)
        dataset.drop(var,axis=1,inplace=True)

# Remove the loop temporaries from the notebook namespace
del filled,trans_dict,dataset,cyr_vars,var

In [6]:
# Preview the first three rows of the last seven columns of each set
# (the '_en' columns appended by the transliteration step)
print('Train Data:')
display(train.head(3).iloc[:, -7:])
print("")
print('Test Data:')
display(test.head(3).iloc[:, -7:])

Train Data:


Unnamed: 0,region_en,city_en,parent_category_name_en,category_name_en,param_1_en,param_2_en,param_3_en
0,Sverdlovskaja oblast',Ekaterinburg,Lichnye veszi,Tovary dlja detej i igrushki,Postel'nye prinadlezhnosti,Blank,Blank
1,Samarskaja oblast',Samara,Dlja doma i dachi,Mebel' i inter'er,Drugoe,Blank,Blank
2,Rostovskaja oblast',Rostov-na-Donu,Bytovaja ehlektronika,Audio i video,"Video, DVD i Blu-ray pleery",Blank,Blank



Test Data:


Unnamed: 0,region_en,city_en,parent_category_name_en,category_name_en,param_1_en,param_2_en,param_3_en
0,Volgogradskaja oblast',Volgograd,Lichnye veszi,Detskaja odezhda i obuv',Dlja mal'chikov,Obuv',25
1,Sverdlovskaja oblast',Nizhnjaja Tura,Hobbi i otdyh,Velosipedy,Dorozhnye,Blank,Blank
2,Novosibirskaja oblast',Berdsk,Bytovaja ehlektronika,Audio i video,Televizory i proektory,Blank,Blank


## Dummies
### City Features
- The number of unique cities in train is too large (1700) and leads to `MemoryError`. Let's pick a subset of those cities. The most useful might be those which appear in both train and test. Cities appearing only in train won't help predict anything in test that regards the city variable, and listings from cities which appear only in test can't be predicted with train information.

In [7]:
# Empty frames, indexed like train and test, that accumulate the selected features
train_features, test_features = (
    pd.DataFrame(index=frame.index) for frame in (train, test)
)

In [8]:
# Find popular common values, create dummies, and get n_features

# Find unique values present in both sets
var = 'city_en'
test_values = set(test[var].unique())
common = [value for value in train[var].unique() if value in test_values]
del test_values
print('Unique comon values:',len(common))

# Find the top popular of those common values.
# isin() does a hashed lookup instead of scanning the list once per row.
train_common = train.loc[train[var].isin(common), var]
test_common = test.loc[test[var].isin(common), var]
# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent
traintest_common = pd.concat([train_common, test_common])
print('N Listings in both sets from unique common values:',traintest_common.shape[0])

# value_counts() sorts by descending frequency, so the first 150 are the most popular
top_common = traintest_common.value_counts().index[:150].tolist()
del common,train_common,test_common,traintest_common

# Make dummies from top common values (columns stay ordered by popularity)
new_features = pd.DataFrame(index=train.index)
for common in top_common:
    new_features[str(common)] = np.where(train[var] == common,1,0)
del top_common
print('New features:',new_features.shape)

Unique comon values: 1625
N Listings in both sets from unique common values: 2011447
New features: (1503424, 150)


In [9]:
# Baseline score without feature selection: 3-fold CV of a plain linear
# regression on all candidate dummies
model = linear_model.LinearRegression()
cv = model_selection.cross_val_score(model,new_features,train.deal_probability,cv=3)
score = np.mean(cv)
print('Score without feature selection:',score)

# Let the number of features depend on the potential score.
# The CV score can be negative, and a negative base under the fractional power
# does not yield a usable count, so clamp it at 0; always keep at least one
# feature so the selectors below get a valid size.
n_features = max(1, int(len(new_features.columns)*(max(score, 0.0)**0.4)))
print('N Features to select:',n_features)

Score without feature selection: 0.00230186731009178
N Features to select: 13


### Compare Feature Selection Techniques
- **Techniques:** Feature Importances, Lasso L1, Ridge, F Regression.
- Notice that feature importances will be fit with less data due to time consumption on tree-based models. It's hard to tell if it'd perform better with full dataset.

In [10]:
# One row per feature-selection technique; the cells below fill in the
# CV score and the fitted selector object
results = pd.DataFrame(
    columns=['Score','Selector'],
    index=['Importances','CoefLasso','CoefRidge','FRegression'],
)

In [11]:
# Score of SelectFromModel on Tree-based feature importances.
# threshold=-inf together with max_features keeps exactly the top n_features.
# random_state makes the tree ensemble (and so the selection) repeatable.
selector1 = feature_selection.SelectFromModel(
    ensemble.ExtraTreesRegressor(random_state=0),
    threshold=-np.inf,
    max_features=n_features)

# Undersample dataset to reduce time: a seeded draw of 1/7 of the rows,
# without replacement so that no row is counted more than once
index = np.random.RandomState(0).choice(len(train),size=int(len(train)/7),replace=False)
selector1.fit(new_features.iloc[index],train.iloc[index].deal_probability)
selection = new_features.iloc[:,selector1.get_support()]

# Score of these features (on the full training set)
cv = model_selection.cross_val_score(model,selection,train.deal_probability,cv=3)
print('Selection by Feature Importances:',np.mean(cv))
results.loc['Importances','Score'] = np.mean(cv)
results.loc['Importances','Selector']=selector1
del index, selection



Selection by Feature Importances: 0.0012934890283679372


In [12]:
# Rank the candidates by Lasso coefficient and keep the top n_features.
# NOTE(review): Lasso() runs with its default alpha; if that shrinks every
# coefficient to zero the ranking carries no information -- worth confirming.
selector2 = feature_selection.SelectFromModel(
    estimator=linear_model.Lasso(),
    max_features=n_features,
    threshold=-np.inf,
).fit(new_features, train.deal_probability)

# Columns retained by the selector, scored with 3-fold CV of the shared model
selection = new_features.loc[:, selector2.get_support()]
cv = model_selection.cross_val_score(model, selection, train.deal_probability, cv=3)

results.loc['CoefLasso', 'Score'] = cv.mean()
results.loc['CoefLasso', 'Selector'] = selector2
print('Selection by Lasso Coefs:', cv.mean())
del selection

Selection by Lasso Coefs: 0.0009250099923582278


In [13]:
# Keep the n_features candidates with the highest univariate f_regression score
selector3 = feature_selection.SelectKBest(
    score_func=feature_selection.f_regression,
    k=n_features,
).fit(new_features, train.deal_probability)

# Columns retained by the selector, scored with 3-fold CV of the shared model
selection = new_features.loc[:, selector3.get_support()]
cv = model_selection.cross_val_score(model, selection, train.deal_probability, cv=3)

results.loc['FRegression', 'Score'] = cv.mean()
results.loc['FRegression', 'Selector'] = selector3
print('Selection by f_regression:', cv.mean())
del selection

Selection by f_regression: 0.0013005899234387701


In [14]:
# Rank the candidates by Ridge coefficient and keep the top n_features
selector4 = feature_selection.SelectFromModel(
    estimator=linear_model.Ridge(),
    max_features=n_features,
    threshold=-np.inf,
).fit(new_features, train.deal_probability)

# Columns retained by the selector, scored with 3-fold CV of the shared model
selection = new_features.loc[:, selector4.get_support()]
cv = model_selection.cross_val_score(model, selection, train.deal_probability, cv=3)

results.loc['CoefRidge', 'Score'] = cv.mean()
results.loc['CoefRidge', 'Selector'] = selector4
print('Selection by Ridge Coefs:', cv.mean())
del selection

Selection by Ridge Coefs: 0.0007214362864883158


In [15]:
# Rank the techniques by CV score (best first), show the table, and keep the
# selector object of the winner
ranked = results.sort_values(by='Score', ascending=False)
display(ranked)
best = ranked['Selector'].iloc[0]
del ranked

Unnamed: 0,Score,Selector
FRegression,0.00130059,"SelectKBest(k=13, score_func=<function f_regre..."
Importances,0.00129349,SelectFromModel(estimator=ExtraTreesRegressor(...
CoefLasso,0.00092501,"SelectFromModel(estimator=Lasso(alpha=1.0, cop..."
CoefRidge,0.000721436,"SelectFromModel(estimator=Ridge(alpha=1.0, cop..."


In [16]:
# Boolean mask over the candidate columns: True where the winning selector kept the feature
best.get_support()

array([ True, False,  True, False,  True, False, False, False, False,
       False,  True,  True, False, False, False,  True, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False,  True, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

- City features are ordered by descending popularity. Surprisingly, the Boolean mask shows many `False` at the beginning of the list. This means the cities with most popularity aren't necessarily the ones that add most predictive power.

In [17]:
# Restrict the candidates to the columns chosen by the winning selector and
# append them to the accumulated train features
new_features = new_features.loc[:, best.get_support()]
train_features = train_features.join(new_features)

# Rebuild the same indicator columns, in the same order, for test
features = list(new_features.columns)
for f in features:
    test_features[str(f)] = np.where(test[var].eq(f), 1, 0)

# The source column is no longer needed in either set
for frame in (train, test):
    frame.drop(var, axis=1, inplace=True)
del frame

In [18]:
# Shapes of the raw sets and of the accumulated feature frames, one per line
print(*(frame.shape for frame in (train, test, train_features, test_features)), sep='\n')

(1503424, 17)
(508438, 16)
(1503424, 13)
(508438, 13)


## Features: Param_1, 2 & 3
- Features from these are challenging because some values repeat in more than one of these variables. Therefore when creating dummies, column names overlap. Adding prefixes helps but some values exist only in train or test set, therefore each set ends with a different number of features. The solution is to gather a custom list of values to create features from.

### Param_1

In [19]:
# Find popular common values, create dummies

# Find unique values present in both sets
var = 'param_1_en'
test_values = set(test[var].unique())
common = [value for value in train[var].unique() if value in test_values]
del test_values
print('Unique comon values:',len(common))

# Find the top popular of those common values.
# isin() does a hashed lookup instead of scanning the list once per row.
train_common = train.loc[train[var].isin(common), var]
test_common = test.loc[test[var].isin(common), var]
# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent
traintest_common = pd.concat([train_common, test_common])
print('N Listings in both sets from unique common values:',traintest_common.shape[0])

# value_counts() sorts by descending frequency, so the first 150 are the most popular
top_common = traintest_common.value_counts().index[:150].tolist()
del common,train_common,test_common,traintest_common

# Make dummies from top common values (columns stay ordered by popularity)
new_features = pd.DataFrame(index=train.index)
for common in top_common:
    new_features[str(common)] = np.where(train[var] == common,1,0)
del top_common
print('New features:',new_features.shape)

Unique comon values: 362
N Listings in both sets from unique common values: 2011838
New features: (1503424, 150)


In [20]:
# Baseline: 3-fold CV of the shared linear model on all candidate dummies
cv = model_selection.cross_val_score(model,new_features,train.deal_probability,cv=3)
score = np.mean(cv)
print('Score without feature selection:',score)

# Let the number of features depend on the potential score.
# The CV score can be negative, and a negative base under the fractional power
# does not yield a usable count, so clamp it at 0; always keep at least one feature.
n_features = max(1, int(len(new_features.columns)*(max(score, 0.0)**0.4)))
print('N Features to select:',n_features)

Score without feature selection: 0.15360099276817882
N Features to select: 70


In [21]:
# Score of SelectFromModel on Tree-based feature importances
selector1.max_features=n_features
# Undersample to reduce time: a seeded draw of 1/7 of the rows, without
# replacement so that no row is counted more than once
index = np.random.RandomState(0).choice(len(train),size=int(len(train)/7),replace=False)
selector1.fit(new_features.iloc[index],train.iloc[index].deal_probability)
selection = new_features.iloc[:,selector1.get_support()]
# Score the retained columns on the full training set
cv = model_selection.cross_val_score(model,selection,train.deal_probability,cv=3)
print('Selection by Feature Importances:',np.mean(cv))
results.loc['Importances','Score'] = np.mean(cv)
results.loc['Importances','Selector']=selector1
del index, selection



Selection by Feature Importances: 0.1497378230196806


In [22]:
# Re-fit the Lasso-based selector on the current candidates, keeping n_features columns
selector2.set_params(max_features=n_features).fit(new_features, train.deal_probability)

# Columns retained by the selector, scored with 3-fold CV of the shared model
selection = new_features.loc[:, selector2.get_support()]
cv = model_selection.cross_val_score(model, selection, train.deal_probability, cv=3)

results.loc['CoefLasso', 'Score'] = cv.mean()
results.loc['CoefLasso', 'Selector'] = selector2
print('Selection by Coefs:', cv.mean())
del selection

Selection by Coefs: 0.1354066173414947


In [23]:
# Re-fit the f_regression selector on the current candidates, keeping n_features columns
selector3.set_params(k=n_features).fit(new_features, train.deal_probability)

# Columns retained by the selector, scored with 3-fold CV of the shared model
selection = new_features.loc[:, selector3.get_support()]
cv = model_selection.cross_val_score(model, selection, train.deal_probability, cv=3)

results.loc['FRegression', 'Score'] = cv.mean()
results.loc['FRegression', 'Selector'] = selector3
print('Selection by f_regression:', cv.mean())
del selection

Selection by f_regression: 0.14950788474007837


In [24]:
# Re-fit the Ridge-based selector on the current candidates, keeping n_features columns
selector4.set_params(max_features=n_features).fit(new_features, train.deal_probability)

# Columns retained by the selector, scored with 3-fold CV of the shared model
selection = new_features.loc[:, selector4.get_support()]
cv = model_selection.cross_val_score(model, selection, train.deal_probability, cv=3)

results.loc['CoefRidge', 'Score'] = cv.mean()
results.loc['CoefRidge', 'Selector'] = selector4
print('Selection by Ridge Coefs:', cv.mean())
del selection

Selection by Ridge Coefs: 0.13924541531070386


In [25]:
# Rank the techniques by CV score (best first), show the table, and keep the
# selector object of the winner
ranked = results.sort_values(by='Score', ascending=False)
display(ranked)
best = ranked['Selector'].iloc[0]
del ranked

Unnamed: 0,Score,Selector
Importances,0.149738,SelectFromModel(estimator=ExtraTreesRegressor(...
FRegression,0.149508,"SelectKBest(k=70, score_func=<function f_regre..."
CoefRidge,0.139245,"SelectFromModel(estimator=Ridge(alpha=1.0, cop..."
CoefLasso,0.135407,"SelectFromModel(estimator=Lasso(alpha=1.0, cop..."


- Extra trees regressor gave the best set of features.
- Compared to the city variable, `param_1` has very strong predictive power. On its own it yields a cross-validated R² of ~0.15, i.e. it explains roughly 15% of the variance in deal probability.
- By selecting 70 out of 150 features, the score only drops from ~0.154 to ~0.150.

In [26]:
# Boolean mask over the candidate columns: True where the winning selector kept the feature
best.get_support()

array([ True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False, False,  True, False,  True,  True,  True,  True, False,
        True,  True, False,  True, False,  True, False,  True, False,
        True,  True, False,  True, False, False, False, False, False,
       False,  True,  True, False, False,  True, False, False, False,
        True, False,  True, False, False, False, False, False,  True,
        True,  True,  True, False, False, False, False, False, False,
       False,  True,  True,  True, False,  True, False,  True, False,
       False, False, False,  True,  True, False, False, False, False,
        True, False, False, False,  True,  True, False,  True,  True,
       False, False, False, False, False, False,  True, False, False,
       False, False,

- `True`s clearly cluster near the beginning of the list. Since features are ordered by popularity, high variance correlates with feature importance. However, the presence of some `True`s near the end of the list shows that some important features also had very low variance.
- Note that popularity isn't always associated with feature importance. For the city variable, there were many more unpopular cities selected as important.

In [27]:
# Restrict the candidates to the columns chosen by the winning selector and
# append them to the accumulated train features
new_features = new_features.loc[:, best.get_support()]
train_features = train_features.join(new_features)

# Rebuild the same indicator columns, in the same order, for test
features = list(new_features.columns)
for f in features:
    test_features[str(f)] = np.where(test[var].eq(f), 1, 0)

In [28]:
# Shapes of the raw sets and of the accumulated feature frames, one per line
print(*(frame.shape for frame in (train, test, train_features, test_features)), sep='\n')

(1503424, 17)
(508438, 16)
(1503424, 83)
(508438, 83)


In [29]:
# The current source column has been encoded; remove it from both sets
for frame in (train, test):
    frame.drop(var, axis=1, inplace=True)
del frame

### Param_2

In [30]:
# Find popular common values, create dummies

# Find unique values present in both sets
var = 'param_2_en'
test_values = set(test[var].unique())
common = [value for value in train[var].unique() if value in test_values]
del test_values
print('Unique comon values:',len(common))

# Find the top popular of those common values.
# isin() does a hashed lookup instead of scanning the list once per row.
train_common = train.loc[train[var].isin(common), var]
test_common = test.loc[test[var].isin(common), var]
# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent
traintest_common = pd.concat([train_common, test_common])
print('N Listings in both sets from unique common values:',traintest_common.shape[0])

# value_counts() sorts by descending frequency, so the first 150 are the most popular
top_common = traintest_common.value_counts().index[:150].tolist()
del common,train_common,test_common,traintest_common

# Make dummies from top common values (columns stay ordered by popularity)
new_features = pd.DataFrame(index=train.index)
for common in top_common:
    new_features[str(common)] = np.where(train[var] == common,1,0)
del top_common
print('New features:',new_features.shape)

Unique comon values: 250
N Listings in both sets from unique common values: 2011801
New features: (1503424, 150)


In [31]:
# Baseline: 3-fold CV of the shared linear model on all candidate dummies
cv = model_selection.cross_val_score(model,new_features,train.deal_probability,cv=3)
score = np.mean(cv)
print('Score without feature selection:',score)

# Let the number of features depend on the potential score.
# The CV score can be negative, and a negative base under the fractional power
# does not yield a usable count, so clamp it at 0; always keep at least one feature.
n_features = max(1, int(len(new_features.columns)*(max(score, 0.0)**0.4)))
print('N Features to select:',n_features)

Score without feature selection: 0.12048693867067557
N Features to select: 64


In [32]:
# Score of SelectFromModel on Tree-based feature importances
selector1.max_features=n_features
# Undersample to reduce time: a seeded draw of 1/7 of the rows, without
# replacement so that no row is counted more than once
index = np.random.RandomState(0).choice(len(train),size=int(len(train)/7),replace=False)
selector1.fit(new_features.iloc[index],train.iloc[index].deal_probability)
selection = new_features.iloc[:,selector1.get_support()]
# Score the retained columns on the full training set
cv = model_selection.cross_val_score(model,selection,train.deal_probability,cv=3)
print('Selection by Feature Importances:',np.mean(cv))
results.loc['Importances','Score'] = np.mean(cv)
results.loc['Importances','Selector']=selector1
del index, selection



Selection by Feature Importances: 0.11498425859927286


In [33]:
# Re-fit the Lasso-based selector on the current candidates, keeping n_features columns
selector2.set_params(max_features=n_features).fit(new_features, train.deal_probability)

# Columns retained by the selector, scored with 3-fold CV of the shared model
selection = new_features.loc[:, selector2.get_support()]
cv = model_selection.cross_val_score(model, selection, train.deal_probability, cv=3)

results.loc['CoefLasso', 'Score'] = cv.mean()
results.loc['CoefLasso', 'Selector'] = selector2
print('Selection by Coefs:', cv.mean())
del selection

Selection by Coefs: 0.10801743541266817


In [34]:
# Re-fit the f_regression selector on the current candidates, keeping n_features columns
selector3.set_params(k=n_features).fit(new_features, train.deal_probability)

# Columns retained by the selector, scored with 3-fold CV of the shared model
selection = new_features.loc[:, selector3.get_support()]
cv = model_selection.cross_val_score(model, selection, train.deal_probability, cv=3)

results.loc['FRegression', 'Score'] = cv.mean()
results.loc['FRegression', 'Selector'] = selector3
print('Selection by f_regression:', cv.mean())
del selection

Selection by f_regression: 0.1155751083634029


In [35]:
# Re-fit the Ridge-based selector on the current candidates, keeping n_features columns
selector4.set_params(max_features=n_features).fit(new_features, train.deal_probability)

# Columns retained by the selector, scored with 3-fold CV of the shared model
selection = new_features.loc[:, selector4.get_support()]
cv = model_selection.cross_val_score(model, selection, train.deal_probability, cv=3)

results.loc['CoefRidge', 'Score'] = cv.mean()
results.loc['CoefRidge', 'Selector'] = selector4
print('Selection by Ridge Coefs:', cv.mean())
del selection

Selection by Ridge Coefs: 0.09184644501403638


In [36]:
# Rank the techniques by CV score (best first), show the table, and keep the
# selector object of the winner
ranked = results.sort_values(by='Score', ascending=False)
display(ranked)
best = ranked['Selector'].iloc[0]
del ranked

Unnamed: 0,Score,Selector
FRegression,0.115575,"SelectKBest(k=64, score_func=<function f_regre..."
Importances,0.114984,SelectFromModel(estimator=ExtraTreesRegressor(...
CoefLasso,0.108017,"SelectFromModel(estimator=Lasso(alpha=1.0, cop..."
CoefRidge,0.0918464,"SelectFromModel(estimator=Ridge(alpha=1.0, cop..."


In [47]:
# Add new features to train_features.
# A value can also exist under a previously encoded variable, so column names
# may clash. rsuffix appends `var` to the incoming (current-variable) column;
# lsuffix would instead rename the column already stored from the earlier
# variable, tagging it with the wrong variable name.
new_features = new_features.loc[:,best.get_support()]
train_features = train_features.join(new_features,rsuffix=var)

# Add same features to test, built in the same order so both frames end up
# with identical column names
features = new_features.columns.tolist()
new_features = pd.DataFrame(index=test.index)
for f in features:
    new_features[str(f)] = np.where(test[var] == f,1,0)

test_features = test_features.join(new_features,rsuffix=var)

In [53]:
# Shapes of the raw sets and of the accumulated feature frames, one per line
print(*(frame.shape for frame in (train, test, train_features, test_features)), sep='\n')

(1503424, 16)
(508438, 15)
(1503424, 147)
(508438, 147)


In [54]:
# The current source column has been encoded; remove it from both sets
for frame in (train, test):
    frame.drop(var, axis=1, inplace=True)
del frame

### Param_3

In [55]:
# Find popular common values, create dummies

# Find unique values present in both sets
var = 'param_3_en'
test_values = set(test[var].unique())
common = [value for value in train[var].unique() if value in test_values]
del test_values
print('Unique comon values:',len(common))

# Find the top popular of those common values.
# isin() does a hashed lookup instead of scanning the list once per row.
train_common = train.loc[train[var].isin(common), var]
test_common = test.loc[test[var].isin(common), var]
# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent
traintest_common = pd.concat([train_common, test_common])
print('N Listings in both sets from unique common values:',traintest_common.shape[0])

# value_counts() sorts by descending frequency, so the first 150 are the most popular
top_common = traintest_common.value_counts().index[:150].tolist()
del common,train_common,test_common,traintest_common

# Make dummies from top common values (columns stay ordered by popularity)
new_features = pd.DataFrame(index=train.index)
for common in top_common:
    new_features[str(common)] = np.where(train[var] == common,1,0)
del top_common
print('New features:',new_features.shape)

Unique comon values: 917
N Listings in both sets from unique common values: 2011112
New features: (1503424, 150)


In [56]:
# Baseline: 3-fold CV of the shared linear model on all candidate dummies
cv = model_selection.cross_val_score(model,new_features,train.deal_probability,cv=3)
score = np.mean(cv)
print('Score without feature selection:',score)

# Let the number of features depend on the potential score.
# The CV score can be negative, and a negative base under the fractional power
# does not yield a usable count, so clamp it at 0; always keep at least one feature.
n_features = max(1, int(len(new_features.columns)*(max(score, 0.0)**0.4)))
print('N Features to select:',n_features)

Score without feature selection: 0.06968941294661561
N Features to select: 51


In [57]:
# Score of SelectFromModel on Tree-based feature importances
selector1.max_features=n_features
# Undersample to reduce time: a seeded draw of 1/7 of the rows, without
# replacement so that no row is counted more than once
index = np.random.RandomState(0).choice(len(train),size=int(len(train)/7),replace=False)
selector1.fit(new_features.iloc[index],train.iloc[index].deal_probability)
selection = new_features.iloc[:,selector1.get_support()]
# Score the retained columns on the full training set
cv = model_selection.cross_val_score(model,selection,train.deal_probability,cv=3)
print('Selection by Feature Importances:',np.mean(cv))
results.loc['Importances','Score'] = np.mean(cv)
results.loc['Importances','Selector']=selector1
del index, selection



Selection by Feature Importances: 0.061496034688184285


In [58]:
# Re-fit the Lasso-based selector on the current candidates, keeping n_features columns
selector2.set_params(max_features=n_features).fit(new_features, train.deal_probability)

# Columns retained by the selector, scored with 3-fold CV of the shared model
selection = new_features.loc[:, selector2.get_support()]
cv = model_selection.cross_val_score(model, selection, train.deal_probability, cv=3)

results.loc['CoefLasso', 'Score'] = cv.mean()
results.loc['CoefLasso', 'Selector'] = selector2
print('Selection by Coefs:', cv.mean())
del selection

Selection by Coefs: 0.061040005494917914


In [59]:
# Re-fit the f_regression selector on the current candidates, keeping n_features columns
selector3.set_params(k=n_features).fit(new_features, train.deal_probability)

# Columns retained by the selector, scored with 3-fold CV of the shared model
selection = new_features.loc[:, selector3.get_support()]
cv = model_selection.cross_val_score(model, selection, train.deal_probability, cv=3)

results.loc['FRegression', 'Score'] = cv.mean()
results.loc['FRegression', 'Selector'] = selector3
print('Selection by f_regression:', cv.mean())
del selection

Selection by f_regression: 0.06195594433812276


In [60]:
# Re-fit the Ridge-based selector on the current candidates, keeping n_features columns
selector4.set_params(max_features=n_features).fit(new_features, train.deal_probability)

# Columns retained by the selector, scored with 3-fold CV of the shared model
selection = new_features.loc[:, selector4.get_support()]
cv = model_selection.cross_val_score(model, selection, train.deal_probability, cv=3)

results.loc['CoefRidge', 'Score'] = cv.mean()
results.loc['CoefRidge', 'Selector'] = selector4
print('Selection by Ridge Coefs:', cv.mean())
del selection

Selection by Ridge Coefs: 0.05178232894862874


In [61]:
# Rank the techniques by CV score (best first), show the table, and keep the
# selector object of the winner
ranked = results.sort_values(by='Score', ascending=False)
display(ranked)
best = ranked['Selector'].iloc[0]
del ranked

Unnamed: 0,Score,Selector
FRegression,0.0619559,"SelectKBest(k=51, score_func=<function f_regre..."
Importances,0.061496,SelectFromModel(estimator=ExtraTreesRegressor(...
CoefLasso,0.06104,"SelectFromModel(estimator=Lasso(alpha=1.0, cop..."
CoefRidge,0.0517823,"SelectFromModel(estimator=Ridge(alpha=1.0, cop..."


In [62]:
# Boolean mask over the candidate columns: True where the winning selector kept the feature
best.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True, False,  True,
       False,  True, False,  True, False, False, False,  True,  True,
        True, False,  True, False, False,  True,  True, False, False,
        True, False, False, False, False, False, False,  True, False,
       False, False,  True,  True,  True, False,  True, False, False,
       False,  True, False,  True, False,  True, False,  True,  True,
        True, False,  True, False,  True, False, False, False, False,
       False,  True, False, False, False, False, False,  True, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [63]:
# Add new features to train_features.
# A value can also exist under a previously encoded variable, so column names
# may clash. rsuffix appends `var` to the incoming (current-variable) column;
# lsuffix would instead rename the column already stored from the earlier
# variable, tagging it with the wrong variable name.
new_features = new_features.loc[:,best.get_support()]
train_features = train_features.join(new_features,rsuffix=var)

# Add same features to test, built in the same order so both frames end up
# with identical column names
features = new_features.columns.tolist()
new_features = pd.DataFrame(index=test.index)
for f in features:
    new_features[str(f)] = np.where(test[var] == f,1,0)

test_features = test_features.join(new_features,rsuffix=var)

In [64]:
# Shapes of the raw sets and of the accumulated feature frames, one per line
print(*(frame.shape for frame in (train, test, train_features, test_features)), sep='\n')

(1503424, 15)
(508438, 14)
(1503424, 198)
(508438, 198)


In [65]:
# The current source column has been encoded; remove it from both sets
for frame in (train, test):
    frame.drop(var, axis=1, inplace=True)
del frame

### Features: Region, Parent Category, Category
- Since these have very few unique values compared to `city` or `param_x`, I'll run feature selection on all of them together.

In [66]:
# Count the distinct levels of each remaining categorical variable in train and test.
# NOTE(review): the recorded output below lists only three variables, so
# 'user_type' appears to have been added after the last run -- re-run to refresh.
cat_vars = ['region_en','parent_category_name_en','category_name_en','user_type']
for var in cat_vars:
    for header, frame in (('\nTRAIN', train), ('TEST', test)):
        print(header)
        print('Unique values for {}: '.format(var), np.unique(frame[var]).size)
del header, frame


TRAIN
Unique values for region_en:  28
TEST
Unique values for region_en:  28

TRAIN
Unique values for parent_category_name_en:  9
TEST
Unique values for parent_category_name_en:  9

TRAIN
Unique values for category_name_en:  47
TEST
Unique values for category_name_en:  47


- Train and test have the same number of unique values for each of these variables, so we skip the search for common values.

In [67]:
# One indicator column per level of every variable in cat_vars; the levels are
# read from test and the indicators are computed on train.
# NOTE(review): columns are named by the bare level, so a level shared by two
# variables would overwrite the earlier column -- confirm none overlap.
new_features = pd.DataFrame(index=train.index)

for var in cat_vars:
    levels = np.unique(test[var])
    for f in levels:
        new_features[str(f)] = np.where(train[var].eq(f), 1, 0)
del levels

print('New features:', new_features.shape)

New features: (1503424, 84)


In [68]:
# Baseline: 3-fold CV of the shared linear model on all candidate dummies
cv = model_selection.cross_val_score(model,new_features,train.deal_probability,cv=3)
score = np.mean(cv)
print('Score without feature selection:',score)

# Let the number of features depend on the potential score.
# The CV score can be negative, and a negative base under the fractional power
# does not yield a usable count, so clamp it at 0; always keep at least one feature.
n_features = max(1, int(len(new_features.columns)*(max(score, 0.0)**0.4)))
print('N Features to select:',n_features)

Score without feature selection: 0.12360141779556955
N Features to select: 36


In [69]:
# Score of SelectFromModel on Tree-based feature importances
selector1.max_features=n_features
# Undersample to reduce time: a seeded draw of 1/7 of the rows, without
# replacement so that no row is counted more than once
index = np.random.RandomState(0).choice(len(train),size=int(len(train)/7),replace=False)
selector1.fit(new_features.iloc[index],train.iloc[index].deal_probability)
selection = new_features.iloc[:,selector1.get_support()]
# Score the retained columns on the full training set
cv = model_selection.cross_val_score(model,selection,train.deal_probability,cv=3)
print('Selection by Feature Importances:',np.mean(cv))
results.loc['Importances','Score'] = np.mean(cv)
results.loc['Importances','Selector']=selector1
del index, selection



Selection by Feature Importances: 0.12155461164760051


In [70]:
# Re-fit the Lasso-based selector on the current candidates, keeping n_features columns
selector2.set_params(max_features=n_features).fit(new_features, train.deal_probability)

# Columns retained by the selector, scored with 3-fold CV of the shared model
selection = new_features.loc[:, selector2.get_support()]
cv = model_selection.cross_val_score(model, selection, train.deal_probability, cv=3)

results.loc['CoefLasso', 'Score'] = cv.mean()
results.loc['CoefLasso', 'Selector'] = selector2
print('Selection by Coefs:', cv.mean())
del selection

Selection by Coefs: 0.09474318099230832


In [71]:
# Score of SelectKBest from f_regression
selector3.k=n_features
selector3.fit(new_features,train.deal_probability)
selection = new_features.iloc[:,selector3.get_support()]
cv = model_selection.cross_val_score(model,selection,train.deal_probability,cv=3)
print('Selection by f_regression:',np.mean(cv))
results.loc['FRegression','Score'] = np.mean(cv)
results.loc['FRegression','Selector']=selector3
del selection

Selection by f_regression: 0.1224656102249962


In [72]:
# Score of SelectFromModel from Ridge coefs
selector4.max_features=n_features
selector4.fit(new_features,train.deal_probability)
selection = new_features.iloc[:,selector4.get_support()]
cv = model_selection.cross_val_score(model,selection,train.deal_probability,cv=3)
print('Selection by Ridge Coefs:',np.mean(cv))
results.loc['CoefRidge','Score'] = np.mean(cv)
results.loc['CoefRidge','Selector']=selector4
del selection

Selection by Ridge Coefs: 0.1221920692494769


In [73]:
# Rank the selectors by CV score (highest first), show the table,
# and keep the selector stored in the top-ranked row
ranked = results.sort_values(by='Score', ascending=False)
display(ranked)
best = ranked['Selector'].iloc[0]

Unnamed: 0,Score,Selector
FRegression,0.122466,"SelectKBest(k=36, score_func=<function f_regre..."
CoefRidge,0.122192,"SelectFromModel(estimator=Ridge(alpha=1.0, cop..."
Importances,0.121555,SelectFromModel(estimator=ExtraTreesRegressor(...
CoefLasso,0.0947432,"SelectFromModel(estimator=Lasso(alpha=1.0, cop..."


In [74]:
# Boolean mask over the dummy columns reported by the best-scoring selector (True = kept)
best.get_support()

array([False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False,  True, False,  True,  True,  True, False,  True,  True,
        True, False,  True,  True, False,  True,  True,  True, False,
        True, False, False, False, False,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True, False, False,
        True, False,  True, False, False, False, False,  True, False,
        True, False, False,  True, False,  True,  True,  True, False,
        True,  True,  True])

In [75]:
# Add new features to train_features
new_features = new_features.loc[:,best.get_support()]
train_features = train_features.join(new_features,lsuffix=var)

# Add same features to test
features = new_features.columns.tolist()
new_features = pd.DataFrame(index=test.index)
for f in features:
    new_features[str(f)] = np.where(test[var] == f,1,0)

test_features = test_features.join(new_features,lsuffix=var)

In [76]:
# Sanity check: shapes of the raw frames, then of the engineered feature frames
for frame in (train, test, train_features, test_features):
    print(frame.shape)

(1503424, 14)
(508438, 13)
(1503424, 234)
(508438, 234)


In [78]:
# Remove the categorical source columns from both raw frames (in place)
for var in cat_vars:
    for frame in (train, test):
        frame.drop(columns=var, inplace=True)

- Let's take a look at the last features added

In [87]:
# Names of the selected dummy columns that were joined onto train_features / test_features
features

['Krasnojarskij kraj',
 "Sverdlovskaja oblast'",
 'Bytovaja ehlektronika',
 'Dlja doma i dachi',
 'Hobbi i otdyh',
 'Lichnye veszi',
 'Transport',
 'Uslugi',
 'ZHivotnye',
 'Audio i video',
 'Avtomobili',
 'Bytovaja tehnika',
 'CHasy i ukrashenija',
 "Detskaja odezhda i obuv'",
 'Drugie zhivotnye',
 'Igry, pristavki i programmy',
 'Knigi i zhurnaly',
 'Kollekcionirovanie',
 'Komnaty',
 'Koshki',
 "Krasota i zdorov'e",
 'Kvartiry',
 "Mebel' i inter'er",
 'Motocikly i mototehnika',
 "Muzykal'nye instrumenty",
 'Noutbuki',
 "Odezhda, obuv', aksessuary",
 'Predlozhenie uslug',
 'Pticy',
 'Sobaki',
 'Telefony',
 'Tovary dlja detej i igrushki',
 "Tovary dlja komp'jutera",
 'Velosipedy',
 'Vodnyj transport',
 "Zemel'nye uchastki"]

- It seems like only two regions (Krasnojarskij kraj and Sverdlovskaja oblast') were important enough to be selected.
- The rest of the selected features are all product categories. Some include telephones, other animals, transportation, electronics, automobiles, etc.

# End

- The following columns will be worked on in a different notebook

In [80]:
# Columns left in train once the categorical variables have been dropped
train.columns

Index(['item_id', 'user_id', 'title', 'description', 'price',
       'item_seq_number', 'activation_date', 'user_type', 'image',
       'image_top_1', 'deal_probability'],
      dtype='object')

In [77]:
# Persist the engineered feature frames as zip-compressed pickles
for frame, path in (
    (train_features, 'catfeatures_train.pkl'),
    (test_features, 'catfeatures_test.pkl'),
):
    frame.to_pickle(path, compression='zip')

In [178]:
# Reload the engineered feature frames from their zip-compressed pickles
train_features, test_features = (
    pd.read_pickle(path, compression='zip')
    for path in ('catfeatures_train.pkl', 'catfeatures_test.pkl')
)

In [93]:
# Persist the remaining raw frames as zip-compressed pickles
for frame, path in (
    (train, 'train.pkl'),
    (test, 'test.pkl'),
):
    frame.to_pickle(path, compression='zip')

In [88]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# List (name, size) for every global that is not private, not an imported
# module name and not one of the ipython objects above, largest first.
# Sizes are whatever sys.getsizeof reports for each object.
sorted(
    (
        (name, sys.getsizeof(globals().get(name)))
        for name in dir()
        if not (
            name.startswith('_')
            or name in sys.modules
            or name in ipython_vars
        )
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

[('train_features', 2814409832),
 ('train', 1378279207),
 ('test_features', 951796040),
 ('test', 478668977),
 ('new_features', 36082280),
 ('Pipeline', 1056),
 ('results', 772),
 ('features', 352),
 ('color', 176),
 ('cv', 120),
 ('cat_vars', 88),
 ('ensemble', 80),
 ('feature_selection', 80),
 ('lgb', 80),
 ('linear_model', 80),
 ('metrics', 80),
 ('model_selection', 80),
 ('np', 80),
 ('pd', 80),
 ('plt', 80),
 ('preprocessing', 80),
 ('sns', 80),
 ('common', 73),
 ('f', 67),
 ('var', 58),
 ('best', 56),
 ('model', 56),
 ('selector1', 56),
 ('selector2', 56),
 ('selector3', 56),
 ('selector4', 56),
 ('key', 54),
 ('score', 32),
 ('n_features', 24)]

## Add more Features

- (Feature) has description
- (Feature) has photo
- (Feature) has param 1,2,3
- (Feature) has price
- (Feature) word count in title and description
- (Feature) population of region
- (Feature) whether a string value is unique (title, description, param_1, etc.)

## How much can we predict with only categorical features?

In [92]:
# 3-fold cross-validation of the model on the features built so far
cv = model_selection.cross_val_score(
    model,
    train_features,
    train.deal_probability,
    cv=3,
)
score = cv.mean()
print('Score with current features:',score)

Score with current features: 0.16618823247004413
