# Intro

In [48]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import cyrtranslit
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model
from sklearn.pipeline import Pipeline
import lightgbm as lgb
color = sns.color_palette()
%matplotlib inline

In [3]:
# Zipped competition CSVs hosted on OneDrive; pandas unzips them while reading.
# NOTE(review): both links embed an `authkey` query token - confirm it is meant
# to be public before sharing this notebook.
TRAIN_URL = "https://onedrive.live.com/download?cid=62B3CEE436FDB342&resid=62B3CEE436FDB342%21107&authkey=AEh-8Y6p9SC7FK0"
TEST_URL = "https://onedrive.live.com/download?cid=62B3CEE436FDB342&resid=62B3CEE436FDB342%21106&authkey=AAF_zwBmWjNhNGQ"

# Both files are read the same way: zip-compressed, activation_date parsed as datetime.
read_options = dict(compression='zip', parse_dates=["activation_date"])

train = pd.read_csv(TRAIN_URL, **read_options)
test = pd.read_csv(TEST_URL, **read_options)

# Model

Since the test data has no labels, we'll validate with two subsets of train. Lastly we'll use test to generate a submission file and get a public score.

## Basic Feature Engineering

- Transliterate the categorical text features from Cyrillic into Latin script (title and description are left as they are).
- Create dummies from categorical features.

In [4]:
# Print each column's non-null count and dtype to see which columns have missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503424 entries, 0 to 1503423
Data columns (total 18 columns):
item_id                 1503424 non-null object
user_id                 1503424 non-null object
region                  1503424 non-null object
city                    1503424 non-null object
parent_category_name    1503424 non-null object
category_name           1503424 non-null object
param_1                 1441848 non-null object
param_2                 848882 non-null object
param_3                 640859 non-null object
title                   1503424 non-null object
description             1387148 non-null object
price                   1418062 non-null float64
item_seq_number         1503424 non-null int64
activation_date         1503424 non-null datetime64[ns]
user_type               1503424 non-null object
image                   1390836 non-null object
image_top_1             1390836 non-null float64
deal_probability        1503424 non-null float64
dtypes: datetim

- Title and description have too many unique values, therefore we won't translate them all. 

In [5]:
# (label, column) pairs; each label is the exact text printed before the count.
unique_count_labels = [
    ('Number of unique regions:', 'region'),
    ('Number of unique cities:', 'city'),
    ('Number of unique parent categories:', 'parent_category_name'),
    ('Number of unique categories:', 'category_name'),
    ('Number of unique descriptions:', 'description'),
    ('Number of unique titles:', 'title'),
    ('Number of unique param_1:', 'param_1'),
    ('Number of unique param_2:', 'param_2'),
    ('Number of unique param_3', 'param_3'),
]

for label, column in unique_count_labels:
    # nunique() ignores missing values, exactly like value_counts() does by default.
    print(label, train[column].nunique())

Number of unique regions: 28
Number of unique cities: 1733
Number of unique parent categories: 9
Number of unique categories: 47
Number of unique descriptions: 1317102
Number of unique titles: 788377
Number of unique param_1: 371
Number of unique param_2: 271
Number of unique param_3 1219


## Iterative Translation

In [6]:
# Only the low-cardinality categorical columns are transliterated here.
# Title and description have far too many distinct values for this
# per-value approach to finish in reasonable time.
cyr_vars = ['region', 'city', 'parent_category_name', 'category_name',
            'param_1', 'param_2', 'param_3']

for var in cyr_vars:
    for dataset in (train, test):
        # Missing entries become the literal 'Blank' before transliteration.
        filled = dataset[var].fillna('Blank')

        # Transliterate every distinct Cyrillic value once and keep the lookup.
        trans_dict = {
            cyr: cyrtranslit.to_latin(cyr, 'ru')
            for cyr in np.unique(filled).tolist()
        }

        # Replace the original column with its Latin-script '<name>_en' version.
        dataset[str(var) + '_en'] = filled.map(trans_dict)
        dataset.drop(var, axis=1, inplace=True)

# Remove the loop temporaries from the notebook namespace.
del filled, trans_dict, dataset, cyr_vars, var

In [7]:
# Show the first three rows of the seven transliterated columns in each set,
# separated by one blank line.
for position, (label, frame) in enumerate([('Train Data:', train), ('Test Data:', test)]):
    if position:
        print("")
    print(label)
    display(frame.iloc[:3, -7:])

Train Data:


Unnamed: 0,region_en,city_en,parent_category_name_en,category_name_en,param_1_en,param_2_en,param_3_en
0,Sverdlovskaja oblast',Ekaterinburg,Lichnye veszi,Tovary dlja detej i igrushki,Postel'nye prinadlezhnosti,Blank,Blank
1,Samarskaja oblast',Samara,Dlja doma i dachi,Mebel' i inter'er,Drugoe,Blank,Blank
2,Rostovskaja oblast',Rostov-na-Donu,Bytovaja ehlektronika,Audio i video,"Video, DVD i Blu-ray pleery",Blank,Blank



Test Data:


Unnamed: 0,region_en,city_en,parent_category_name_en,category_name_en,param_1_en,param_2_en,param_3_en
0,Volgogradskaja oblast',Volgograd,Lichnye veszi,Detskaja odezhda i obuv',Dlja mal'chikov,Obuv',25
1,Sverdlovskaja oblast',Nizhnjaja Tura,Hobbi i otdyh,Velosipedy,Dorozhnye,Blank,Blank
2,Novosibirskaja oblast',Berdsk,Bytovaja ehlektronika,Audio i video,Televizory i proektory,Blank,Blank


## Dummies
### City Features
- The number of unique cities in train is too large (about 1,700), and creating a dummy for each one leads to a `MemoryError`, so let's pick a subset. The most useful cities are probably those that appear in both train and test: a city that appears only in train tells the model nothing about any test listing, and a city that appears only in test has no training examples to learn from.

In [8]:
# Cities that occur in both the train and the test set.
# A set intersection avoids scanning the test array once per train city.
common = set(train.city_en.unique()) & set(test.city_en.unique())

print('Unique comon cities:',len(common))

# Create features from the most popular cities in the common set.

# Listings (from either set) whose city belongs to the shared set.
# `isin` is vectorised, unlike a per-row `x in list` lookup.
train_common = train.loc[train.city_en.isin(common), 'city_en']
test_common = test.loc[test.city_en.isin(common), 'city_en']

# Stack both sets. Series.append was removed in pandas 2.0; pd.concat
# produces the same stacked Series.
traintest_common = pd.concat([train_common, test_common])
print('N Listings in both sets from unique common cities:',traintest_common.shape[0])

# value_counts() sorts by frequency, so the first 150 entries are the
# 150 most common shared cities.
top_common = traintest_common.value_counts().head(150).index.tolist()
del common,train_common,test_common,traintest_common

Unique comon cities: 1625
N Listings in both sets from unique common cities: 2011447


- 1625 is still too many cities for feature purposes. Let's find which of those are the most popular among both sets. 

In [9]:
# Empty frame, indexed like train, that collects the machine learning features.
train_features = pd.DataFrame(index=train.index)
# NOTE(review): the matching test frame is disabled, yet a later cell joins
# dummies onto `test_features` - re-enable this line if that cell is kept.
#test_features = pd.DataFrame(index=test.index)

In [10]:
# Drop the test frame from the namespace.
# NOTE(review): later cells (dummy creation, label encoding) still read `test`
# and will raise NameError unless it is loaded again first.
del test

In [11]:
# One 0/1 indicator column per selected city, named after the city.
listing_city = train.city_en
for city in top_common:
    train_features[str(city)] = np.where(listing_city.eq(city), 1, 0)

del top_common, listing_city

### Feature Selection of Cities

### Variance Threshold

In [49]:
# Count the city indicators whose variance exceeds 0.01. Only the fitted
# support mask is needed, so the reduced matrix itself is never built.
selector = feature_selection.VarianceThreshold(threshold=0.01)
selector.fit(train_features)
n_features = int(selector.get_support().sum())
print('Number of Features above 0.01 Variance:', n_features)

Number of Features above 0.01 Variance: 29


- Variance Threshold doesn't take the target variable into account. At a variance threshold of 0.01 it keeps only 29 of the 150 city features.

### Compare Feature Selection Techniques
- Feature Importances, Lasso L1, F Regression

In [53]:
# Three candidate selectors, each keeping exactly `n_features` city columns so
# the comparison with the variance-threshold result is like-for-like.

# 1) Tree-based feature importances.
estimator1 = ensemble.ExtraTreesRegressor()
selector1 = feature_selection.SelectFromModel(
    estimator1,
    threshold=-np.inf,   # no importance cut-off: max_features alone decides
    max_features=n_features
)

# 2) Lasso (L1) coefficients.
estimator2 = linear_model.Lasso()
selector2 = feature_selection.SelectFromModel(
    estimator2,
    threshold=-np.inf,   # no coefficient cut-off: max_features alone decides
    max_features=n_features
)

# 3) Univariate F-test against the target.
estimator3 = feature_selection.f_regression
selector3 = feature_selection.SelectKBest(
    estimator3,
    k=n_features
)

model = linear_model.LinearRegression()

# 'reduce_dim' is a placeholder step; the grid fills it with each selector.
pipe = Pipeline([('reduce_dim', None),('regression', model)])

grid = {'reduce_dim': [selector1,selector2,selector3]}

# GridSearchCV is reached through the already-imported model_selection module;
# the bare name was never imported and fails on a fresh kernel.
search = model_selection.GridSearchCV(pipe, cv=4, n_jobs=-1, param_grid=grid)

# Fit on a 1/7 subsample to keep the search affordable. The draw is seeded so
# the cell is reproducible, and made without replacement so the same row
# cannot end up in both the fitting and the scoring part of a CV split.
rng = np.random.RandomState(2018)
index = rng.choice(len(train), size=len(train) // 7, replace=False)
search.fit(train_features.iloc[index],train.iloc[index].deal_probability)

# SelectFromModel wraps an estimator; SelectKBest has no `.estimator`
# attribute, so fall back to printing the selector itself.
best_step = search.best_estimator_.named_steps['reduce_dim']
print(getattr(best_step, 'estimator', best_step))




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_reduce_dim,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,mean_train_score,std_train_score
0,112.951528,1.562672,0.024918,0.00518,SelectFromModel(estimator=ExtraTreesRegressor(...,{'reduce_dim': SelectFromModel(estimator=Extra...,0.001398,0.000977,0.000657,0.00133,0.00109,0.000297,2,0.002203,0.002274,0.002429,0.002279,0.002296,8.2e-05
1,1.159754,0.157788,0.033348,0.016042,"SelectFromModel(estimator=Lasso(alpha=1.0, cop...",{'reduce_dim': SelectFromModel(estimator=Lasso...,0.001468,0.00127,0.001236,0.001134,0.001277,0.000121,1,0.001529,0.001595,0.001633,0.001651,0.001602,4.7e-05
2,0.755045,0.164839,0.020064,0.007867,"SelectKBest(k=29, score_func=<function f_regre...","{'reduce_dim': SelectKBest(k=29, score_func=<f...",0.001384,0.00101,0.000583,0.00116,0.001034,0.000293,3,0.002184,0.002225,0.002417,0.002243,0.002267,8.9e-05


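- Lasso-based selection gave the best mean test score, using the pipeline's default `LinearRegression` score ($R^2$). Note that all three scores are around 0.001, so the differences between the selectors are very small.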

In [87]:
# Pull the feature-selection step out of the best pipeline found by the search.
best = search.best_estimator_.named_steps['reduce_dim']

In [90]:
# Refit the winning selector on the full training set and show only the shape
# of the reduced matrix instead of dumping the whole array into the output.
best.fit_transform(train_features,train.deal_probability).shape

(1503424, 29)

In [None]:
# Drop the leftover `var_f` if it exists. No cell in this notebook defines it,
# so a plain `del` raises NameError on a fresh kernel.
globals().pop('var_f', None)

In [91]:
import sys

# Names IPython puts into every session, plus this list itself.
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# (name, size in bytes) for every user-defined global, largest first.
# Private names, imported modules and the IPython built-ins are skipped.
sorted(
    (
        (name, sys.getsizeof(globals().get(name)))
        for name in dir()
        if not (name.startswith('_') or name in sys.modules or name in ipython_vars)
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

[('train', 2177612018),
 ('train_features', 1804108904),
 ('index', 1718288),
 ('GridSearchCV', 1464),
 ('top_common', 1264),
 ('Pipeline', 1056),
 ('var_f', 296),
 ('grid', 240),
 ('color', 176),
 ('estimator3', 136),
 ('city', 83),
 ('ensemble', 80),
 ('feature_selection', 80),
 ('lgb', 80),
 ('linear_model', 80),
 ('metrics', 80),
 ('model_selection', 80),
 ('np', 80),
 ('pd', 80),
 ('plt', 80),
 ('preprocessing', 80),
 ('sns', 80),
 ('best', 56),
 ('estimator', 56),
 ('estimator1', 56),
 ('estimator2', 56),
 ('model', 56),
 ('pipe', 56),
 ('search', 56),
 ('selector', 56),
 ('selector1', 56),
 ('selector2', 56),
 ('selector3', 56),
 ('key', 54),
 ('n_features', 28)]

### Features: Param_1, 2 & 3
- Features from these are challenging because some values repeat in more than one of these variables. Therefore when creating dummies, column names overlap. Adding prefixes helps but some values exist only in train or test set, therefore each set ends with a different number of features. The solution is to gather a custom list of values to create features from.

### Features: Region, Parent Category, Category
- These can be done without as much juggling as `city` or `param_x`

In [12]:


# Categorical variables for dummies: the transliterated columns, except city
# (city indicators are built separately from a hand-picked subset).
cat_vars = train.columns[-7:].drop('city_en')

# NOTE(review): `test` is deleted and `test_features` is never created in the
# earlier cells of this notebook - restore both before running this cell.

# Encode each frame explicitly instead of guessing the target frame from
# `len(dataset)`, which sends test dummies into train_features whenever both
# frames happen to have the same number of rows. Passing the frame slice to
# get_dummies prefixes every dummy with its source column name.
train_dummies = pd.get_dummies(train[cat_vars])

# Align test to train's dummy columns: categories seen only in test are
# dropped, categories missing from test become all-zero columns, so both
# feature frames end up with identical columns.
test_dummies = pd.get_dummies(test[cat_vars]).reindex(
    columns=train_dummies.columns, fill_value=0
)

train_features = train_features.join(train_dummies)
test_features = test_features.join(test_dummies)

del train_dummies, test_dummies

In [None]:
# Start the feature frames from the numeric/date columns.
# The original lists lacked a comma after 'activation_date', so Python glued it
# to 'image_top_1' and asked for the nonexistent column
# 'activation_dateimage_top_1'.
# NOTE(review): no visible cell creates 'price_fill' - confirm where it comes from.
train_features = train[['price_fill','item_seq_number','activation_date','image_top_1','deal_probability']].copy()
test_features = test[['price_fill','item_seq_number','activation_date','image_top_1']].copy()

# NOTE(review): the transliteration cell replaces these columns with '*_en'
# versions (all but user_type) - confirm which names exist at this point.
cat_vars = ["region", "city", "parent_category_name", "category_name", "user_type", "param_1", "param_2", "param_3"]
for col in cat_vars:
    # Cast to str so missing values become a regular category.
    train_values = list(train[col].values.astype('str'))
    test_values = list(test[col].values.astype('str'))

    # Fit on train and test together so every value seen in either set gets a code.
    lbl = preprocessing.LabelEncoder()
    lbl.fit(train_values + test_values)
    train_features[col] = lbl.transform(train_values)
    test_features[col] = lbl.transform(test_values)

In [None]:
# Target vector.
y_train = train["deal_probability"].values

# Integer-encode the categorical columns in place.
# NOTE(review): `train_df` / `test_df` are not created in any visible cell -
# confirm where they come from.
cat_vars = ["region", "city", "parent_category_name", "category_name", "user_type", "param_1", "param_2", "param_3"]
for col in cat_vars:
    train_strings = list(train_df[col].values.astype('str'))
    test_strings = list(test_df[col].values.astype('str'))

    # One encoder per column, fitted on the values of both sets.
    lbl = preprocessing.LabelEncoder().fit(train_strings + test_strings)
    train_df[col] = lbl.transform(train_strings)
    test_df[col] = lbl.transform(test_strings)

# Identifier, free-text and date columns are removed from both matrices;
# the extra columns below are removed from the training matrix only.
cols_to_drop = ["item_id", "user_id", "title", "description", "activation_date", "image"]
train_only_drop = ["region_en", "parent_category_name_en", "category_name_en", "price_new", "deal_probability"]

train_X = train_df.drop(columns=cols_to_drop + train_only_drop)
test_X = test_df.drop(columns=cols_to_drop)

In [None]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    """Train a LightGBM regressor with early stopping and predict on test_X.

    Parameters
    ----------
    train_X, train_y : training features and target.
    val_X, val_y : validation features and target, used for early stopping.
    test_X : features to predict on.

    Returns
    -------
    pred_test_y : predictions for test_X made at the best iteration.
    model : the trained booster.
    evals_result : dict filled by LightGBM with the validation metric history.
    """
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 30,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.7,
        # The LightGBM parameter is `bagging_freq`; the former key
        # "bagging_frequency" is not a recognised name, so bagging stayed
        # disabled and bagging_fraction had no effect.
        "bagging_freq" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    # Up to 1000 rounds; stop once the validation RMSE has not improved for
    # 100 rounds, logging the metric every 20 rounds.
    # NOTE(review): early_stopping_rounds / verbose_eval / evals_result are
    # keyword arguments of older LightGBM releases - confirm the installed
    # version accepts them (newer releases use callbacks instead).
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20, evals_result=evals_result)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

## Add more Features

- (Feature) has description
- (Feature) has photo
- (Feature) has param 1,2,3
- (Feature) has price
- (Feature) word count in title, description,
- (Feature) population of region
- (Feature) string value is unique. title, description, param1, etc...

In [None]:
# New variable on weekday #
#train["activation_weekday"] = train["activation_date"].dt.weekday
#test["activation_weekday"] = test["activation_date"].dt.weekday
