# Auctions Classification
---
Uses only listings whose model was extracted with confidence. 

In [18]:
from __future__ import division
import pandas as pd
import datetime
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, roc_curve, auc, mean_absolute_error, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, StratifiedKFold
from sklearn.model_selection import ShuffleSplit, learning_curve
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsRegressor
from sklearn.externals import joblib
import xgboost as xgb

import sys
sys.path.insert(0, '/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/data-analysis/utilities/')
from plot_learning_curve import plot_learning_curve
from clean_text import clean_text


pd.set_option('display.max_columns', 60)

In [19]:
import pickle 

def save_obj(obj, name ):
    with open('./pickles/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('./pickles/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)
    

## Import Data
---

**Import classification data**

In [20]:
# df_classification = pd.read_pickle('./pickles/df_classification_titles_tfidf.p')
# df_classification = pd.read_pickle('./pickles/df_classification_tfidf.p')
df_classification = pd.read_pickle('./pickles/df_classification_count_vec.p')


In [21]:
df_classification.head(2)

Unnamed: 0,z990 12,excellent cond,lumix digital,grip charger,medium format,270,275,kodak,af nikon,body read,cmos smart,package deal,card case,lens free,batteries strap,e6,slt a33,slt a35,zoom lense,fe fe,elph 170,fujifilm pro1,tz25,body working,12 megapixel,titanium,zoom mega,fuji fine,dsc f707,travel,...,bundle,photos hassle,scratches dents,cleaned,condition low,please read,shutter actuations,store returns,dent,operational,know,repair,warranty,ex,little use,For parts or not working,Manufacturer refurbished,New,New other (see details),Seller refurbished,Used,feedback_percent,top_rated_seller,expedited_shipping,shipping_cost,handling_time,one_day_shipping,returns_accepted,startPrice,sold_state
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,91.7,0,1,0.0,2.0,0,1,399.0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,100.0,0,1,0.0,3.0,0,0,300.0,1


## Train Test Split
---

In [22]:
X_class = df_classification.ix[:, :-1].values
y_class = df_classification.ix[:, -1].values
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, random_state=42, test_size=0.2)

In [23]:
del df_classification

## Modeling
---

## Classification

### Random Forest

In [8]:
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=1)
rfc.fit(X_train_class, y_train_class)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   54.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.3min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=1, warm_start=False)

In [59]:
rfc_pred = rfc.predict(X_test_class)
rfc_proba = rfc.predict_proba(X_test_class)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.6s finished


In [170]:
print 'Baseline Accuracy:',accuracy_score(y_test_class, np.ones(y_test_class.shape[0]))
print 'Model accuracy:',accuracy_score(y_test_class, rfc_pred)

Baseline Accuracy: 0.84756302521
Model accuracy: 0.903025210084


Random Forest Cross Val

In [175]:
rfc_scores = cross_val_score(estimator=RandomForestClassifier(n_estimators=100), X=X_class, y=y_class, cv=3, scoring='accuracy', n_jobs=-1)
print np.mean(rfc_scores)

0.844867686184


Random Forest Feature importances

In [97]:
# sorted(zip(df_classification.columns.tolist(),list(rfc.feature_importances_)), key=lambda x: x[1], reverse=True)

**Export**

In [None]:
joblib.dump(rfc, './model_pickles/basic_random_forest_class.pkl') 

**Grid Search**

In [None]:
rfc_param_grid = {
    'n_estimators':[250,500],
    'min_samples_split':[2, 4, 6],
    'min_samples_leaf':[1,3,5],
    'max_depth':[4, 8]
}

gs_rfc = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rfc_param_grid, scoring='accuracy',n_jobs=-1,verbose=100)

gs_rfc.fit(X_train_class, y_train_class)

joblib.dump(gs_rfc.best_estimator_, './model_pickles/gs_rfc_class.pkl') 

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Memmaping (shape=(23799, 7737), dtype=float64) to new file /var/folders/90/m6j358k94lxbhz0y6yl5rd640000gp/T/joblib_memmaping_pool_91107_4589162640/91107-4613998992-c6810f481be8dea0d75934ec9aab82df.pkl
Pickling array (shape=(23799,), dtype=int64).
Pickling array (shape=(15865,), dtype=int64).
Pickling array (shape=(7934,), dtype=int64).
[CV] min_samples_split=2, n_estimators=250, max_depth=4, min_samples_leaf=1 
[CV] min_samples_split=2, n_estimators=250, max_depth=4, min_samples_leaf=1 
Memmaping (shape=(23799, 7737), dtype=float64) to old file /var/folders/90/m6j358k94lxbhz0y6yl5rd640000gp/T/joblib_memmaping_pool_91107_4589162640/91107-4613998992-c6810f481be8dea0d75934ec9aab82df.pkl
Pickling array (shape=(23799,), dtype=int64).
Pickling array (shape=(15866,), dtype=int64).
Pickling array (shape=(7933,), dtype=int64).
[CV] min_samples_split=2, n_estimators=250, max_depth=4, min_samples_leaf=1 
Memmaping (shape=(23799, 7737),

### Logistic Regression

In [54]:
lrc = LogisticRegression(penalty='l2',C=.8)
lrc.fit(X_train_class, y_train_class)

lrc_pred = lrc.predict(X_test_class)

print 'Baseline Accuracy:',accuracy_score(y_test_class, np.ones(y_test_class.shape[0]))
print 'Model accuracy:',accuracy_score(y_test_class, lrc_pred)

Baseline Accuracy: 0.84756302521
Model accuracy: 0.875462184874


**Cross Val**

In [102]:
lrc_scores = cross_val_score(estimator=LogisticRegression(penalty='l2',C=.8), X=X_class, y=y_class, cv=3, scoring='accuracy', n_jobs=-1)
print np.mean(lrc_scores)

0.781099028892


**Predict**

In [58]:
lrc_proba = lrc.predict_proba(X_test_class)

**Feature Importances**

In [103]:
lrc_feature_importances = sorted(zip(df_classification.columns.tolist()[:-1], list(lrc.coef_[0])), key=lambda x: x[1], reverse=True)

def save_obj(obj, name ):
    with open('./pickles/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

**Export**

In [98]:
joblib.dump(lrc, './model_pickles/basic_logistic_reg_class.pkl') 

['./model_pickles/basic_logistic_reg_class.pkl']

### XGBoost

In [43]:
xgbc = xgb.XGBClassifier(n_estimators=100)

xgbc.fit(X=X_train_class, y=y_train_class, verbose=True)

XGB_pred = xgbc.predict(X_test_class)

In [57]:
XGB_proba = xgbc.predict_proba(X_test_class)

**Export**

In [99]:
joblib.dump(xgbc, './model_pickles/basic_xgbc_class.pkl') 

['./model_pickles/basic_xgbc_class.pkl']

**Grid Search**

In [None]:
xgbc_param_grid = {
    'n_estimators':[400],
    'learning_rate':[0.1],
    'max_depth':[3, 5, 7]
}

gs_xgbc = GridSearchCV(estimator=xgb.XGBClassifier(), param_grid=xgbc_param_grid, scoring='accuracy',n_jobs=-1,verbose=100)

gs_xgbc.fit(X_train_class, y_train_class)

joblib.dump(gs_xgbc.best_estimator_, './model_pickles/gs_xgbc_class.pkl') 

## Combine Predictions

**Using Majoroity Rules of predictions**

In [55]:
import math 
y_pred = []
for i in range(XGB_pred.shape[0]):
    predictions = [XGB_pred[i], lrc_pred[i], rfc_pred[i]]
    if predictions.count(1) > math.floor(len(predictions)/2):
        y_pred.append(1)
    else:
        y_pred.append(0)

In [56]:
print 'Baseline Accuracy:',accuracy_score(y_test_class, np.ones(y_test_class.shape[0]))
print 'Ensemble Maority Rules accuracy:',accuracy_score(y_test_class, y_pred)

Baseline Accuracy: 0.84756302521
Ensemble accuracy: 0.885210084034


**Using Geometric Mean of Prediction Probabilities **

In [66]:
from scipy.stats.mstats import gmean

proba_class1 = np.matrix((rfc_proba[:, 1], lrc_proba[:, 1], XGB_proba[:, 1]))

proba_class1 = np.ravel(gmean(proba_class1,axis=0))

y_pred = map(lambda x: 1 if x > 0.5 else 0, proba_class1)

In [67]:
print 'Baseline Accuracy:',accuracy_score(y_test_class, np.ones(y_test_class.shape[0]))
print 'Ensemble Probability Gmean Accuracy:',accuracy_score(y_test_class, y_pred)

Baseline Accuracy: 0.84756302521
Ensemble Probability Gmean Accuracy: 0.895294117647


**KFold cross validate Gometric Mean Predictions**

In [68]:
num_splits = 3
skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

rfc_kfold  = RandomForestClassifier(n_estimators=100, n_jobs=-1)
lrc_kfol   = LogisticRegression(penalty='l2',C=.8, n_jobs=-1)
xgbc_kfold = xgb.XGBClassifier(n_estimators=50, nthread=-1)

scores = []
for i,(train_indeces, test_indeces) in enumerate(skf.split(X_class,y_class)):
    print 'training #{} model out of {}.'.format(i, num_splits)

    X_train, X_test = X_class[train_indeces], X_class[test_indeces]
    y_train, y_test = y_class[train_indeces], y_class[test_indeces]
    
    rfc_kfold.fit(X_train, y_train)
    lrc_kfol.fit(X_train, y_train)
    xgbc_kfold.fit(X_train, y_train)
    
    rfc_kfold_proba_train = rfc_kfold.predict_proba(X_train)
    rfc_kfold_proba_test  = rfc_kfold.predict_proba(X_test)
    lrc_kfol_train        = lrc_kfol.predict_proba(X_train)
    lrc_kfol_test         = lrc_kfol.predict_proba(X_test)
    xgbc_kfold_train      = xgbc_kfold.predict_proba(X_train)
    xgbc_kfold_test       = xgbc_kfold.predict_proba(X_test)
    
    proba_class1 = np.matrix((rfc_kfold_proba_train[:, 1], lrc_kfol_train[:, 1], xgbc_kfold_train[:, 1]))
    proba_class1 = np.ravel(gmean(proba_class1,axis=0))
    y_pred_train = map(lambda x: 1 if x > 0.5 else 0, proba_class1)
    
    proba_class1 = np.matrix((rfc_kfold_proba_test[:, 1], lrc_kfol_test[:, 1], xgbc_kfold_test[:, 1]))
    proba_class1 = np.ravel(gmean(proba_class1,axis=0))
    y_pred_test = map(lambda x: 1 if x > 0.5 else 0, proba_class1)
    
    
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)
    scores.append((acc_train, acc_test))    

training #0 model out of 3.
training #1 model out of 3.
training #2 model out of 3.


In [69]:
sorted(scores, key=lambda x: x[0], reverse=True)

[(0.94599636950383215, 0.89573459715639814),
 (0.94433519891090612, 0.88674868898749493),
 (0.94418393586446836, 0.89179104477611937)]

In [78]:
print 'Baseline Accuracy:', y_test.sum()/y_test.shape[0]
print 'Cross Validated Ensemble GMean Prediction Accuracy:',gmean([score[1] for score in scores])
print 'Increase Accuracy due to model:',gmean([score[1] for score in scores]) - (y_test.sum()/y_test.shape[0])

Baseline Accuracy: 0.854881000403
Cross Validated Ensemble GMean Prediction Accuracy: 0.891417187736
Increase Accuracy due to model: 0.0365361873322


## Calculate profitability metric
---

**Subsample 2000 rows**

In [None]:
df_test = df_regression.sample(n=2000) # sub sample with all sold_state

df_test.head(2)

df_test['endPrice'].mean()

df_regression['endPrice'].sum()

In [9]:
prices = []
sold_probabilities = []
predicted_end_prices = []

for i in range(df_test.shape[0]): # actual 
# for i in range(3): # test

    if i % 100 == 0:
        print 'Processing {} out of {} rows.'.format(i, df_test.shape[0])
        
    # Grab vector
    item_vec = df_test.iloc[i]
    testStartPrice = item_vec.loc['startPrice']
    testEndPrice = item_vec.loc['endPrice']
    testSoldState = item_vec.loc['sold_state']
    
    # Set up parameters
    minStartPrice = 0
    maxStartPrice = testStartPrice * 2
    priceStepSize = testStartPrice / 5 
    
    max_expected_price = (0,0,0,0,0)
    # Loop through all start prices 
    for price_index,price in enumerate(np.arange(minStartPrice, maxStartPrice, priceStepSize)):
        # Grab test start price 
        item_vec.loc['startPrice'] = price
        
        # Classification        
        test_vec = item_vec.iloc[:-3].values.reshape(1,-1) # :-3 don't include CSM_start_price, sold_state, end_price
        sold_proba = rfc.predict_proba(test_vec)[0][1]
        sold_probabilities.append( (price, sold_proba) )
        
        # Regression
        test_vec = item_vec.iloc[:-2].values.reshape(1,-1) # don't include sold_state, end_price
        predicted_end_price = knn.predict(test_vec)[0]
        predicted_end_prices.append( (price, predicted_end_price) )
        
        # Find Optimal End Price
        if sold_proba > 0.5:
            expected_price = sold_probabilities[price_index][1] * predicted_end_prices[price_index][1] # Sold Probability * Predicted End Price
            if expected_price > max_expected_price[len(max_expected_price)-3]:
                max_expected_price = (price_index,\
                                      price,\
                                      sold_probabilities[price_index][1],\
                                      predicted_end_prices[price_index][1],\
                                      expected_price,\
                                      testSoldState,\
                                      testEndPrice)
    
    # If after looping through all potential start prices, there is no sold_proa > 0.5, then 
    if max_expected_price == (0,0,0,0,0):
        max_expected_price = (0, 0, 0, 0, 0, testSoldState, testEndPrice)
    
    prices.append(max_expected_price)

Processing 0 out of 2000 rows.
Processing 100 out of 2000 rows.
Processing 200 out of 2000 rows.
Processing 300 out of 2000 rows.
Processing 400 out of 2000 rows.
Processing 500 out of 2000 rows.
Processing 600 out of 2000 rows.
Processing 700 out of 2000 rows.
Processing 800 out of 2000 rows.
Processing 900 out of 2000 rows.
Processing 1000 out of 2000 rows.
Processing 1100 out of 2000 rows.
Processing 1200 out of 2000 rows.
Processing 1300 out of 2000 rows.
Processing 1400 out of 2000 rows.
Processing 1500 out of 2000 rows.
Processing 1600 out of 2000 rows.
Processing 1700 out of 2000 rows.
Processing 1800 out of 2000 rows.
Processing 1900 out of 2000 rows.


In [None]:
# print 'Optimal Predicted End Price:${}, \
#         Optimal Start Price:${}, \
#         Chance of Selling:{}, \
#         Expected Profit:${}'.format(max_expected_price[3],\
#                                     max_expected_price[1],\
#                                     max_expected_price[2],\
#                                     max_expected_price[4])

Use prices list of tuples to create average_profit_lift 

In [10]:
prices

[(8,
  240.0,
  0.93000000000000005,
  323.13379749759946,
  300.5144316727675,
  1.0,
  203.0),
 (8,
  1.6000000000000001,
  0.93000000000000005,
  323.13379749759946,
  300.5144316727675,
  1.0,
  235.0),
 (8,
  41.583999999999996,
  0.93000000000000005,
  323.13379749759946,
  300.5144316727675,
  1.0,
  25.989999999999998),
 (8,
  15.984,
  0.93000000000000005,
  323.13379749759946,
  300.5144316727675,
  1.0,
  25.579999999999998),
 (7,
  58.282000000000004,
  0.93000000000000005,
  270.83132826024348,
  251.87313528202645,
  1.0,
  50.0),
 (8,
  47.983999999999995,
  0.93000000000000005,
  323.13379749759946,
  300.5144316727675,
  1.0,
  195.5),
 (8,
  0.016,
  0.93000000000000005,
  323.13379749759946,
  300.5144316727675,
  1.0,
  343.99000000000001),
 (8,
  112.0,
  0.93000000000000005,
  323.13379749759946,
  300.5144316727675,
  1.0,
  256.58999999999997),
 (8,
  640.0,
  0.93000000000000005,
  323.13379749759946,
  300.5144316727675,
  1.0,
  475.0),
 (8,
  40.0,
  0.93000

In [11]:
# for price_index,price,sold_proba,pred_end_price,expected_price,testSoldState,testEndPrice in prices:
#     if sold_proba >= 0.5 and testSoldState == 1:
#         profit_diff = expected_price - testEndPrice
#     elif sold_proba >= 0.5 and testSoldState == 0:
#         profit_diff = expected_price - testEndPrice
#     elif sold_proba < 0.5 and testSoldState == 1:
#         profit_diff = expected_price - testEndPrice

profit_diff = [expected_price - testEndPrice for price_index,\
                                                   price,\
                                                   sold_proba,\
                                                    pred_end_price,\
                                                    expected_price,\
                                                   testSoldState,\
                                                   testEndPrice in prices]

average_profit_lift = np.mean(profit_diff)
    

In [12]:
average_profit_lift

36.419219057754646

## Grid Search

In [174]:
rfc_param_grid = {
    'n_estimators':[250,500],
    'min_samples_split':[2, 4, 6],
#     'max_features':['auto',0.3,0.5],
    'min_samples_leaf':[1,3,5],
    'max_depth':[4, 8]
}

gs = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rfc_param_grid, scoring='accuracy',n_jobs=-1,verbose=100)

gs.fit(X_class, y_class)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Memmaping (shape=(29749, 7737), dtype=float64) to new file /var/folders/90/m6j358k94lxbhz0y6yl5rd640000gp/T/joblib_memmaping_pool_83067_4757259600/83067-7787287504-d9d8a3f5d08b0bbe7363ff96b3929da7.pkl
[CV] min_samples_split=2, n_estimators=250, max_depth=4, min_samples_leaf=1 
Pickling array (shape=(29749,), dtype=int64).
Pickling array (shape=(19832,), dtype=int64).
Pickling array (shape=(9917,), dtype=int64).
[CV] min_samples_split=2, n_estimators=250, max_depth=4, min_samples_leaf=1 
Memmaping (shape=(29749, 7737), dtype=float64) to old file /var/folders/90/m6j358k94lxbhz0y6yl5rd640000gp/T/joblib_memmaping_pool_83067_4757259600/83067-7787287504-d9d8a3f5d08b0bbe7363ff96b3929da7.pkl
Pickling array (shape=(29749,), dtype=int64).
Pickling array (shape=(19833,), dtype=int64).
Pickling array (shape=(9916,), dtype=int64).
[CV] min_samples_split=2, n_estimators=250, max_depth=4, min_samples_leaf=1 
Memmaping (shape=(29749, 7737),

KeyboardInterrupt: 

In [None]:
print 'Best Estimator:',gs.best_estimator_
print 'Best Score:',gs.best_score_
print 'Best Parameters:',gs.best_params_

In [None]:
joblib.dump(gs.best_estimator_, './model_pickles/gs_random_forest.pkl') 

### Logistic Regression

### KNN

### SVM

** Model **

In [63]:
print 'top 5 features:'
feature_importances = sorted(zip(df_classification.columns.tolist()[:-1], rfc.feature_importances_), key=lambda x: x[1], reverse=True)
sorted(zip(df_classification.columns.tolist()[:-1], rfc.feature_importances_), key=lambda x: x[1], reverse=True)[:5]

top 5 features:


[('startPrice', 0.091041724275354508),
 ('concurrent_similar_median_start_price', 0.072779250151591424),
 (u'scratches body', 0.0065584151677305557),
 (u'50mm prime', 0.0048426325250845821),
 (u'255', 0.0041375394373774389)]

# Score Model

Baseline 

In [71]:
from sklearn.metrics import accuracy_score
# calculate baseline
y_true = y_train_class
y_pred = np.ones(y_true.shape[0])
print 'Baseline accuracy:',accuracy_score(y_true, y_pred)

Baseline accuracy: 0.853763351135


Cross val score

In [69]:
# scores = cross_val_score(estimator=RandomForestClassifier(n_estimators=100, n_jobs=-1), X=X_class, y=y_class, scoring='accuracy',cv=3, n_jobs=-1)
# print np.mean(scores)