Modeling with auctions dataframe that has explicit brand,model,lens information

In [2]:
from __future__ import division
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, roc_curve, auc, mean_absolute_error, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, StratifiedKFold
from sklearn.model_selection import ShuffleSplit, learning_curve
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.externals import joblib
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

import xgboost as xgb

import sys
sys.path.insert(0, '/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/data-analysis/utilities/')
from plot_learning_curve import plot_learning_curve
from clean_text import clean_text


pd.set_option('display.max_columns', 60)
pd.set_option('display.max_colwidth',100)

# Import Data
---

In [3]:
auctions = pd.read_pickle('./pickles/auctions_brand_model_hlens.p')

**Filter AUctions dataframe for listings where we are pretty certain we extracted the correct model**

IF YOU CHANGE THIS THRESHOLD, YOU MUST ALSO CHANGE THE THRESHOLD IN auctions-modeling-2!!!!

In [4]:
auctions = auctions[auctions['similarity_score']> 0.7]

# Extract Features
---

In [168]:
# STart TIme
start_time_series = auctions['listingInfo.startTime']
# End TIme
end_time_series = auctions['listingInfo.endTime']
# Brand
brand_dummies = pd.get_dummies(data=auctions['brand'])
# Model
model_dummies = pd.get_dummies(data=auctions['model'])
# Has Lens
has_lens_series = pd.Series(data=auctions['has_lens'], name='has_lens')
# Condition Display Name
auction_condition_display_name_dummies = pd.get_dummies(data=auctions['condition.conditionDisplayName'])
# Feedback percent
feedback_percent_series = pd.Series(auctions['sellerInfo.positiveFeedbackPercent'], name='feedback_percent')
# Top rated seller 
top_rated_seller_series = pd.Series(auctions['sellerInfo.topRatedSeller'].apply(lambda x: 1 if x==True else 0), name='top_rated_seller')
# Expedited Shipping
expedited_shipping_series = pd.Series(auctions['shippingInfo.expeditedShipping'], name='expedited_shipping')
# One day shipping 
one_day_shipping_series = pd.Series(auctions['shippingInfo.oneDayShippingAvailable'], name='one_day_shipping')
# Shipping cost
shipping_cost_series = pd.Series(auctions['shippingInfo.shippingServiceCost.value'], name='shipping_cost')
# Get free shipping status 
free_shipping_series = auctions['shippingInfo.shippingType'].apply(lambda x: 1 if x=='Free' else 0)
# Handling Time 
handling_time_series = pd.Series(auctions['shippingInfo.handlingTime'], name='handling_time')
# Returns accapted
returns_accepted_series = pd.Series(auctions['returnsAccepted'], name='returns_accepted')
# Start Price
start_price_series = auctions['startPrice']
# Similarity score
similarity_score_series = auctions['similarity_score']
# Soldb
sold = auctions['sold_state']
# End Price
end_price = auctions['endPrice']







# titles = auctions['title']
# condition_combined = auctions['conditionCombined']
# handling_time_series = pd.Series(auctions['shippingInfo.handlingTime'], name='handling_time')


# Create DataFrame

In [169]:
data_frames_to_keep = [start_time_series,\
                      end_time_series,\
                      brand_dummies,\
                      model_dummies,\
                      has_lens_series,\
                      auction_condition_display_name_dummies,\
                      feedback_percent_series,\
                      top_rated_seller_series,\
                      expedited_shipping_series,\
                      one_day_shipping_series,\
                       shipping_cost_series,\
                       free_shipping_series,\
                       handling_time_series,\
                       returns_accepted_series,\
                       start_price_series,\
                       similarity_score_series,\
                       sold]

df_classification = pd.concat(data_frames_to_keep, axis=1)

**impute missing values**

In [170]:
# Impute missing values
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy='most_frequent', axis=0)
df_classification['shipping_cost'] = imputer.fit_transform(df_classification['shipping_cost'].values.reshape(-1,1))

df_classification['handling_time'] = imputer.fit_transform(df_classification['handling_time'].values.reshape(-1,1))

print 'Null Rows %:',np.sum(df_classification.isnull().sum())/df_classification.shape[0]

Null Rows %: 0.0


In [171]:
df_classification.head(1)

Unnamed: 0,listingInfo.startTime,listingInfo.endTime,cannon,canon,casio,coleman,fujifilm,gopro,kodak,leica,nikon,olympus,panasonic,pentax,polaroid,ricoh,samsung,sigma,sony,1 v1,10d,160 / ixus 160,170 is / ixus 170,1d,1d mark ii,1d mark iii,1d mark iv,1ds mark ii,1ds mark iii,2 ek-gc200,...,xp60,xp90,xp95,xq1,xs / eos 1000d,xsi / eos 450d,xt,xt / eos 350d,xti,xti / eos 400d,xz-1,z340,has_lens,For parts or not working,Manufacturer refurbished,New,New other (see details),Seller refurbished,Used,feedback_percent,top_rated_seller,expedited_shipping,one_day_shipping,shipping_cost,shippingInfo.shippingType,handling_time,returns_accepted,startPrice,similarity_score,sold_state
0,2017-03-23 18:27:59,2017-03-30 18:27:59,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,91.7,0,1,0,0.0,0,2.0,1,399.0,0.804452,0


In [172]:
df_classification.to_pickle('./pickles/df_classification_explicit_features.p')

### Find Max, Median, Mean End Price for each model

In [159]:
data_frames_to_keep = [pd.Series(data=auctions['brand'], name='brand'),\
                      pd.Series(data=auctions['model'], name='model'),\
                      has_lens_series,\
                      auction_condition_display_name_dummies,\
                       start_price_series,\
                       similarity_score_series,\
                       sold,\
                       end_price]

df_end_price = pd.concat(data_frames_to_keep, axis=1)

In [160]:
df_end_price

Unnamed: 0,brand,model,has_lens,For parts or not working,Manufacturer refurbished,New,New other (see details),Seller refurbished,Used,startPrice,similarity_score,sold_state,endPrice
0,canon,sl1 / eos 100d,0,0,0,1,0,0,0,399.00,0.804452,0,399.00
1,sony,dsc-rx100,0,0,0,1,0,0,0,300.00,1,1,369.00
2,canon,eos rebel t6,1,0,0,0,1,0,0,349.99,1,1,385.00
3,canon,eos rebel t6,1,0,0,1,0,0,0,0.99,1,1,387.00
4,sony,nex-f3,1,0,0,0,0,0,1,380.00,1,0,380.00
6,nikon,d2h,0,0,0,0,0,0,1,378.00,1,1,378.00
7,panasonic,dmc-g7,0,0,0,0,0,0,1,382.00,0.705792,0,382.00
8,sony,dsc-rx100 ii,0,0,0,0,1,0,0,150.00,0.883035,1,394.00
9,nikon,d3300,1,0,0,1,0,0,0,0.01,0.820714,1,377.00
10,sony,dsc-hx300,0,0,0,0,0,0,1,121.00,1,0,121.00


In [153]:
df_end_price['model_max_end_price'] = None
df_end_price['model_median_end_price'] = None
df_end_price['model_mean_end_price'] = None

In [161]:
model_end_price_dict = {}

for model in auctions['model'].unique().tolist():
    model_end_price_dict[model] = {}
    
    model_end_prices = df_end_price.ix[df_end_price['model']==model,'endPrice'].values
    
    model_end_price_dict[model]['max_end_price'] = np.max(model_end_prices)
    model_end_price_dict[model]['median_end_price'] = np.median(model_end_prices)
    model_end_price_dict[model]['mean_end_price'] = np.mean(model_end_prices)

In [162]:
import pickle 

def save_obj(obj, name ):
    with open('./pickles/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('./pickles/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [164]:
save_obj(model_end_price_dict, 'model_end_price_dict')

In [154]:
for index in range(df_end_price.shape[0]):
#     if index%100 == 0:
#         print 'Calculating for index {} out of {}'.format(index, df_end_price.shape[0])
    
    df_index = df_end_price.index[index]
    
    model = df_end_price.ix[df_index,'model']
    
    df_end_price.set_value(index=df_index, col='model_max_end_price', value=model_end_price_dict[model]['max_end_price'])
    df_end_price.set_value(index=df_index, col='model_median_end_price', value=model_end_price_dict[model]['median_end_price'])
    df_end_price.set_value(index=df_index, col='model_mean_end_price', value=model_end_price_dict[model]['mean_end_price'])    
    
    

In [155]:
df_end_price.head(10)

Unnamed: 0,brand,model,has_lens,For parts or not working,Manufacturer refurbished,New,New other (see details),Seller refurbished,Used,startPrice,similarity_score,sold_state,endPrice,model_max_end_price,model_median_end_price,model_mean_end_price
0,canon,sl1 / eos 100d,0,0,0,1,0,0,0,399.0,0.804452,0,399.0,812,323.0,336.541
1,sony,dsc-rx100,0,0,0,1,0,0,0,300.0,1.0,1,369.0,1000,47.5,113.217
2,canon,eos rebel t6,1,0,0,0,1,0,0,349.99,1.0,1,385.0,650,377.0,373.462
3,canon,eos rebel t6,1,0,0,1,0,0,0,0.99,1.0,1,387.0,650,377.0,373.462
4,sony,nex-f3,1,0,0,0,0,0,1,380.0,1.0,0,380.0,859,155.75,226.605
5,,lumix dmc-gh1,0,0,0,0,1,0,0,370.0,0.677605,1,370.0,1136,66.045,149.477
6,nikon,d2h,0,0,0,0,0,0,1,378.0,1.0,1,378.0,450,146.25,178.308
7,panasonic,dmc-g7,0,0,0,0,0,0,1,382.0,0.705792,0,382.0,700,444.44,458.893
8,sony,dsc-rx100 ii,0,0,0,0,1,0,0,150.0,0.883035,1,394.0,895,355.0,360.454
9,nikon,d3300,1,0,0,1,0,0,0,0.01,0.820714,1,377.0,751,341.0,343.172


In [125]:
df_end_price.to_pickle('./pickles/df_end_prices.p')

# Feature Engineering

## Train Test Split
---

In [51]:
X_class = df_classification.ix[:, 2:-1].values
y_class = df_classification.ix[:, -1].values
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, random_state=42, test_size=0.2)

**FEATURE ENGINEERING - Remove Features with Low Variance **

In [75]:
from sklearn.feature_selection import VarianceThreshold

vt = VarianceThreshold(threshold=0.7)

vt.fit(X_class)

variances = zip(list(vt.variances_), df_classification.columns.tolist()[2:-1])

sorted(variances, key=lambda x: x[0], reverse=True)

[(40409.228042276394, 'startPrice'),
 (761.4865514703082, 'shipping_cost'),
 (508.3521718907827, 'feedback_percent'),
 (0.81419892977716257, 'handling_time'),
 (0.23862447918438678, 'canon'),
 (0.21055445058301514, 'nikon'),
 (0.20988415822211351, 0),
 (0.20988415822211351, 1),
 (0.20443815697610207, 'expedited_shipping'),
 (0.19805158166129461, 'returns_accepted'),
 (0.14741111536704626, 'shippingInfo.shippingType'),
 (0.13284461632372538, 'Used'),
 (0.12577533873146113, 'top_rated_seller'),
 (0.10016191585895308, 'sony'),
 (0.04765960259473228, 'panasonic'),
 (0.046734715218210952, 'New'),
 (0.045892078398183732, 'New other (see details)'),
 (0.041823159578001479, 'For parts or not working'),
 (0.033173798510921573, 'olympus'),
 (0.03108223954095022, 'fujifilm'),
 (0.024037868930086706, 'lumix dmc-gh1'),
 (0.021858218289456138, 'd3200'),
 (0.018771175291605147, 'alpha a6000'),
 (0.018681386407990087, 'd3100'),
 (0.017422511923280506, 'samsung'),
 (0.016701621264963714, 'eos 5d mark i

## Classification 
---

**Baseline**

In [54]:
print 'Baseline Accuracy:',accuracy_score(y_test_class, np.ones(y_test_class.shape[0]))

Baseline Accuracy: 0.887747957993


** Logistic Regression**

In [52]:
lrc = LogisticRegression()

scores = cross_val_score(estimator=lrc, X=X_class, y=y_class, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)

print 'Average cross fold Classification Accuracy:', np.mean(scores)


Average cross fold Classification Accuracy: 0.847359261017


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.8s finished


**Classification**

In [84]:
rfc_scores = cross_val_score(estimator=RandomForestClassifier(n_estimators=75), X=X_class, y=y_class, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)

print 'Average cross fold Classification Accuracy:', np.mean(rfc_scores)

Average cross fold Classification Accuracy: 0.893622799874


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   31.2s finished


In [80]:
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1)

rfc.fit(X_train_class, y_train_class)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

**KNN**

In [57]:
knc_scores = cross_val_score(estimator=KNeighborsClassifier(n_neighbors=10), X=X_class, y=y_class, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)

print 'Average cross fold Classification Accuracy:', np.mean(knc_scores)

Average cross fold Classification Accuracy: 0.868416114307


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.3s finished


**XGBoost**

In [83]:
# param = {'max_depth':15, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
# num_round = 2
# bst = xgb.train(param, dtrain, num_round)
# # make prediction
# preds = bst.predict(dtest)

## Examine Errors