# Auctions Class/Reg
---
Uses only listings whose model was extracted with confidence. 

In [1]:
from __future__ import division
import pandas as pd
import datetime
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, roc_curve, auc, mean_absolute_error, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, StratifiedKFold
from sklearn.model_selection import ShuffleSplit, learning_curve
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsRegressor
from sklearn.externals import joblib

import sys
sys.path.insert(0, '/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/data-analysis/utilities/')
from plot_learning_curve import plot_learning_curve
from clean_text import clean_text


pd.set_option('display.max_columns', 60)

In [2]:
import pickle 

def save_obj(obj, name ):
    with open('./pickles/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('./pickles/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)
    

### Import data

In [43]:
auctions = pd.read_pickle('./pickles/auctions_brand_model_hlens.p')

explicit_features = pd.read_pickle('./pickles/df_classification_explicit_features.p')

model_end_price_dict = load_obj('model_end_price_dict')

In [9]:
print auctions.shape
print explicit_features.shape

(29961, 54)
(29961, 525)


(optional) Concurrent listing information 

In [29]:
concurrent_similar_median_start_price_series = pd.read_pickle('./feature-engineering-concurrent-similar-median-start-price/pickles/auctions_median_start_price_series.p')
concurrent_similar_median_start_price_series.name = 'concurrent_similar_median_start_price'

In [30]:
# Count NaN values
print 'Total Values:',concurrent_similar_median_start_price_series.shape[0]
print 'Non NaN Values:',concurrent_similar_median_start_price_series.count()
print 'NaN Values:',concurrent_similar_median_start_price_series.shape[0] - concurrent_similar_median_start_price_series.count()

Total Values: 29961
Non NaN Values: 22376
NaN Values: 7585


In [31]:
# Replace missing concurrent median start prices with the median overall start price
concurrent_similar_median_start_price_series[concurrent_similar_median_start_price_series.isnull()] = np.median(auctions['startPrice'])

### Extract Features

In [65]:
# Title 
titles = auctions['title']
# Start Time 
start_time_series = auctions['listingInfo.startTime']
# End TIme
end_time_series = auctions['listingInfo.endTime']
# Brand
brand_dummies = pd.get_dummies(data=auctions['brand'])
# Model
model_series = pd.Series(data=auctions['model'], name='model_series')
# Model Dummies
model_dummies = pd.get_dummies(data=auctions['model'])
# Has Lens
has_lens_series = pd.Series(data=auctions['has_lens'], name='has_lens')
# Condition Display Name
auction_condition_display_name_dummies = pd.get_dummies(data=auctions['condition.conditionDisplayName'])
# Feedback percent
feedback_percent_series = pd.Series(auctions['sellerInfo.positiveFeedbackPercent'], name='feedback_percent')
# Top rated seller 
top_rated_seller_series = pd.Series(auctions['sellerInfo.topRatedSeller'].apply(lambda x: 1 if x==True else 0), name='top_rated_seller')
# Expedited Shipping
expedited_shipping_series = pd.Series(auctions['shippingInfo.expeditedShipping'], name='expedited_shipping')
# One day shipping 
one_day_shipping_series = pd.Series(auctions['shippingInfo.oneDayShippingAvailable'], name='one_day_shipping')
# Shipping cost
shipping_cost_series = pd.Series(auctions['shippingInfo.shippingServiceCost.value'], name='shipping_cost')
# Get free shipping status 
free_shipping_series = auctions['shippingInfo.shippingType'].apply(lambda x: 1 if x=='Free' else 0)
# Handling Time 
handling_time_series = pd.Series(auctions['shippingInfo.handlingTime'], name='handling_time')
# Returns accapted
returns_accepted_series = pd.Series(auctions['returnsAccepted'], name='returns_accepted')
# Start Price
start_price_series = auctions['startPrice']
# Similarity score
similarity_score_series = auctions['similarity_score']
# Soldb
sold = auctions['sold_state']
# End Price
end_price = auctions['endPrice']



# condition_combined = auctions['conditionCombined']
# handling_time_series = pd.Series(auctions['shippingInfo.handlingTime'], name='handling_time')



Clean text

Titles

In [177]:
clean_titles = []
for i,title in enumerate(titles.values):
    if (i+1)%5000==0:
        print 'cleaning #{} out of {} documents'.format(i+1,len(titles.values))
    clean_titles.append(clean_text(title))

cleaning #5000 out of 29961 documents
cleaning #10000 out of 29961 documents
cleaning #15000 out of 29961 documents
cleaning #20000 out of 29961 documents
cleaning #25000 out of 29961 documents


In [178]:
vectorizer = TfidfVectorizer(ngram_range = (1,2),
                             min_df=5,
                             analyzer='word',
                             stop_words=None,
                             max_features=10000,
                            )

titles_matrix = vectorizer.fit_transform(clean_titles)

import operator
# print sorted(vectorizer.vocabulary_.items(), key=operator.itemgetter(1), reverse=True)[:5]

titles_df = pd.DataFrame(titles_matrix.todense(), columns=vectorizer.vocabulary_.keys())

In [68]:
del clean_titles
del titles_matrix

Conditions

In [13]:
clean_conditions = []
for i,cond in enumerate(condition_combined):
    if (i+1)%5000==0:
        print 'cleaning #{} out of {} documents'.format(i+1,len(condition_combined))
    clean_conditions.append(clean_text(cond))

NameError: name 'condition_combined' is not defined

In [35]:
vectorizer = CountVectorizer(ngram_range = (1,2),
                             min_df=30,
                             analyzer='word',
                             stop_words=None,
                             max_features=5000,
                            )

conditions_matrix = vectorizer.fit_transform(clean_conditions)

import operator
# print 'most common condition grams:',sorted(vectorizer.vocabulary_.items(), key=operator.itemgetter(1), reverse=True)[:5]
# print 'least common condition grams:',sorted(vectorizer.vocabulary_.items(), key=operator.itemgetter(1), reverse=False)[:5]

conditions_df = pd.DataFrame(conditions_matrix.todense(), columns=vectorizer.vocabulary_.keys())

In [24]:
del clean_conditions
del conditions_matrix

NameError: name 'clean_conditions' is not defined

## Create Processed DataFrame
---

**Create processed feature dataframe**

In [179]:
# data_frames_to_keep = [titles_df,\
#                        conditions_df,\
#                        auction_condition_dummies,\
#                        feedback_percent_series,\
#                        top_rated_seller_series,\
#                        expedited_shipping_series,\
#                        shipping_cost_series,\
#                        handling_time_series,\
#                        one_day_shipping_series,\
#                        returns_accepted_series,\
#                        start_price_series,\
#                        sold]

data_frames_to_keep = [titles_df,\
#                        start_time_series,\
#                        end_time_series,\
#                        brand_dummies,\
#                        model_dummies,\
#                        model_series,\
                       similarity_score_series,\
                       has_lens_series,\
                       auction_condition_display_name_dummies,\
                       feedback_percent_series,\
                       top_rated_seller_series,\
                       expedited_shipping_series,\
                       one_day_shipping_series,\
                       shipping_cost_series,\
                       free_shipping_series,\
                       handling_time_series,\
                       returns_accepted_series,\
                       start_price_series,\
                       sold]

df_classification = pd.concat(data_frames_to_keep, axis=1)

In [180]:
df_classification = df_classification[df_classification['similarity_score']>0.7]

In [181]:
# Impute missing values
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy='most_frequent', axis=0)
df_classification['shipping_cost'] = imputer.fit_transform(df_classification['shipping_cost'].values.reshape(-1,1))

In [182]:
df_classification['handling_time'] = imputer.fit_transform(df_classification['handling_time'].values.reshape(-1,1))

In [183]:
print 'Null Rows %:',np.sum(df_classification.isnull().sum())/df_classification.shape[0]

Null Rows %: 0.0


In [184]:
del data_frames_to_keep

In [138]:
for index in range(df_classification.shape[0]):

    df_index = df_classification.index[index]
    
    model = df_classification.ix[df_index,'model_series']
    
    df_classification.set_value(index=df_index, col='model_max_end_price', value=model_end_price_dict[model]['max_end_price'])
    df_classification.set_value(index=df_index, col='model_median_end_price', value=model_end_price_dict[model]['median_end_price'])
    df_classification.set_value(index=df_index, col='model_mean_end_price', value=model_end_price_dict[model]['mean_end_price'])    
    
    

In [139]:
df_classification.head(2)

Unnamed: 0,z990 12,excellent cond,lumix digital,grip charger,medium format,270,275,kodak,af nikon,body read,cmos smart,package deal,card case,lens free,batteries strap,e6,slt a33,slt a35,zoom lense,fe fe,elph 170,fujifilm pro1,tz25,body working,12 megapixel,titanium,zoom mega,fuji fine,dsc f707,travel,...,eu,vario asph,focusing screen,135mm,camera near,ixus 850,listingInfo.startTime,listingInfo.endTime,model_series,similarity_score,has_lens,For parts or not working,Manufacturer refurbished,New,New other (see details),Seller refurbished,Used,feedback_percent,top_rated_seller,expedited_shipping,one_day_shipping,shipping_cost,shippingInfo.shippingType,handling_time,returns_accepted,startPrice,sold_state,model_max_end_price,model_median_end_price,model_mean_end_price
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2017-03-23 18:27:59,2017-03-30 18:27:59,sl1 / eos 100d,0.804452,0,0,0,1,0,0,0,91.7,0,1,0,0.0,0,2.0,1,399.0,0,812.0,290.0,334.401449
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2017-03-23 23:26:46,2017-03-30 23:26:46,dsc-rx100,1.0,0,0,0,1,0,0,0,100.0,0,1,0,0.0,0,3.0,0,300.0,1,960.0,256.0,290.977157


In [185]:
# df_classification.drop(labels=['listingInfo.startTime','listingInfo.endTime','similarity_score','model_series'], axis=1, inplace=True)
df_classification.drop(labels=['similarity_score'], axis=1, inplace=True)

In [187]:
df_classification.head(2)

Unnamed: 0,z990 12,excellent cond,lumix digital,grip charger,medium format,270,275,kodak,af nikon,body read,cmos smart,package deal,card case,lens free,batteries strap,e6,slt a33,slt a35,zoom lense,fe fe,elph 170,fujifilm pro1,tz25,body working,12 megapixel,titanium,zoom mega,fuji fine,dsc f707,travel,...,10x zoom,30mp,display black,zoom 20,sx200,grip lenses,600d 18mp,eu,vario asph,focusing screen,135mm,camera near,ixus 850,has_lens,For parts or not working,Manufacturer refurbished,New,New other (see details),Seller refurbished,Used,feedback_percent,top_rated_seller,expedited_shipping,one_day_shipping,shipping_cost,shippingInfo.shippingType,handling_time,returns_accepted,startPrice,sold_state
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0,0,0,91.7,0,1,0,0.0,0,2.0,1,399.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0,0,0,100.0,0,1,0,0.0,0,3.0,0,300.0,1


In [141]:
col_list = df_classification.columns.tolist()[:-4] + df_classification.columns.tolist()[-3:] + [df_classification.columns.tolist()[-4]]

df_classification = df_classification[col_list]

Save df_classification to a pickle

In [115]:
df_classification.to_pickle('./pickles/df_classification_titles_tfidf.p')

## Import Data
---

**Import classification data**

In [116]:
# df_classification = pd.read_pickle('./pickles/df_classification_titles_tfidf.p')
# df_classification = pd.read_pickle('./pickles/df_classification_tfidf.p')
df_classification = pd.read_pickle('./pickles/df_classification_count_vec.p')


In [176]:
df_classification.head(2)

NameError: name 'df_classification' is not defined

**import regression data**

In [86]:
df_regression_all = pd.read_pickle('./pickles/df_regression_tfidf_all.p')
df_regression_all.head(2)

In [3]:
df_regression = pd.read_pickle('./pickles/df_regression_tfidf.p')
df_regression.head(2)

Unnamed: 0,z990 12,excellent cond,lumix digital,grip charger,medium format,270,275,kodak,af nikon,body read,cmos smart,package deal,card case,lens free,batteries strap,e6,slt a33,slt a35,zoom lense,fe fe,elph 170,fujifilm pro1,tz25,body working,12 megapixel,titanium,zoom mega,fuji fine,dsc f707,travel,...,scratches dents,cleaned,condition low,please read,shutter actuations,store returns,dent,operational,know,repair,warranty,ex,little use,For parts or not working,Manufacturer refurbished,New,New other (see details),Seller refurbished,Used,feedback_percent,top_rated_seller,expedited_shipping,shipping_cost,handling_time,one_day_shipping,returns_accepted,startPrice,concurrent_similar_median_start_price,sold_state,endPrice
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,100.0,0,1,0.0,3.0,0,0,300.0,92.5,1,369.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,99.8,1,0,17.99,1.0,0,1,349.99,92.5,1,385.0


## Train Test Split
---

In [160]:
X_class = df_classification.ix[:, :-1].values
y_class = df_classification.ix[:, -1].values
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, random_state=42, test_size=0.2)

In [161]:
del df_classification

In [5]:
X_reg = df_regression.ix[:, :-2].values # skip sold_state
y_reg = df_regression.ix[:, -1].values
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, random_state=42, test_size=0.2) 

In [71]:
del df_regression

## Modeling
---

## Classification

### Random Forest

In [168]:
# rfc = RandomForestClassifier(n_estimators=500, min_samples_split=6, max_depth=10, min_samples_leaf=5, n_jobs=-1, verbose=1)
# rfc = RandomForestClassifier(n_estimators=100, min_samples_split=6, max_depth=10, min_samples_leaf=5, n_jobs=-1, verbose=1)
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=1)
rfc.fit(X_train_class, y_train_class)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.3min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=1, warm_start=False)

In [169]:
y_pred = rfc.predict(X_test_class)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.6s finished


In [170]:
print 'Baseline Accuracy:',accuracy_score(y_test_class, np.ones(y_test_class.shape[0]))
print 'Model accuracy:',accuracy_score(y_test_class, y_pred)

Baseline Accuracy: 0.84756302521
Model accuracy: 0.903025210084


Random Forest Cross Val

In [175]:
rfc_scores = cross_val_score(estimator=RandomForestClassifier(n_estimators=100), X=X_class, y=y_class, cv=3, scoring='accuracy', n_jobs=-1)
print np.mean(rfc_scores)

0.844867686184


Random Forest Feature importances

In [150]:
# sorted(zip(df_classification.columns.tolist(),list(rfc.feature_importances_)), key=lambda x: x[1], reverse=True)

[('startPrice', 0.04109643523094314),
 (u'255', 0.022977187327735439),
 (u'500 hs', 0.022415494893030723),
 (u'zs6 12', 0.020884291560096455),
 (u'eos dslr', 0.019905514062086224),
 (u'p5000', 0.019032502074474118),
 ('model_median_end_price', 0.018114107741566771),
 (u'canon 50d', 0.01504646933354899),
 (u'50mm prime', 0.014219819140401758),
 (u'n2', 0.013872844949497172),
 (u'2mp', 0.013770410656372712),
 ('model_mean_end_price', 0.01184071026794037),
 (u'275 hs', 0.010365863880547499),
 (u'repair', 0.0097837442560321994),
 (u'hot sale', 0.0097706525416459463),
 (u'vr kit', 0.0095407462101508352),
 (u'132 16', 0.0086453248435742584),
 (u'da', 0.0079478362136720954),
 (u'low actuations', 0.0079463471629907939),
 (u'cards', 0.0074091816597863912),
 (u'canon 5d', 0.007285281339188464),
 ('model_max_end_price', 0.007256819219492578),
 (u'55mm 28', 0.006075852448048048),
 (u'fz70', 0.0060112394498459528),
 (u'p600 16', 0.005954556427213791),
 (u'fz7', 0.0058738741274053366),
 (u'gf2 body'

### Logistic Regression

In [171]:
lrc = LogisticRegression()
lrc.fit(X_train_class, y_train_class)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [172]:
y_pred = lrc.predict(X_test_class)

In [173]:
print 'Baseline Accuracy:',accuracy_score(y_test_class, np.ones(y_test_class.shape[0]))
print 'Model accuracy:',accuracy_score(y_test_class, y_pred)

Baseline Accuracy: 0.84756302521
Model accuracy: 0.874621848739


## Regression

In [None]:
# rfr = RandomForestRegressor(n_estimators=10, n_jobs=-1)

# rfr.fit(X_train_reg, y_train_reg)

In [7]:
knn = KNeighborsRegressor(n_neighbors=5, weights='distance')

knn.fit(X_train_reg, y_train_reg)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='distance')

In [None]:
# rfr.predict(X_test_reg)
print 'Baseline  Absolute Error:'
print 'Baseline  Absolute Error:'

**Grab a subsample of 2000 rows**

In [8]:
df_test = df_regression.sample(n=2000) # sub sample with all sold_state

In [163]:
df_test.head(2)

Unnamed: 0,z990 12,excellent cond,lumix digital,grip charger,medium format,270,275,kodak,af nikon,body read,cmos smart,package deal,card case,lens free,batteries strap,e6,slt a33,slt a35,zoom lense,fe fe,elph 170,fujifilm pro1,tz25,body working,12 megapixel,titanium,zoom mega,fuji fine,dsc f707,travel,...,scratches dents,cleaned,condition low,please read,shutter actuations,store returns,dent,operational,know,repair,warranty,ex,little use,For parts or not working,Manufacturer refurbished,New,New other (see details),Seller refurbished,Used,feedback_percent,top_rated_seller,expedited_shipping,shipping_cost,handling_time,one_day_shipping,returns_accepted,startPrice,concurrent_similar_median_start_price,sold_state,endPrice
8837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,100.0,0,1,10.0,2.0,0,0,240.0,92.5,1,290.0
779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,100.0,0,1,0.0,3.0,0,0,70.0,92.5,1,122.5


In [51]:
df_test['endPrice'].mean()

262.72396000000003

In [53]:
df_regression['endPrice'].sum()

6719374.1499999994

## Calculate profitability metric
---

In [9]:
prices = []
sold_probabilities = []
predicted_end_prices = []

for i in range(df_test.shape[0]): # actual 
# for i in range(3): # test

    if i % 100 == 0:
        print 'Processing {} out of {} rows.'.format(i, df_test.shape[0])
        
    # Grab vector
    item_vec = df_test.iloc[i]
    testStartPrice = item_vec.loc['startPrice']
    testEndPrice = item_vec.loc['endPrice']
    testSoldState = item_vec.loc['sold_state']
    
    # Set up parameters
    minStartPrice = 0
    maxStartPrice = testStartPrice * 2
    priceStepSize = testStartPrice / 5 
    
    max_expected_price = (0,0,0,0,0)
    # Loop through all start prices 
    for price_index,price in enumerate(np.arange(minStartPrice, maxStartPrice, priceStepSize)):
        # Grab test start price 
        item_vec.loc['startPrice'] = price
        
        # Classification        
        test_vec = item_vec.iloc[:-3].values.reshape(1,-1) # :-3 don't include CSM_start_price, sold_state, end_price
        sold_proba = rfc.predict_proba(test_vec)[0][1]
        sold_probabilities.append( (price, sold_proba) )
        
        # Regression
        test_vec = item_vec.iloc[:-2].values.reshape(1,-1) # don't include sold_state, end_price
        predicted_end_price = knn.predict(test_vec)[0]
        predicted_end_prices.append( (price, predicted_end_price) )
        
        # Find Optimal End Price
        if sold_proba > 0.5:
            expected_price = sold_probabilities[price_index][1] * predicted_end_prices[price_index][1] # Sold Probability * Predicted End Price
            if expected_price > max_expected_price[len(max_expected_price)-3]:
                max_expected_price = (price_index,\
                                      price,\
                                      sold_probabilities[price_index][1],\
                                      predicted_end_prices[price_index][1],\
                                      expected_price,\
                                      testSoldState,\
                                      testEndPrice)
    
    # If after looping through all potential start prices, there is no sold_proa > 0.5, then 
    if max_expected_price == (0,0,0,0,0):
        max_expected_price = (0, 0, 0, 0, 0, testSoldState, testEndPrice)
    
    prices.append(max_expected_price)

Processing 0 out of 2000 rows.
Processing 100 out of 2000 rows.
Processing 200 out of 2000 rows.
Processing 300 out of 2000 rows.
Processing 400 out of 2000 rows.
Processing 500 out of 2000 rows.
Processing 600 out of 2000 rows.
Processing 700 out of 2000 rows.
Processing 800 out of 2000 rows.
Processing 900 out of 2000 rows.
Processing 1000 out of 2000 rows.
Processing 1100 out of 2000 rows.
Processing 1200 out of 2000 rows.
Processing 1300 out of 2000 rows.
Processing 1400 out of 2000 rows.
Processing 1500 out of 2000 rows.
Processing 1600 out of 2000 rows.
Processing 1700 out of 2000 rows.
Processing 1800 out of 2000 rows.
Processing 1900 out of 2000 rows.


In [None]:
# print 'Optimal Predicted End Price:${}, \
#         Optimal Start Price:${}, \
#         Chance of Selling:{}, \
#         Expected Profit:${}'.format(max_expected_price[3],\
#                                     max_expected_price[1],\
#                                     max_expected_price[2],\
#                                     max_expected_price[4])

Use prices list of tuples to create average_profit_lift 

In [10]:
prices

[(8,
  240.0,
  0.93000000000000005,
  323.13379749759946,
  300.5144316727675,
  1.0,
  203.0),
 (8,
  1.6000000000000001,
  0.93000000000000005,
  323.13379749759946,
  300.5144316727675,
  1.0,
  235.0),
 (8,
  41.583999999999996,
  0.93000000000000005,
  323.13379749759946,
  300.5144316727675,
  1.0,
  25.989999999999998),
 (8,
  15.984,
  0.93000000000000005,
  323.13379749759946,
  300.5144316727675,
  1.0,
  25.579999999999998),
 (7,
  58.282000000000004,
  0.93000000000000005,
  270.83132826024348,
  251.87313528202645,
  1.0,
  50.0),
 (8,
  47.983999999999995,
  0.93000000000000005,
  323.13379749759946,
  300.5144316727675,
  1.0,
  195.5),
 (8,
  0.016,
  0.93000000000000005,
  323.13379749759946,
  300.5144316727675,
  1.0,
  343.99000000000001),
 (8,
  112.0,
  0.93000000000000005,
  323.13379749759946,
  300.5144316727675,
  1.0,
  256.58999999999997),
 (8,
  640.0,
  0.93000000000000005,
  323.13379749759946,
  300.5144316727675,
  1.0,
  475.0),
 (8,
  40.0,
  0.93000

In [11]:
# for price_index,price,sold_proba,pred_end_price,expected_price,testSoldState,testEndPrice in prices:
#     if sold_proba >= 0.5 and testSoldState == 1:
#         profit_diff = expected_price - testEndPrice
#     elif sold_proba >= 0.5 and testSoldState == 0:
#         profit_diff = expected_price - testEndPrice
#     elif sold_proba < 0.5 and testSoldState == 1:
#         profit_diff = expected_price - testEndPrice

profit_diff = [expected_price - testEndPrice for price_index,\
                                                   price,\
                                                   sold_proba,\
                                                    pred_end_price,\
                                                    expected_price,\
                                                   testSoldState,\
                                                   testEndPrice in prices]

average_profit_lift = np.mean(profit_diff)
    

In [12]:
average_profit_lift

36.419219057754646

## Grid Search

In [174]:
rfc_param_grid = {
    'n_estimators':[250,500],
    'min_samples_split':[2, 4, 6],
#     'max_features':['auto',0.3,0.5],
    'min_samples_leaf':[1,3,5],
    'max_depth':[4, 8]
}

gs = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rfc_param_grid, scoring='accuracy',n_jobs=-1,verbose=100)

gs.fit(X_class, y_class)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Memmaping (shape=(29749, 7737), dtype=float64) to new file /var/folders/90/m6j358k94lxbhz0y6yl5rd640000gp/T/joblib_memmaping_pool_83067_4757259600/83067-7787287504-d9d8a3f5d08b0bbe7363ff96b3929da7.pkl
[CV] min_samples_split=2, n_estimators=250, max_depth=4, min_samples_leaf=1 
Pickling array (shape=(29749,), dtype=int64).
Pickling array (shape=(19832,), dtype=int64).
Pickling array (shape=(9917,), dtype=int64).
[CV] min_samples_split=2, n_estimators=250, max_depth=4, min_samples_leaf=1 
Memmaping (shape=(29749, 7737), dtype=float64) to old file /var/folders/90/m6j358k94lxbhz0y6yl5rd640000gp/T/joblib_memmaping_pool_83067_4757259600/83067-7787287504-d9d8a3f5d08b0bbe7363ff96b3929da7.pkl
Pickling array (shape=(29749,), dtype=int64).
Pickling array (shape=(19833,), dtype=int64).
Pickling array (shape=(9916,), dtype=int64).
[CV] min_samples_split=2, n_estimators=250, max_depth=4, min_samples_leaf=1 
Memmaping (shape=(29749, 7737),

KeyboardInterrupt: 

In [None]:
print 'Best Estimator:',gs.best_estimator_
print 'Best Score:',gs.best_score_
print 'Best Parameters:',gs.best_params_

In [None]:
joblib.dump(gs.best_estimator_, './model_pickles/gs_random_forest.pkl') 

### Logistic Regression

### KNN

### SVM

** Model **

In [63]:
print 'top 5 features:'
feature_importances = sorted(zip(df_classification.columns.tolist()[:-1], rfc.feature_importances_), key=lambda x: x[1], reverse=True)
sorted(zip(df_classification.columns.tolist()[:-1], rfc.feature_importances_), key=lambda x: x[1], reverse=True)[:5]

top 5 features:


[('startPrice', 0.091041724275354508),
 ('concurrent_similar_median_start_price', 0.072779250151591424),
 (u'scratches body', 0.0065584151677305557),
 (u'50mm prime', 0.0048426325250845821),
 (u'255', 0.0041375394373774389)]

# Score Model

Baseline 

In [71]:
from sklearn.metrics import accuracy_score
# calculate baseline
y_true = y_train_class
y_pred = np.ones(y_true.shape[0])
print 'Baseline accuracy:',accuracy_score(y_true, y_pred)

Baseline accuracy: 0.853763351135


Cross val score

In [69]:
# scores = cross_val_score(estimator=RandomForestClassifier(n_estimators=100, n_jobs=-1), X=X_class, y=y_class, scoring='accuracy',cv=3, n_jobs=-1)
# print np.mean(scores)