In [72]:
from __future__ import division
import pandas as pd
import datetime
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, roc_curve, auc, mean_absolute_error, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, StratifiedKFold
from sklearn.model_selection import ShuffleSplit, learning_curve
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsRegressor
from sklearn.externals import joblib

import sys
sys.path.insert(0, '/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/data-analysis/utilities/')
from plot_learning_curve import plot_learning_curve
from clean_text import clean_text


pd.set_option('display.max_columns', 60)

### Import data

In [28]:
auctions = pd.read_pickle('./pickles/auctions.p')

In [29]:
concurrent_similar_median_start_price_series = pd.read_pickle('./feature-engineering-concurrent-similar-median-start-price/pickles/auctions_median_start_price_series.p')
concurrent_similar_median_start_price_series.name = 'concurrent_similar_median_start_price'

In [30]:
# Count NaN values
print 'Total Values:',concurrent_similar_median_start_price_series.shape[0]
print 'Non NaN Values:',concurrent_similar_median_start_price_series.count()
print 'NaN Values:',concurrent_similar_median_start_price_series.shape[0] - concurrent_similar_median_start_price_series.count()

Total Values: 29961
Non NaN Values: 22376
NaN Values: 7585


Impute missing values

In [31]:
# Replace missing concurrent median start prices with the median overall start price
concurrent_similar_median_start_price_series[concurrent_similar_median_start_price_series.isnull()] = np.median(auctions['startPrice'])

### Extract Features

In [32]:
# Use startPrice, title (vectorized), conditionDisplayName (one-hot-encoded)
auction_condition_dummies = pd.get_dummies(data=auctions['condition.conditionDisplayName'])

start_price_series = auctions['startPrice']

titles = auctions['title']

sold = auctions['sold_state']

condition_combined = auctions['conditionCombined']

end_price = auctions['endPrice']

start_time_series = auctions['listingInfo.startTime']

end_time_series = auctions['listingInfo.endTime']

feedback_percent_series = pd.Series(auctions['sellerInfo.positiveFeedbackPercent'], name='feedback_percent')

top_rated_seller_series = pd.Series(auctions['sellerInfo.topRatedSeller'].apply(lambda x: 1 if x==True else 0), name='top_rated_seller')

expedited_shipping_series = pd.Series(auctions['shippingInfo.expeditedShipping'], name='expedited_shipping')

shipping_cost_series = pd.Series(auctions['shippingInfo.shippingServiceCost.value'], name='shipping_cost')

handling_time_series = pd.Series(auctions['shippingInfo.handlingTime'], name='handling_time')

one_day_shipping_series = pd.Series(auctions['shippingInfo.oneDayShippingAvailable'], name='one_day_shipping')

one_day_shipping_series = pd.Series(auctions['shippingInfo.oneDayShippingAvailable'], name='one_day_shipping')

returns_accepted_series = pd.Series(auctions['returnsAccepted'], name='returns_accepted')



Clean text

In [33]:
clean_titles = []
for i,title in enumerate(titles.values):
    if (i+1)%5000==0:
        print 'cleaning #{} out of {} documents'.format(i+1,len(condition_combined))
    clean_titles.append(clean_text(title))

clean_conditions = []
for i,cond in enumerate(condition_combined):
    if (i+1)%5000==0:
        print 'cleaning #{} out of {} documents'.format(i+1,len(condition_combined))
    clean_conditions.append(clean_text(cond))

cleaning #5000 out of 29961 documents
cleaning #10000 out of 29961 documents
cleaning #15000 out of 29961 documents
cleaning #20000 out of 29961 documents
cleaning #25000 out of 29961 documents
cleaning #5000 out of 29961 documents
cleaning #10000 out of 29961 documents
cleaning #15000 out of 29961 documents
cleaning #20000 out of 29961 documents


  'Beautiful Soup.' % markup)


cleaning #25000 out of 29961 documents


In [34]:
vectorizer = TfidfVectorizer(ngram_range = (1,2),
                             min_df=5,
                             analyzer='word',
                             stop_words=None,
                             max_features=10000,
                            )

titles_matrix = vectorizer.fit_transform(clean_titles)

import operator
# print sorted(vectorizer.vocabulary_.items(), key=operator.itemgetter(1), reverse=True)[:5]

titles_df = pd.DataFrame(titles_matrix.todense(), columns=vectorizer.vocabulary_.keys())

In [35]:
vectorizer = CountVectorizer(ngram_range = (1,2),
                             min_df=30,
                             analyzer='word',
                             stop_words=None,
                             max_features=5000,
                            )

conditions_matrix = vectorizer.fit_transform(clean_conditions)

import operator
# print 'most common condition grams:',sorted(vectorizer.vocabulary_.items(), key=operator.itemgetter(1), reverse=True)[:5]
# print 'least common condition grams:',sorted(vectorizer.vocabulary_.items(), key=operator.itemgetter(1), reverse=False)[:5]

conditions_df = pd.DataFrame(conditions_matrix.todense(), columns=vectorizer.vocabulary_.keys())

In [36]:
del clean_titles
del clean_conditions
del conditions_matrix
del titles_matrix

## Create Processed DataFrame

**Create processed feature dataframe**

In [37]:
data_frames_to_keep = [titles_df,\
                       conditions_df,\
                       auction_condition_dummies,\
                       feedback_percent_series,\
                       top_rated_seller_series,\
                       expedited_shipping_series,\
                       shipping_cost_series,\
                       handling_time_series,\
                       one_day_shipping_series,\
                       returns_accepted_series,\
                       start_price_series,\
                       sold]

df_classification = pd.concat(data_frames_to_keep, axis=1)

In [39]:
# Impute missing values
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy='most_frequent', axis=0)
df_classification['shipping_cost'] = imputer.fit_transform(df_classification['shipping_cost'].values.reshape(-1,1))

In [58]:
df_classification['handling_time'] = imputer.fit_transform(df_classification['handling_time'].values.reshape(-1,1))

In [60]:
print 'Null Rows %:',np.sum(df_classification.isnull().sum())/df_classification.shape[0]

Null Rows %: 0.0


In [38]:
for dataframe in data_frames_to_keep:
    del dataframe

In [33]:
df_classification.head(2)

Unnamed: 0,z990 12,excellent cond,lumix digital,grip charger,medium format,270,275,kodak,af nikon,body read,cmos smart,package deal,card case,lens free,batteries strap,e6,slt a33,slt a35,zoom lense,fe fe,elph 170,fujifilm pro1,tz25,body working,12 megapixel,titanium,zoom mega,fuji fine,dsc f707,travel,...,bundle,photos hassle,scratches dents,cleaned,condition low,please read,shutter actuations,store returns,dent,operational,know,repair,warranty,ex,little use,For parts or not working,Manufacturer refurbished,New,New other (see details),Seller refurbished,Used,feedback_percent,top_rated_seller,expedited_shipping,shipping_cost,handling_time,one_day_shipping,returns_accepted,startPrice,sold_state
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,91.7,0,1,0.0,2.0,0,1,399.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,100.0,0,1,0.0,3.0,0,0,300.0,1


Save df_classification to a pickle

In [61]:
df_classification.to_pickle('./pickles/df_classification_tfidf.p')

# Import Data

Import classification data

In [62]:
df_classification = pd.read_pickle('./pickles/df_classification_tfidf.p')

In [63]:
df_classification.head(2)

Unnamed: 0,z990 12,excellent cond,lumix digital,grip charger,medium format,270,275,kodak,af nikon,body read,cmos smart,package deal,card case,lens free,batteries strap,e6,slt a33,slt a35,zoom lense,fe fe,elph 170,fujifilm pro1,tz25,body working,12 megapixel,titanium,zoom mega,fuji fine,dsc f707,travel,...,bundle,photos hassle,scratches dents,cleaned,condition low,please read,shutter actuations,store returns,dent,operational,know,repair,warranty,ex,little use,For parts or not working,Manufacturer refurbished,New,New other (see details),Seller refurbished,Used,feedback_percent,top_rated_seller,expedited_shipping,shipping_cost,handling_time,one_day_shipping,returns_accepted,startPrice,sold_state
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,91.7,0,1,0.0,2.0,0,1,399.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,100.0,0,1,0.0,3.0,0,0,300.0,1


import regression data

In [86]:
df_regression_all = pd.read_pickle('./pickles/df_regression_tfidf_all.p')

In [87]:
df_regression_all.head(2)

Unnamed: 0,z990 12,excellent cond,lumix digital,grip charger,medium format,270,275,kodak,af nikon,body read,cmos smart,package deal,card case,lens free,batteries strap,e6,slt a33,slt a35,zoom lense,fe fe,elph 170,fujifilm pro1,tz25,body working,12 megapixel,titanium,zoom mega,fuji fine,dsc f707,travel,...,scratches dents,cleaned,condition low,please read,shutter actuations,store returns,dent,operational,know,repair,warranty,ex,little use,For parts or not working,Manufacturer refurbished,New,New other (see details),Seller refurbished,Used,feedback_percent,top_rated_seller,expedited_shipping,shipping_cost,handling_time,one_day_shipping,returns_accepted,startPrice,concurrent_similar_median_start_price,sold_state,endPrice
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,91.7,0,1,0.0,2.0,0,1,399.0,92.5,0,399.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,100.0,0,1,0.0,3.0,0,0,300.0,92.5,1,369.0


In [130]:
df_regression = pd.read_pickle('./pickles/df_regression_tfidf.p')
df_regression.head(2)

Unnamed: 0,z990 12,excellent cond,lumix digital,grip charger,medium format,270,275,kodak,af nikon,body read,cmos smart,package deal,card case,lens free,batteries strap,e6,slt a33,slt a35,zoom lense,fe fe,elph 170,fujifilm pro1,tz25,body working,12 megapixel,titanium,zoom mega,fuji fine,dsc f707,travel,...,scratches dents,cleaned,condition low,please read,shutter actuations,store returns,dent,operational,know,repair,warranty,ex,little use,For parts or not working,Manufacturer refurbished,New,New other (see details),Seller refurbished,Used,feedback_percent,top_rated_seller,expedited_shipping,shipping_cost,handling_time,one_day_shipping,returns_accepted,startPrice,concurrent_similar_median_start_price,sold_state,endPrice
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,100.0,0,1,0.0,3.0,0,0,300.0,92.5,1,369.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,99.8,1,0,17.99,1.0,0,1,349.99,92.5,1,385.0


## Train Test Split

In [140]:
X_class = df_classification.ix[:, :-1].values
y_class = df_classification.ix[:, -1].values
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, random_state=42, test_size=0.2)

NameError: name 'df_classification' is not defined

In [147]:
X_reg = df_regression.ix[:, :-2].values # skip sold_state
y_reg = df_regression.ix[:, -1].values
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, random_state=42, test_size=0.2) 

In [71]:
del df_classification
del df_regression

## Learning Curve 

In [9]:
from sklearn.model_selection import learning_curve

In [None]:
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

estimator = RandomForestClassifier(n_estimators=50)

title = "Learning Curves (Random Forest Classifier)"

train_sizes, train_scores, test_scores = learning_curve(estimator=estimator, X=X_class, y=y_class, scoring='accuracy')

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.grid()

plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1,
                 color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
         label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
         label="Cross-validation score")

plt.legend(loc="best")

## Profitability Metric

### Random Forest

In [169]:
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1)

rfc.fit(X_train_class, y_train_class)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [118]:
print 'Baseline Accuracy:',accuracy_score(y_test_class, np.ones(y_test_class.shape[0]))
print 'Model accuracy:',accuracy_score(y_test_class, rfc.predict(X_test_class))

Baseline Accuracy: 0.857667278492
Model accuracy: 0.902719839813


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


In [None]:
# rfr = RandomForestRegressor(n_estimators=10, n_jobs=-1)

# rfr.fit(X_train_reg, y_train_reg)

In [148]:
knn = KNeighborsRegressor(n_neighbors=5, weights='distance')

knn.fit(X_train_reg, y_train_reg)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='distance')

In [None]:
plt.hist(y_train_reg, bins=50)

In [None]:
# rfr.predict(X_test_reg)
print 'Baseline  Absolute Error:'
print 'Baseline  Absolute Error:'

**Grab a subsample of 2000 rows**

In [131]:
df_test = df_regression.sample(n=2000) # sub sample with all sold_state

In [149]:
X_train_reg.shape

(20482, 7738)

In [150]:
df_test.iloc[0,:-2].shape

(7738,)

In [163]:
df_test.head(2)

Unnamed: 0,z990 12,excellent cond,lumix digital,grip charger,medium format,270,275,kodak,af nikon,body read,cmos smart,package deal,card case,lens free,batteries strap,e6,slt a33,slt a35,zoom lense,fe fe,elph 170,fujifilm pro1,tz25,body working,12 megapixel,titanium,zoom mega,fuji fine,dsc f707,travel,...,scratches dents,cleaned,condition low,please read,shutter actuations,store returns,dent,operational,know,repair,warranty,ex,little use,For parts or not working,Manufacturer refurbished,New,New other (see details),Seller refurbished,Used,feedback_percent,top_rated_seller,expedited_shipping,shipping_cost,handling_time,one_day_shipping,returns_accepted,startPrice,concurrent_similar_median_start_price,sold_state,endPrice
8837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,100.0,0,1,10.0,2.0,0,0,240.0,92.5,1,290.0
779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,100.0,0,1,0.0,3.0,0,0,70.0,92.5,1,122.5


**Calculate profitability metric **

In [165]:
prices = []
sold_probabilities = []
predicted_end_prices = []
# for i in range(df_test.shape[0]): # actual 
for i in range(3): # test
    
    # Grab vector
    item_vec = df_test.iloc[i]
    testStartPrice = item_vec.loc['startPrice']
    testEndPrice = item_vec.loc['endPrice']
    testSoldState = item_vec.loc['sold_state']
    
    # Set up parameters
    minStartPrice = 0
    maxStartPrice = testStartPrice * 2
    priceStepSize = testStartPrice / 5 
    
    max_expected_price = (0,0,0,0,0)
    # Loop through all start prices 
    for price_index,price in enumerate(np.arange(minStartPrice, maxStartPrice, priceStepSize)):
        # Grab test start price 
        item_vec.loc['startPrice'] = price
        
        # Classification        
        test_vec = item_vec.iloc[:-3].values.reshape(1,-1) # :-3 don't include CSM_start_price, sold_state, end_price
        sold_proba = rfc.predict_proba(test_vec)[0][1]
        sold_probabilities.append( (price, sold_proba) )
        
        # Regression
        test_vec = item_vec.iloc[:-2].values.reshape(1,-1) # don't include sold_state, end_price
        predicted_end_price = knn.predict(test_vec)[0]
        predicted_end_prices.append( (price, predicted_end_price) )
        
        # Find Optimal End Price
        if sold_proba > 0.5:
            expected_price = sold_probabilities[price_index][1] * predicted_end_prices[price_index][1] # Sold Probability * Predicted End Price
            if expected_price > max_expected_price[len(max_expected_price)-3]:
                max_expected_price = (price_index,\
                                      price,\
                                      sold_probabilities[price_index][1],\
                                      predicted_end_prices[price_index][1],\
                                      expected_price,\
                                      testSoldState,\
                                      testEndPrice)
    
    # If after looping through all potential start prices, there is no sold_proa > 0.5, then 
    if max_expected_price == (0,0,0,0,0):
        max_expected_price = (0, 0, 0, 0, 0, testSoldState, testEndPrice)
    
    prices.append(max_expected_price)

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]

In [None]:
# print 'Optimal Predicted End Price:${}, \
#         Optimal Start Price:${}, \
#         Chance of Selling:{}, \
#         Expected Profit:${}'.format(max_expected_price[3],\
#                                     max_expected_price[1],\
#                                     max_expected_price[2],\
#                                     max_expected_price[4])

Use prices list of tuples to create average_profit_lift 

In [166]:
prices

[(9,
  432.0,
  0.69999999999999996,
  507.55711599617786,
  355.28998119732449,
  1.0,
  290.0),
 (9,
  126.0,
  0.69999999999999996,
  507.55711599617786,
  355.28998119732449,
  1.0,
  122.5),
 (9,
  38.699999999999996,
  0.69999999999999996,
  507.55711599617786,
  355.28998119732449,
  1.0,
  21.5)]

## Grid Search

In [14]:
RandomForestClassifier()

rfc_param_grid = {
    'n_estimators':[100],
    'min_samples_split':[2, 5, 10],
    'max_features':['auto',0.3,0.5],
    'min_samples_leaf':[1,3,5]
}

gs = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rfc_param_grid, scoring='recall',n_jobs=-1,verbose=100)

gs.fit(X_class, y_class)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Memmaping (shape=(29749, 7737), dtype=float64) to new file /var/folders/90/m6j358k94lxbhz0y6yl5rd640000gp/T/joblib_memmaping_pool_11173_4564184464/11173-4564163024-0783137f3d0ee86f4888b0659d6d8145.pkl
[CV] max_features=auto, min_samples_split=2, n_estimators=100, min_samples_leaf=1 
Pickling array (shape=(29749,), dtype=int64).
Pickling array (shape=(19832,), dtype=int64).
Pickling array (shape=(9917,), dtype=int64).
[CV] max_features=auto, min_samples_split=2, n_estimators=100, min_samples_leaf=1 
Memmaping (shape=(29749, 7737), dtype=float64) to old file /var/folders/90/m6j358k94lxbhz0y6yl5rd640000gp/T/joblib_memmaping_pool_11173_4564184464/11173-4564163024-0783137f3d0ee86f4888b0659d6d8145.pkl
Pickling array (shape=(29749,), dtype=int64).
Pickling array (shape=(19833,), dtype=int64).
Pickling array (shape=(9916,), dtype=int64).
[CV] max_features=auto, min_samples_split=2, n_estimators=100, min_samples_leaf=1 
Memmaping (sha

KeyboardInterrupt: 

In [None]:
print 'Best Estimator:',gs.best_estimator_
print 'Best Score:',gs.best_score_
print 'Best Parameters:',gs.best_params_

In [None]:
joblib.dump(gs.best_estimator_, './model_pickles/gs_random_forest.pkl') 

### Logistic Regression

### KNN

### SVM

** Model **

In [76]:
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1)

rfc.fit(X_train_class, y_train_class)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [63]:
print 'top 5 features:'
feature_importances = sorted(zip(df_classification.columns.tolist()[:-1], rfc.feature_importances_), key=lambda x: x[1], reverse=True)
sorted(zip(df_classification.columns.tolist()[:-1], rfc.feature_importances_), key=lambda x: x[1], reverse=True)[:5]

top 5 features:


[('startPrice', 0.091041724275354508),
 ('concurrent_similar_median_start_price', 0.072779250151591424),
 (u'scratches body', 0.0065584151677305557),
 (u'50mm prime', 0.0048426325250845821),
 (u'255', 0.0041375394373774389)]

# Score Model

Baseline 

In [71]:
from sklearn.metrics import accuracy_score
# calculate baseline
y_true = y_train_class
y_pred = np.ones(y_true.shape[0])
print 'Baseline accuracy:',accuracy_score(y_true, y_pred)

Baseline accuracy: 0.853763351135


Cross val score

In [69]:
# scores = cross_val_score(estimator=RandomForestClassifier(n_estimators=100, n_jobs=-1), X=X_class, y=y_class, scoring='accuracy',cv=3, n_jobs=-1)
# print np.mean(scores)