In [1]:
from __future__ import division
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime
import numpy as np


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_columns', 60)

Helper Fuctions

In [2]:
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

def clean_text(doc, remove_stop_words=True, remove_digits=False, remove_punc=True, stem=False):
    
    # 1. Remove any HTML markup
    text = BeautifulSoup(doc).get_text()  
    
    # 2. Extract special negator like n't
    text = re.sub('n\'t', ' not', text)
    
    # 3. remove punctuation(except .-)
    if remove_punc:
        text = re.sub('[^a-zA-Z.\-\d]', ' ', text)
        
    if remove_digits:
        text = re.sub('[.\d]', ' ', text)
        
    # 4. Convert to lower case 
    text = text.lower()
        
    # 5. Remove stop words
    if remove_stop_words:
        stops = set(stopwords.words("english"))
        text = [w for w in text.split(' ') if not w in stops]
        text = ' '.join(text)
                
    # 6. apply Porter Stemming
    # probably don't need this
    if stem:
        stemmer = PorterStemmer()
        stemmer = LancasterStemmer()
        text = [stemmer.stem(w) for w in text.split(' ')]
        text = ' '.join(text)
        
    # 7. Remove extra white space
    text = re.sub(' +',' ', text)
        
    return text

**Import data**

In [4]:
auctions = pd.read_pickle('../pickles/auctions.p')

Select features from which similarity will be calcualted  

In [5]:
# Select Features from which similarity will be calcualted 
title_series = auctions['title']
condition_id_series = auctions['condition.conditionId']

Take start/end times and start price features

In [6]:
# Necessary Features
start_price_series = auctions['startPrice']
start_time_series = auctions['listingInfo.startTime']
end_time_series = auctions['listingInfo.endTime']

Clean Text Features 

In [7]:
clean_titles = []
for i,title in enumerate(title_series.values):
    if (i+1)%5000==0:
        print 'cleaning #{} out of {} documents'.format(i+1,len(title_series))
    clean_titles.append(clean_text(title))



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


cleaning #5000 out of 29961 documents
cleaning #10000 out of 29961 documents
cleaning #15000 out of 29961 documents
cleaning #20000 out of 29961 documents
cleaning #25000 out of 29961 documents


In [8]:
clean_conditions = []
for i,cond in enumerate(condition_combined):
    if (i+1)%5000==0:
        print 'cleaning #{} out of {} documents'.format(i+1,len(condition_combined))
    clean_conditions.append(clean_text(cond))

NameError: name 'condition_combined' is not defined

**Vectorize text features**

In [None]:
vectorizer = TfidfVectorizer(ngram_range = (1,2),
                             min_df=10,
                             analyzer='word',
                             stop_words=None,
                             max_features=10000,
                            )

titles_matrix = vectorizer.fit_transform(clean_titles)
titles_df = pd.DataFrame(titles_matrix.todense(), columns=vectorizer.vocabulary_.keys())

In [None]:
vectorizer = CountVectorizer(ngram_range = (1,2),
                             min_df=30,
                             analyzer='word',
                             stop_words=None,
                             max_features=5000,
                            )

conditions_matrix = vectorizer.fit_transform(clean_conditions)
conditions_df = pd.DataFrame(conditions_matrix.todense(), columns=vectorizer.vocabulary_.keys())

In [None]:
del clean_titles
del clean_conditions
del titles_matrix
del conditions_matrix

**Create new dataframe from pre-processed features**

In [None]:
# data_frames_to_keep = [titles_df, conditions_df, start_time_series, end_time_series, start_price_series]
data_frames_to_keep = [titles_df, condition_id_series, start_time_series, end_time_series, start_price_series]
auctions = pd.concat(data_frames_to_keep, axis=1)

In [None]:
# del title_series
# del condition_series
# del start_price_series
# del start_time_series
# del end_time_series

**Preprocessing imported dataframe**

In [None]:
auctions['medianConcurrentStartPrice'] = np.nan
auctions['condition.conditionId'] = auctions['condition.conditionId']/1000

In [9]:
# Sort dataframe by startTime DESC
auctions.sort_values(by='listingInfo.startTime', inplace=True)

In [10]:
auctions.reset_index(inplace=True)

In [28]:
# auctions.to_pickle('./pickles/pre_processed_auctions.p')

In [13]:
auctions = pd.read_pickle('./pickles/pre_processed_auctions.p')

In [12]:
auctions_original = pd.read_pickle('../pickles/auctions.p')

# Development 

**Find Concurrent Listings**

In [15]:
# Set threshold for similarity, as well as minimum number of similar items
top_n_items = 5
min_sim_score = 0.95
threshold_limit = 0.95

step_size = -0.01

In [40]:
# Sample listing 
sample_index = 5000
current_listing = auctions.iloc[sample_index] 
current_listing_index = auctions[auctions['index']==current_listing['index']].index[0]
print current_listing_index
listing_st = auctions.loc[sample_index,'listingInfo.startTime']

5000


In [41]:
# subset dataframe
auctions_subset = auctions.iloc[:current_listing_index]

In [42]:
print auctions_subset.shape

(5000, 3899)


In [43]:
# Find concurrent listings
concurrent_listings_df = auctions_subset[auctions_subset.apply(lambda x: listing_st<x['listingInfo.endTime'], axis=1)]

In [44]:
print concurrent_listings_df.shape

(2283, 3899)


In [45]:
# current_listing.iloc[1:-4]

In [46]:
# concurrent_listings_df.iloc[:, 1:-4]

In [47]:
# Calculate the similarity between current listing 
current_listing_vec = current_listing.iloc[1:-4].values.reshape(1,-1)
concurrent_listings_matrix = concurrent_listings_df.iloc[:, 1:-4].values

In [48]:
print current_listing_vec

[[0.0 0.0 0.0 ..., 0.0 0.0 3.0]]


In [49]:
# calculate similarity scores 
cos_sim_matrix = cosine_similarity(current_listing_vec, concurrent_listings_matrix)

In [50]:
print cos_sim_matrix.shape

(1, 2283)


In [51]:
# Insert similarity score 
concurrent_listings_df.insert(loc=concurrent_listings_df.shape[1]-1, column='similarity_score', value=cos_sim_matrix.reshape(-1,1))

In [52]:
print concurrent_listings_df.shape

(2283, 3900)


In [53]:
# del concurrent_similar_listings_df

### Similarity Score Filter V2

In [54]:
concurrent_similar_listings_df = concurrent_listings_df[concurrent_listings_df['similarity_score']>min_sim_score]
# concurrent_similar_listings_df = concurrent_similar_listings_df\
#                                 .sort_values('similarity_score', ascending=False)\
#                                 .head(top_n_items)
concurrent_similar_listings_df = concurrent_similar_listings_df.sort_values(by='similarity_score',ascending=False).head(top_n_items)
# print concurrent_similar_listings_df['similarity_score']        

# print concurrent_similar_listings_df.shape


Validation

In [55]:
print 'original listing:'
print auctions_original.loc[current_listing['index'],['title','startPrice']]

original listing:
title         Nikon D5100 16.2 MP Kit w/ AF-S VR 18-55mm Len...
startPrice                                                  199
Name: 23521, dtype: object


In [56]:
concurrent_similar_listings_df['startPrice']

4517    200.00
4960      0.99
1772    100.00
3060    150.00
Name: startPrice, dtype: float64

In [57]:
print 'Similar Concurrent Listing MEDIAN Start Price:',np.median(concurrent_similar_listings_df['startPrice'])

Similar Concurrent Listing MEDIAN Start Price: 125.0


In [58]:
print 'Similar Concurrent Listing MEAN Start Price:',np.mean(concurrent_similar_listings_df['startPrice'])

Similar Concurrent Listing MEAN Start Price: 112.7475


In [35]:
# print 'concurrent similar listings indeces:',concurrent_similar_listings_df['index']
print 'FILTERED ITEMS\n\n'
for i,index in enumerate(concurrent_similar_listings_df['index']):
    print 'item #{} out of {}'.format(i+1,len(concurrent_similar_listings_df.index))
    print 'similarity score:',concurrent_similar_listings_df['similarity_score'].iloc[i]
    print auctions_original.loc[index, ['title','condition.conditionId','startPrice']], '\n'

FILTERED ITEMS


item #1 out of 2
similarity score: 0.972562318423
title                    Nikon D3200 24.2MP DSLR Camera (VR 18-55mm + 5...
condition.conditionId                                                 3000
startPrice                                                             300
Name: 9235, dtype: object 

item #2 out of 2
similarity score: 0.950597369266
title                    Nikon D D3200 24.2 MP Digital SLR Camera - Bla...
condition.conditionId                                                 7000
startPrice                                                             149
Name: 3841, dtype: object 



In [54]:
print 'ALL CONCURRENT SIMILAR ITEMS\n\n'
concurrent_similar_listings_df = concurrent_listings_df\
                                .sort_values('similarity_score', ascending=False)\
                                .head(top_n_items)
for i,index in enumerate(concurrent_similar_listings_df['index']):
    print 'item #{} out of {}'.format(i+1,len(concurrent_similar_listings_df.index))
    print 'similarity score:',concurrent_similar_listings_df['similarity_score'].iloc[i]
    print auctions_original.loc[index, ['title','condition.conditionId','startPrice']], '\n'

ALL CONCURRENT SIMILAR ITEMS


item #1 out of 5
similarity score: 0.972562318423
title                    Nikon D3200 24.2MP DSLR Camera (VR 18-55mm + 5...
condition.conditionId                                                 3000
startPrice                                                             300
Name: 9235, dtype: object 

item #2 out of 5
similarity score: 0.950597369266
title                    Nikon D D3200 24.2 MP Digital SLR Camera - Bla...
condition.conditionId                                                 7000
startPrice                                                             149
Name: 3841, dtype: object 

item #3 out of 5
similarity score: 0.9490231746
title                    Nikon D3200 24.2 MP Digital SLR Camera, black ...
condition.conditionId                                                 7000
startPrice                                                           49.99
Name: 982, dtype: object 

item #4 out of 5
similarity score: 0.946964684569
title        

### Similarity Score filter V1

In [85]:
# Find top n most similar items
maxSimScore = max(concurrent_listings_df['similarity_score'])
for i,threshold in enumerate(np.arange(1, threshold_limit, step_size)):
    # Filter for similar items
    numSimilarListings = concurrent_listings_df[concurrent_listings_df['similarity_score']>maxSimScore*threshold].shape[0]
    
    print 'threshold:',threshold
    print 'numSimilarListings:',numSimilarListings
    
    if numSimilarListings >= top_n_items:  
        concurrent_similar_listings_df = concurrent_listings_df[concurrent_listings_df['similarity_score']>maxSimScore*threshold]\
                                        .sort_values(by='similarity_score', ascending=False)\
                                        .head(top_n_items) # take n most similar items
        medianStartPrice = np.median(concurrent_similar_listings_df['startPrice'])
        break

if numSimilarListings < top_n_items:
    print 'not enough similar concurrent listings.'
    concurrent_similar_listings_df = concurrent_listings_df[concurrent_listings_df['similarity_score']>maxSimScore*threshold]\
                                .sort_values(by='similarity_score', ascending=False)\
                                .head(top_n_items) # take n most similar items
    medianStartPrice = np.median(concurrent_similar_listings_df['startPrice'])         
    
print medianStartPrice

threshold: 1.0
numSimilarListings: 0
threshold: 0.99
numSimilarListings: 90
25.0


Validation

In [163]:
print 'threshold:',threshold
print 'numSimilarListings:',numSimilarListings

threshold: 0.99
numSimilarListings: 90


In [164]:
print 'original listing:'
print auctions_original.loc[current_listing['index'],['title','startPrice']]

original listing:
title         Samsung WB350F 16.2MP CMOS Smart WiFi & NFC Di...
startPrice                                                    5
Name: 29904, dtype: object


In [165]:
# print 'concurrent similar listings indeces:',concurrent_similar_listings_df['index']
for i,index in enumerate(concurrent_similar_listings_df['index']):
    print 'item #{} out of {}'.format(i+1,len(concurrent_similar_listings_df.index))
    print 'similarity score:',concurrent_similar_listings_df['similarity_score'].iloc[i]
    print auctions_original.loc[index, ['title','condition.conditionId','startPrice']], '\n'

In [166]:
# Set median to dataframe
# auctions.loc[sample_index,'medianConcurrentStartPrice'] = medianStartPrice

# Completed Function

In [59]:
def filter_fn(listing, avg_type='median'):
    top_n_items = 5
    min_sim_score = 0.95 

    current_listing = listing
    listing_st = current_listing['listingInfo.startTime']
    current_listing_index = auctions[auctions['index']==current_listing['index']].index[0]
    if current_listing_index == 0:
        return np.nan
    
    if current_listing_index % 100 == 0:
        print 'Calculating concurrent similar median start price for #{} listing out of {}'.format(current_listing_index,auctions_original.shape[0])
    
    # subset dataframe to look at only items that have a startTime < current listing start time
    # NOTE!!!! This is only halfway efficient! for every listing, we need to scan the entire subset, which can get pretty large
    # the max(0, curr - 4000) is just a cheap way of making this more efficient, 
    # 4000 because that should ensure we're only looking at auctions which have a end time BEFORE the current listing start time
    auctions_subset = auctions.iloc[max(0,current_listing_index-3500):current_listing_index] # this works because auctions is sorted by startTime with the first item as the least recent 
    
    # Find concurrent listings - 40 ms
    concurrent_listings_df = auctions_subset[auctions_subset['listingInfo.endTime']\
                                             .apply(lambda sub_listing_et: listing_st<sub_listing_et)]
    
    
    # Calculate cosine similarity between original listing and concurrent listings - 70 ms
    cos_sim_matrix = cosine_similarity(current_listing.iloc[1:-4].values.reshape(1,-1), concurrent_listings_df.iloc[:, 1:-4])
    
    # Insert similarity score so we can filter for listings above a certain similarity score thresold - 1 ms
    concurrent_listings_df.insert(loc=concurrent_listings_df.shape[1]-1, column='similarity_score', value=cos_sim_matrix.reshape(-1,1))
    

    # Find top n most similar items
    # Filter for listings that have a similarity score of at least min_sim_score  - 200 ms
    concurrent_similar_listings_df = concurrent_listings_df[concurrent_listings_df['similarity_score']>min_sim_score]
    # Filter for the top n MOST similar listings of those similar listings
    concurrent_similar_listings_df = concurrent_similar_listings_df.sort_values(by='similarity_score',ascending=False).head(top_n_items)
        
    if avg_type == 'median':
        try:
            avgStartPrice = np.median(concurrent_similar_listings_df['startPrice'])
        except:
            avgStartPrice = np.nan
    elif avg_type == 'mean':
        try:
            avgStartPrice = np.mean(concurrent_similar_listings_df['startPrice'])
        except:
            avgStartPrice = np.nan
        
    return avgStartPrice
                             
   

Test

In [69]:
np.mean(auctions.iloc[:200].apply(filter_fn, axis=1, args=('median',)))

Calculating concurrent similar median start price for #100 listing out of 29961
Calculating concurrent similar median start price for #200 listing out of 29961
Calculating concurrent similar median start price for #300 listing out of 29961
Calculating concurrent similar median start price for #400 listing out of 29961


136.83179878048782

**Apply Function and create pickle with new feature**

In [72]:
# Select average type
avg_type = 'mean'

In [73]:
auctions_median_start_price_series = auctions.apply(filter_fn, axis=1, args=(avg_type,))

Calculating concurrent similar median start price for #100 listing out of 29961
Calculating concurrent similar median start price for #200 listing out of 29961
Calculating concurrent similar median start price for #300 listing out of 29961
Calculating concurrent similar median start price for #400 listing out of 29961
Calculating concurrent similar median start price for #500 listing out of 29961
Calculating concurrent similar median start price for #600 listing out of 29961
Calculating concurrent similar median start price for #700 listing out of 29961
Calculating concurrent similar median start price for #800 listing out of 29961
Calculating concurrent similar median start price for #900 listing out of 29961
Calculating concurrent similar median start price for #1000 listing out of 29961
Calculating concurrent similar median start price for #1100 listing out of 29961
Calculating concurrent similar median start price for #1200 listing out of 29961
Calculating concurrent similar median

KeyboardInterrupt: 

In [None]:
auctions_median_start_price_series.to_pickle('/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/data-analysis/feature-engineering-concurrent-similar-median-start-price/pickles/auctions_median_start_price_series_{}.p'.format(avg_type))