In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Let pandas display up to 60 columns before it starts eliding the middle of wide frames.
pd.set_option('display.max_columns', 60)

Helper Functions

In [5]:
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

# Cache for the NLTK stop-word list so it is read from the corpus once, not once per document.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first use only."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE['english']


def clean_text(doc, remove_stop_words=True, remove_digits=False, remove_punc=True, stem=False):
    """Turn a raw (possibly HTML) document into a lower-cased, single-spaced string.

    Parameters
    ----------
    doc : str
        Raw text or HTML markup.
    remove_stop_words : bool
        Drop NLTK English stop words (applied after lower-casing).
    remove_digits : bool
        Replace every digit and every '.' with a space.
    remove_punc : bool
        Replace every character that is not a letter, a digit, '.' or '-' with a space.
    stem : bool
        Stem each remaining token with the Lancaster stemmer.

    Returns
    -------
    str
        The cleaned text with runs of spaces collapsed to a single space.
    """
    # 1. Remove any HTML markup. The parser is named explicitly so the result does not
    #    depend on which optional parser (lxml, html5lib) happens to be installed.
    text = BeautifulSoup(doc, 'html.parser').get_text()

    # 2. Expand the contracted negator "n't" before punctuation is stripped
    text = re.sub(r"n't", ' not', text)

    # 3. Remove punctuation (except . and -)
    if remove_punc:
        text = re.sub(r'[^a-zA-Z.\-\d]', ' ', text)

    if remove_digits:
        text = re.sub(r'[.\d]', ' ', text)

    # 4. Convert to lower case
    text = text.lower()

    # 5. Remove stop words
    if remove_stop_words:
        stops = _english_stopwords()
        text = ' '.join(w for w in text.split(' ') if w not in stops)

    # 6. Apply Lancaster stemming. The original also built a PorterStemmer here but
    #    overwrote it on the next line, so Lancaster is the stemmer that actually ran.
    if stem:
        stemmer = LancasterStemmer()
        text = ' '.join(stemmer.stem(w) for w in text.split(' '))

    # 7. Remove extra white space
    text = re.sub(' +', ' ', text)

    return text

Import data

In [53]:
# Load the pickled auctions table from a path relative to the notebook's working directory.
# NOTE(review): unpickling can execute code embedded in the file — only load pickles from a trusted source.
auctions = pd.read_pickle('./pickles/auctions.p')

Select features from which similarity will be calculated

In [56]:
# Select the features from which similarity will be calculated:
# the listing title text and the condition id.
title_series = auctions['title']
condition_id_series = auctions['condition.conditionId']

Take start/end times and start price features

In [23]:
# Necessary features: these are not used for similarity, but are needed to decide which
# listings overlap in time and to compute the median start price.
start_price_series = auctions['startPrice']
start_time_series = auctions['listingInfo.startTime']
end_time_series = auctions['listingInfo.endTime']

Clean Text Features 

In [65]:
# Clean every listing title, reporting progress every 5000 documents.
# The total comes from title_series itself; the original read len(condition_combined),
# a name unrelated to titles that is not defined at this point in the notebook.
clean_titles = []
for i, title in enumerate(title_series.values):
    if (i + 1) % 5000 == 0:
        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print('cleaning #{} out of {} documents'.format(i + 1, len(title_series)))
    clean_titles.append(clean_text(title))

cleaning #5000 out of 29961 documents
cleaning #10000 out of 29961 documents
cleaning #15000 out of 29961 documents
cleaning #20000 out of 29961 documents
cleaning #25000 out of 29961 documents


In [25]:
# Clean every condition description, reporting progress every 5000 documents.
# NOTE(review): `condition_combined` is not assigned anywhere in the visible notebook, so this
# cell only runs when the name is left over from an earlier kernel session. Define it
# (presumably the per-listing condition text — confirm) before this cell.
clean_conditions = []
for i, cond in enumerate(condition_combined):
    if (i + 1) % 5000 == 0:
        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print('cleaning #{} out of {} documents'.format(i + 1, len(condition_combined)))
    clean_conditions.append(clean_text(cond))

cleaning #5000 out of 29961 documents
cleaning #10000 out of 29961 documents
cleaning #15000 out of 29961 documents
cleaning #20000 out of 29961 documents
cleaning #25000 out of 29961 documents


**Vectorize text features**

In [66]:
# TF-IDF over title unigrams and bigrams; keep n-grams seen in at least 10 titles,
# capped at the 10000 most frequent.
vectorizer = TfidfVectorizer(ngram_range = (1,2),
                             min_df=10,
                             analyzer='word',
                             stop_words=None,
                             max_features=10000,
                            )

titles_matrix = vectorizer.fit_transform(clean_titles)

# vocabulary_ maps each n-gram to its column index, and dict key order is unrelated to that
# index. Sort the terms by index so every column label names the column it sits on.
title_terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Reuse title_series' index (clean_titles was built from it in order) so that a later
# pd.concat(axis=1) with Series taken from `auctions` lines rows up by label.
titles_df = pd.DataFrame(titles_matrix.toarray(), columns=title_terms, index=title_series.index)

In [29]:
# Raw counts over condition-text unigrams and bigrams; keep n-grams seen in at least 30
# documents, capped at the 5000 most frequent. Note this rebinds `vectorizer`.
vectorizer = CountVectorizer(ngram_range = (1,2),
                             min_df=30,
                             analyzer='word',
                             stop_words=None,
                             max_features=5000,
                            )

conditions_matrix = vectorizer.fit_transform(clean_conditions)

# vocabulary_ maps each n-gram to its column index, and dict key order is unrelated to that
# index. Sort the terms by index so every column label names the column it sits on.
condition_terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
conditions_df = pd.DataFrame(conditions_matrix.toarray(), columns=condition_terms)

most common condition grams: [(u'zoom', 772), (u'years', 771), (u'year', 770), (u'wrong', 769), (u'wrist strap', 768)]
least common condition grams: [(u'000', 0), (u'10', 1), (u'100', 2), (u'12', 3), (u'14', 4)]


In [30]:
# Release the large text/matrix intermediates; only the DataFrames built from them are needed from here on.
del clean_titles, clean_conditions, conditions_matrix, titles_matrix

**Create new dataframe from pre-processed features**

In [67]:
# data_frames_to_keep = [titles_df, conditions_df, start_time_series, end_time_series, start_price_series]
# Rebuild `auctions` from the pre-processed pieces: the title n-gram columns first, then the
# condition id, start time, end time and start price as the last four columns.
# NOTE(review): concat(axis=1) lines rows up by index label, not by position — confirm that
# titles_df carries the same index as the Series taken from the original `auctions`.
data_frames_to_keep = [titles_df, condition_id_series, start_time_series, end_time_series, start_price_series]
auctions = pd.concat(data_frames_to_keep, axis=1)

In [58]:
# del title_series
# del condition_series
# del start_price_series
# del start_time_series
# del end_time_series

**Preprocessing imported dataframe**

Create medianConcurrentStartPrice column

In [128]:
# Add the target column as the last column, filled with NaN until a median is computed for a row.
auctions['medianConcurrentStartPrice'] = np.nan

In [59]:
# Sort dataframe by startTime, ascending (the sort_values default), in place:
# earlier-starting listings come first.
auctions.sort_values(by='listingInfo.startTime', inplace=True)

In [138]:
# Move the existing index into a regular column (inserted as the first column) and replace
# it with 0..n-1 in the current row order.
# NOTE(review): not idempotent — every re-run of this cell adds another index column.
auctions.reset_index(inplace=True)

**Find Concurrent Listings**

In [101]:
# Set threshold for similarity, as well as minimum number of similar items
# top_n_items: how many similar concurrent listings are required before a median is taken.
top_n_items = 5
# NOTE(review): the top-n search cell below steps through its own range of ratios and does not
# start from this value — confirm whether `threshold` is still needed here.
threshold = 0.999999

In [82]:
# Sample listing: pick one row by position to prototype the concurrent-listing search on.
sample_index = 1000
current_listing = auctions.iloc[sample_index]
# Read the start time from the row just selected. The original looked it up by index *label*
# (auctions[col][sample_index]), which is only the same row when labels equal positions.
listing_st = current_listing['listingInfo.startTime']

In [61]:
# subset dataframe: keep only the rows positioned before the sample listing
# (the sample row itself is excluded by the half-open slice).
auctions_subset = auctions.iloc[:sample_index]

In [93]:
# Find concurrent listings: rows whose end time is after the sample listing's start time.
# A vectorised comparison replaces the row-wise apply (same mask, far less overhead), and
# .copy() makes the result an independent frame so columns can be added to it safely.
still_open = auctions_subset['listingInfo.endTime'] > listing_st
concurrent_listings_df = auctions_subset[still_open].copy()

In [94]:
# Calculate the cosine similarity between the current listing and every concurrent listing.
# Feature columns are chosen by name. The original positional slices disagreed with each other
# (`iloc[1:-3]` on the row vs `iloc[1:, :-3]` on the frame, which drops the first *row* and keeps
# the first column) and go stale whenever a bookkeeping column is added or removed.
non_feature_columns = ['index', 'listingInfo.startTime', 'listingInfo.endTime',
                       'startPrice', 'medianConcurrentStartPrice', 'similarity_score']
feature_columns = concurrent_listings_df.columns[~concurrent_listings_df.columns.isin(non_feature_columns)]

current_listing_vec = current_listing[feature_columns].values.astype(float).reshape(1,-1)
concurrent_listings_matrix = concurrent_listings_df[feature_columns].values.astype(float)
cos_sim_matrix = cosine_similarity(current_listing_vec, concurrent_listings_matrix)

# Work on a copy and drop any score column from a previous run so the cell can be re-executed.
# The score keeps its original slot (second to last column).
concurrent_listings_df = concurrent_listings_df.copy()
if 'similarity_score' in concurrent_listings_df.columns:
    del concurrent_listings_df['similarity_score']
concurrent_listings_df.insert(loc=concurrent_listings_df.shape[1]-1, column='similarity_score', value=cos_sim_matrix.ravel())

In [124]:
# Find top n most similar items: relax the cut-off (as a fraction of the best score) step by
# step until at least top_n_items listings clear it.
maxSimScore = concurrent_listings_df['similarity_score'].max()

# Start from an empty selection so the name is always defined (and never left over from an
# earlier run) when no cut-off in the range yields enough listings.
concurrent_similar_listings_df = concurrent_listings_df.iloc[0:0]

# The loop variable has its own name so it no longer overwrites the global `threshold`.
for similarity_ratio in np.arange(1, 0.9999000, -0.0000005):
    # Filter for similar items
    is_similar = concurrent_listings_df['similarity_score'] > maxSimScore*similarity_ratio
    numSimilarListings = int(is_similar.sum())
    if numSimilarListings >= top_n_items:
        concurrent_similar_listings_df = concurrent_listings_df[is_similar]
        break

In [125]:
# Find median start price of the similar, concurrent listings selected above.
medianStartPrice = np.median(concurrent_similar_listings_df['startPrice'])

In [133]:
# Set median to dataframe
# auctions.loc[sample_index,'medianConcurrentStartPrice'] = medianStartPrice

In [None]:
def filter_fn(listing):
    """Median start price of the listings most similar to `listing` that were still open when it started.

    Reads the module-level `auctions` frame. Candidates are the rows positioned before
    `listing` in `auctions` whose end time is after `listing`'s start time.
    NOTE(review): this assumes `auctions` is sorted by 'listingInfo.startTime' ascending, so
    that "positioned before" means "started earlier" — confirm against the sort cell.

    Parameters
    ----------
    listing : pandas.Series
        One row of `auctions`, as passed by `auctions.apply(filter_fn, axis=1)`.

    Returns
    -------
    float
        Median 'startPrice' of the selected listings, or NaN when there are no concurrent
        listings or fewer than `top_n_items` of them clear the similarity cut-off.
    """
    top_n_items = 5
    threshold = 0.90
    # Bookkeeping columns that must not take part in the cosine similarity. The original sliced
    # `[:-4]`, which left the 'index' column (added by reset_index) among the features.
    non_feature_columns = ['index', 'listingInfo.startTime', 'listingInfo.endTime',
                           'startPrice', 'medianConcurrentStartPrice', 'similarity_score']

    listing_st = listing['listingInfo.startTime']

    # Row position of this listing in `auctions`. The original used listing['index'], which
    # holds the index label from before the reset, not the position in the current row order.
    current_listing_position = auctions.index.get_loc(listing.name)

    # subset dataframe
    auctions_subset = auctions.iloc[:current_listing_position]

    # Find concurrent listings (vectorised; no row-wise apply)
    concurrent_listings_df = auctions_subset[auctions_subset['listingInfo.endTime'] > listing_st]
    if concurrent_listings_df.empty:
        # Nothing to compare against; cosine_similarity would raise on an empty matrix.
        return np.nan

    # Calculate the similarity between the current listing and each concurrent listing
    feature_mask = ~auctions.columns.isin(non_feature_columns)
    current_listing_vec = listing[feature_mask].values.astype(float).reshape(1,-1)
    concurrent_listings_matrix = concurrent_listings_df.loc[:, feature_mask].values.astype(float)
    cos_sim_matrix = cosine_similarity(current_listing_vec, concurrent_listings_matrix)

    # Keep the scores in their own Series, not inserted into a slice of `auctions`.
    similarity_scores = pd.Series(cos_sim_matrix.ravel(), index=concurrent_listings_df.index)

    # Find top n most similar items: relax the cut-off (a fraction of the best score) from 1
    # towards `threshold` in steps of 0.05 until enough listings clear it.
    maxSimScore = similarity_scores.max()
    for similarity_ratio in np.arange(1, threshold, -0.05):
        is_similar = similarity_scores > maxSimScore*similarity_ratio
        if is_similar.sum() >= top_n_items:
            return np.median(concurrent_listings_df.loc[is_similar, 'startPrice'])

    # No cut-off produced enough similar listings. The original reached this result through a
    # bare `except` around an unbound variable.
    return np.nan
                             
                   

# Run filter_fn row by row over positions 1000-5999; the resulting Series is only displayed,
# it is not assigned back to `auctions`.
auctions.iloc[1000:6000].apply(filter_fn, axis=1)

In [None]:
    # Vrushank's Code
#     min_overlap = pd.to_timedelta('{} hour'.format(hours_overlap))
#     overlapping_listings = auctions.apply(lambda comp_list: 
#                                           min(comp_list['listingInfo.endTime'], listing['listingInfo.endTime']) - \
#                                           max(comp_list['listingInfo.startTime'], listing['listingInfo.startTime']) > \
#                                           min_overlap, axis=1)