In [1]:
from __future__ import division
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime
import numpy as np
from difflib import SequenceMatcher
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import sys
sys.path.insert(0, '/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/data-analysis/utilities/')
from plot_learning_curve import plot_learning_curve
from clean_text import clean_text

pd.set_option('display.max_colwidth',100)

## Import Data

**Import cleaned auction data**

In [2]:
auctions = pd.read_pickle('../pickles/auctions.p')

**Clean Title Text**

In [3]:
auctions['title'] = auctions['title'].apply(clean_text)

## Extract Brands,Models from postgres table

In [4]:
dbname='ebay'
user='nathan'
host='localhost'
table_name='category_specifics'

conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host))
cur = conn.cursor()


# Get Brands
SQL = '''SELECT lower("Brand") as "Brand" FROM {table_name} GROUP BY lower("Brand") HAVING COUNT(lower("Brand")) >= 10 ORDER BY COUNT(lower("Brand")) DESC;'''.format(table_name=table_name)
brands = pd.read_sql_query(sql=SQL, con=conn)
brands = brands.iloc[:-1] # get rid of "None" row
brands = brands[brands['Brand'].apply(lambda x: len(x.split()) == 1)] # Valid Brand names should only consists of 1 word
brands = brands[brands['Brand'].apply(lambda x: x!='na' and not ('-' in x))] 
brands = brands['Brand'].tolist()

# Get Models
SQL = '''SELECT lower("Model") as "Model",COUNT(lower("Model")) FROM category_specifics GROUP BY lower("Model") HAVING COUNT(lower("Model")) >= 2 ORDER BY COUNT(lower("Model")) DESC;'''
models = pd.read_sql_query(sql=SQL, con=conn)
models = models.iloc[:-1] # get rid of "None" row
models = models['Model'].tolist()

# Get Series
SQL = '''SELECT lower("Series") as "Series" FROM {table_name} GROUP BY lower("Series") ORDER BY COUNT(lower("Series")) DESC;'''.format(table_name=table_name)
series = pd.read_sql_query(sql=SQL, con=conn)

In [5]:
print len(brands)
print len(models)

17
557


## Find Brand
---

In [9]:
def find_brand(title):
    for brand in brands:
        if brand in title:
            return brand

In [10]:
auctions['brand'] = auctions['title'].apply(find_brand)

## Find Model 

### use NLP + Cosine Similarity to find listing Model
---

In [11]:
auctions['model'] = ''

** Delete Lens from Title **

In [12]:
bad_words = ['digital','camera','mm','lens','bag','sd','card','new',\
             'used','broken','cracked', 'kit','zoom','power','brand',\
            'package','bag','shutter','body','box','original',\
             'battery','charger','mp','accessories','dslr', 'slr', 'basic','kit','mirrorless','lr']


def filter_title(title):
    for brand in brands:
        title = title.replace(brand, '')    
    for w in bad_words:
        title = title.replace(w, '')
    title = re.sub(r"\d+\-\d+","",title)
    title = re.sub(r"\d+\.\d+","",title)
    
    title = re.sub(r"\s\d+\s"," ",title) # deleting numbers
    title = re.sub(r"\s\d+\s"," ",title) # deleting numbers
    title = re.sub(r"\s\-\s"," ",title)  # deleting individual dashes 
    return title

--- Test --- 

In [13]:
test_title = auctions['title'].iloc[592]
test_title = auctions['title'].iloc[22]
test_title = auctions['title'].iloc[12]
print test_title

filter_title(test_title)

olympus e-3 10.1 mp digital slr camera - black body - 23 597 activations


u' e-3      black  activations'

**Filter titles for bad words**

In [14]:
auctions['filtered_titles'] = auctions['title'].apply(filter_title)

**Train Vectorizer on available Models**

In [15]:
vectorizer = TfidfVectorizer(ngram_range=(1,1),
                            min_df = 1,
                            max_features=5000)


models_matrix = vectorizer.fit_transform(models)

models_df = pd.DataFrame(models_matrix.todense(), columns=vectorizer.vocabulary_.keys())
print models_df.shape

(557, 494)


** Transform Titles using Models Vectorizer **

In [16]:
titles_matrix = vectorizer.transform(auctions['filtered_titles'].tolist())
print titles_matrix.shape

(29961, 494)


**Create vectorized titles df**

In [17]:
titles_df = pd.DataFrame(titles_matrix.todense(), columns=vectorizer.vocabulary_.keys())

titles_df = pd.concat(objs=[titles_df,auctions['title'],auctions['filtered_titles'],auctions['brand']], axis=1)

titles_df['similarity_score'] = None
titles_df['model_name'] = ''

titles_df.reset_index(inplace=True) # for using the index column to compare results with the auctions df

**Use Cosine Similarity to match title with model**

--- DEVELOPMENT ---

In [15]:
test_df = titles_df.copy()

for test_index in range(100):
    title_vector = test_df.iloc[test_index, 1:-5].values
    
    similarity_matrix = cosine_similarity(X=title_vector.reshape(1,-1), Y=models_matrix)[0]
    
    match_index = np.argmax(similarity_matrix)
    
    test_df['similarity_score'].iloc[test_index] = np.max(similarity_matrix)
    test_df['model_name'].iloc[test_index] = models[match_index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [16]:
test_df[test_df['similarity_score']>0].ix[:100, ['title','filtered_titles','brand','similarity_score','model_name']]

Unnamed: 0,title,filtered_titles,brand,similarity_score,model_name
0,canon eos rebel sl1 eos 100d 18.0 mp digital slr camera - black kit w stm,eos rebel sl1 eos 100d black w stm,canon,0.804452,sl1 / eos 100d
1,sony cyber-shot dsc-rx100 20.2mp digital camera - black brand new cheap,cyber-shot dsc-rx100 black cheap,sony,1,dsc-rx100
2,new nib canon eos rebel t6 digital slr camera premium kit 18-55mm 75-300mm,nib eos rebel t6 premium,canon,1,eos rebel t6
3,new canon eos rebel t6 dslr bundle 18-55mm 75-300mm lens bag sd card,eos rebel t6 bundle,canon,1,eos rebel t6
4,sony nex 6 e pz 16-50mm f3.5-5.6 oss power zoom lens kit camera,nex e pz f3..6 oss,sony,1,nex-f3
5,brand new lumix fz300 4k 24x f2.8 long zoom digital camera,lumix fz300 4k 24x f long,,0.677605,lumix dmc-gh1
6,nikon d2h camera package shutter clicks 30 474,d2h clicks,nikon,1,d2h
7,panasonic lumix dmc-g7 mirrorless micro four thirds digital camera body,lumix dmc-g7 micro four thirds,panasonic,0.705792,dmc-g7
8,sony cyber-shot rx100 ii digital camera - black,cyber-shot rx100 ii black,sony,0.883035,dsc-rx100 ii
9,nikon d3300 18-55 vr ii kit new box,d3300 vr ii,nikon,0.820714,d3300


--- TEST ---

In [142]:
print auctions.ix[test_index,'title']
print titles_df.ix[test_index, ['title','filtered_titles']]

title_vector = titles_df.iloc[test_index, 1:-5].values

similarity_matrix = cosine_similarity(X=title_vector.reshape(1,-1), Y=models_matrix)[0]

match_index = np.argmax(similarity_matrix)

print np.max(similarity_matrix)

models[match_index]

sony dsc-hx300 digital camera
title              sony dsc-hx300 digital camera
filtered_titles                       dc-hx300  
Name: 19, dtype: object
0.707106781187


'dsc-hx300'

** Run Model Extraction on Dataframe**

In [18]:
for row_num in range(titles_df.shape[0]):
# for row_num in range(101):
    if (row_num+1) % 100 == 0:
        print 'Extracting model name for item #{} out of {}'.format(row_num+1, titles_df.shape[0])

    title_vector = titles_df.iloc[row_num, 1:-5].values
    
    similarity_matrix = cosine_similarity(X=title_vector.reshape(1,-1), Y=models_matrix)[0]
    
    match_index = np.argmax(similarity_matrix)
    
    titles_df['similarity_score'].iloc[row_num] = np.max(similarity_matrix)
    titles_df['model_name'].iloc[row_num] = models[match_index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Extracting model name for item #100 out of 29961
Extracting model name for item #200 out of 29961
Extracting model name for item #300 out of 29961
Extracting model name for item #400 out of 29961
Extracting model name for item #500 out of 29961
Extracting model name for item #600 out of 29961
Extracting model name for item #700 out of 29961
Extracting model name for item #800 out of 29961
Extracting model name for item #900 out of 29961
Extracting model name for item #1000 out of 29961
Extracting model name for item #1100 out of 29961
Extracting model name for item #1200 out of 29961
Extracting model name for item #1300 out of 29961
Extracting model name for item #1400 out of 29961
Extracting model name for item #1500 out of 29961
Extracting model name for item #1600 out of 29961
Extracting model name for item #1700 out of 29961
Extracting model name for item #1800 out of 29961
Extracting model name for item #1900 out of 29961
Extracting model name for item #2000 out of 29961
Extractin

**Results of NLP Model Extraction**

In [24]:
# titles_df[titles_df['similarity_score']>0.7]
titles_df.ix[0:200,['title','filtered_titles','similarity_score','model_name']]

Unnamed: 0,title,filtered_titles,similarity_score,model_name
0,canon eos rebel sl1 eos 100d 18.0 mp digital slr camera - black kit w stm,eos rebel sl1 eos 100d black w stm,0.804452,sl1 / eos 100d
1,sony cyber-shot dsc-rx100 20.2mp digital camera - black brand new cheap,cyber-shot dsc-rx100 black cheap,1,dsc-rx100
2,new nib canon eos rebel t6 digital slr camera premium kit 18-55mm 75-300mm,nib eos rebel t6 premium,1,eos rebel t6
3,new canon eos rebel t6 dslr bundle 18-55mm 75-300mm lens bag sd card,eos rebel t6 bundle,1,eos rebel t6
4,sony nex 6 e pz 16-50mm f3.5-5.6 oss power zoom lens kit camera,nex e pz f3..6 oss,1,nex-f3
6,nikon d2h camera package shutter clicks 30 474,d2h clicks,1,d2h
7,panasonic lumix dmc-g7 mirrorless micro four thirds digital camera body,lumix dmc-g7 micro four thirds,0.705792,dmc-g7
8,sony cyber-shot rx100 ii digital camera - black,cyber-shot rx100 ii black,0.883035,dsc-rx100 ii
9,nikon d3300 18-55 vr ii kit new box,d3300 vr ii,0.820714,d3300
10,sony cyber-shot dsc-hx300 20.4mp digital camera - black. original box.,cyber-shot dsc-hx300 black. .,1,dsc-hx300


## Find Lens Information

** Use Regex to extract lens information **

In [25]:
def find_lens(title):
    lenses = []
    
    lens = re.findall(r"\d+\-\d+m{2}", title)
    if lens:
        lenses.extend(lens)
        return lenses
    
    lens = re.findall(r"\d+m{2}", title)
    if lens:
        lenses.extend(lens)
        return lenses

    lens = re.findall(r"\d+\-\d+", title)
    if lens:
        lens = [l + 'mm' for l in lens]
        lenses.extend(lens)
        return lenses

**Extract**

In [26]:
titles_df['lens'] = titles_df['title'].apply(find_lens)

In [27]:
titles_df['has_lens'] = titles_df['lens'].apply(lambda x: 0 if x==None else 1)

## Final Result

In [29]:
titles_df.ix[:50,['title','brand','model_name','similarity_score','lens','has_lens']]

Unnamed: 0,title,brand,model_name,similarity_score,lens,has_lens
0,canon eos rebel sl1 eos 100d 18.0 mp digital slr camera - black kit w stm,canon,sl1 / eos 100d,0.804452,,0
1,sony cyber-shot dsc-rx100 20.2mp digital camera - black brand new cheap,sony,dsc-rx100,1.0,,0
2,new nib canon eos rebel t6 digital slr camera premium kit 18-55mm 75-300mm,canon,eos rebel t6,1.0,"[18-55mm, 75-300mm]",1
3,new canon eos rebel t6 dslr bundle 18-55mm 75-300mm lens bag sd card,canon,eos rebel t6,1.0,"[18-55mm, 75-300mm]",1
4,sony nex 6 e pz 16-50mm f3.5-5.6 oss power zoom lens kit camera,sony,nex-f3,1.0,[16-50mm],1
5,brand new lumix fz300 4k 24x f2.8 long zoom digital camera,,lumix dmc-gh1,0.677605,,0
6,nikon d2h camera package shutter clicks 30 474,nikon,d2h,1.0,,0
7,panasonic lumix dmc-g7 mirrorless micro four thirds digital camera body,panasonic,dmc-g7,0.705792,,0
8,sony cyber-shot rx100 ii digital camera - black,sony,dsc-rx100 ii,0.883035,,0
9,nikon d3300 18-55 vr ii kit new box,nikon,d3300,0.820714,[18-55mm],1


## Merge new features with auctions

In [31]:
titles_df.ix[:10, ['index','title','filtered_titles','brand','similarity_score','model_name','lens','has_lens']]

Unnamed: 0,index,title,filtered_titles,brand,similarity_score,model_name,lens,has_lens
0,0,canon eos rebel sl1 eos 100d 18.0 mp digital slr camera - black kit w stm,eos rebel sl1 eos 100d black w stm,canon,0.804452,sl1 / eos 100d,,0
1,1,sony cyber-shot dsc-rx100 20.2mp digital camera - black brand new cheap,cyber-shot dsc-rx100 black cheap,sony,1.0,dsc-rx100,,0
2,2,new nib canon eos rebel t6 digital slr camera premium kit 18-55mm 75-300mm,nib eos rebel t6 premium,canon,1.0,eos rebel t6,"[18-55mm, 75-300mm]",1
3,3,new canon eos rebel t6 dslr bundle 18-55mm 75-300mm lens bag sd card,eos rebel t6 bundle,canon,1.0,eos rebel t6,"[18-55mm, 75-300mm]",1
4,4,sony nex 6 e pz 16-50mm f3.5-5.6 oss power zoom lens kit camera,nex e pz f3..6 oss,sony,1.0,nex-f3,[16-50mm],1
5,5,brand new lumix fz300 4k 24x f2.8 long zoom digital camera,lumix fz300 4k 24x f long,,0.677605,lumix dmc-gh1,,0
6,6,nikon d2h camera package shutter clicks 30 474,d2h clicks,nikon,1.0,d2h,,0
7,7,panasonic lumix dmc-g7 mirrorless micro four thirds digital camera body,lumix dmc-g7 micro four thirds,panasonic,0.705792,dmc-g7,,0
8,8,sony cyber-shot rx100 ii digital camera - black,cyber-shot rx100 ii black,sony,0.883035,dsc-rx100 ii,,0
9,9,nikon d3300 18-55 vr ii kit new box,d3300 vr ii,nikon,0.820714,d3300,[18-55mm],1


In [38]:
auctions['similarity_score'] = titles_df['similarity_score']
auctions['model'] = titles_df['model_name']
auctions['brand'] = titles_df['brand']
auctions['lens'] = titles_df['lens']
auctions['has_lens'] = titles_df['has_lens']

In [39]:
auctions.ix[:1]

Unnamed: 0,index,id,timestamp,itemId,topRatedListing,globalId,title,subtitle,country,primaryCategory.categoryId,...,startPrice,sold_state,conditionAvailable,conditionCombined,brand,model,filtered_titles,similarity_score,lens,has_lens
0,1,78584,2017-04-06 03:20:46.048000+00:00,322461255962,0,EBAY-US,canon eos rebel sl1 eos 100d 18.0 mp digital slr camera - black kit w stm,Buy Direct from Best Buy,US,31388,...,399.0,0,0,New,canon,sl1 / eos 100d,eos rebel sl1 eos 100d black w stm,0.804452,,0
1,5,78567,2017-04-06 03:20:46.048000+00:00,332163381834,0,EBAY-US,sony cyber-shot dsc-rx100 20.2mp digital camera - black brand new cheap,Refurbished with 90 Day Nikon Warranty,US,31388,...,300.0,1,0,New,sony,dsc-rx100,cyber-shot dsc-rx100 black cheap,1.0,,0


**Export Auctions dataframe**

In [40]:
auctions.to_pickle('../pickles/auctions_brand_model_hlens.p')

--- Test ---

In [None]:
titles_df.ix[:50,['title','filtered_titles']]

Test Cases:
<br>18-55 
<br>28mm
<br>18-55mm 
<br>75-300mm

In [735]:
test_index = 9
test_title = titles_df['title'].iloc[test_index]
print test_title
lenses = []
# lens = re.findall(r"\d+\-\d+m{2}", test_title)
# lenses.extend(lens)
# print lenses
# print re.findall(r"\d+m{2}", test_title)
lenses = find_lens(test_title)
print lenses


nikon d3300 18-55 vr ii kit new box
[u'18-55']


**Development**

In [711]:
test_title = ' 18-55mm '
# test_title = ',28mm '
# test_title = ' 18-55'

print re.findall(r"\d+\-\d+m{2}", test_title)

print re.findall(r"\d+m{2}", test_title)

print re.findall(r"\d+\-\d+", test_title)






# print re.findall(r"\d+\-\d+mm",test_title)

# print re.findall(r"[^\w-]\d+mm",test_title)

# for match in re.findall(r"[^\w]?\d{2,3}\-\d{2,3}[^\w]?",test_title):
#     print match

['18-55mm']
['55mm']
['18-55']
