In [3]:
from __future__ import division
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime
import numpy as np
from difflib import SequenceMatcher
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import sys
sys.path.insert(0, '/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/data-analysis/utilities/')
from plot_learning_curve import plot_learning_curve
from clean_text import clean_text

pd.set_option('display.max_colwidth',100)

## Import Data

**Import cleaned auction data**

In [4]:
auctions = pd.read_pickle('../pickles/auctions.p')

**Clean Title Text**

In [28]:
auctions['title'] = auctions['title'].apply(clean_text)

## Extract Brands,Models from postgres table

In [22]:
dbname='ebay'
user='nathan'
host='localhost'
table_name='b_h_digital_camera_inventory'

conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host))
cur = conn.cursor()


# Get Brands
SQL = '''SELECT "Brand" FROM b_h_digital_camera_inventory'''.format(table_name=table_name)
brands = pd.read_sql_query(sql=SQL, con=conn)
brands = brands['Brand'].unique().tolist()

# Get Models
SQL = '''SELECT "Title" FROM b_h_digital_camera_inventory'''.format(table_name=table_name)
titles = pd.read_sql_query(sql=SQL, con=conn)
titles = titles['Title'].unique().tolist()


**Clean brands and models text**

In [38]:
brands = [brand.lower() for brand in brands]
bh_titles = [clean_text(title) for title in titles]

In [109]:
brands

['panasonic',
 'sony',
 'canon',
 'fujifilm',
 'nikon',
 'ricoh',
 'olympus',
 'leica',
 'pentax',
 'lytro',
 'kodak',
 'polaroid',
 'hasselblad',
 'dxo',
 'sigma',
 'vivitar',
 'yi technology',
 'panono',
 'minox',
 'sealife',
 'hamiltonbuhl',
 'horseman',
 'mamiya',
 'mamiya leaf']

## Find Brand
---

In [29]:
def find_brand(title):
    for brand in brands:
        if brand in title:
            return brand

In [30]:
auctions['brand'] = auctions['title'].apply(find_brand)

## Find Model 

### use NLP + Cosine Similarity to find listing Model
---

In [31]:
auctions['model'] = ''

** Delete Lens from Title **

In [77]:
bad_words = ['digital','camera','mm','lens','bag','sd','card','new',\
             'used','broken','cracked', 'kit','zoom','power','brand',\
            'package','bag','shutter','body','black','box','original',\
             'battery','charger','mp','accessories','dslr', 'slr', 'basic','kit','mirrorless']



def filter_title(title):
    for brand in brands:
        title = title.replace(brand, '')    
    for w in bad_words:
        title = title.replace(w, '')
    title = re.sub(r"\d+\-\d+","",title)
    title = re.sub(r"\d+\.\d+","",title)
    
    title = re.sub(r"\s\d+\s"," ",title) # deleting numbers
    title = re.sub(r"\s\d+\s"," ",title) # deleting numbers
    title = re.sub(r"\s\-\s"," ",title)  # deleting individual dashes 
    return title

Test

In [78]:
test_title = auctions['title'].iloc[592]
test_title = auctions['title'].iloc[22]
test_title = auctions['title'].iloc[12]
print test_title


filter_title(test_title)

olympus e-3 10.1 mp digital slr camera - black body - 23 597 activations


u' e-3        activations'

**Filter listing titles to prepare for model extraction**

In [79]:
auctions['filtered_titles'] = auctions['title'].apply(filter_title)

**Filter bh photo titles to prepare for model extraction**

In [80]:
models = [filter_title(title).strip() for title in bh_titles]

**Train Vectorizer on available Models**

In [81]:
vectorizer = TfidfVectorizer(ngram_range=(1,1),
                            min_df = 1,
                            max_features=5000)

# vectorizer = CountVectorizer(ngram_range=(1,1),
#                             min_df = 1,
#                             max_features=5000)

models_matrix = vectorizer.fit_transform(models)

models_df = pd.DataFrame(models_matrix.todense(), columns=vectorizer.vocabulary_.keys())
print models_df.shape

(717, 364)


** Transform Titles using Models Vectorizer **

In [82]:
# vectorizer = TfidfVectorizer(ngram_range=(1,2),
#                             min_df = 10,
#                             max_features=5000)

titles_matrix = vectorizer.transform(auctions['filtered_titles'].tolist())
print titles_matrix.shape

(29961, 364)


**Create vectorized titles df**

In [83]:
titles_df = pd.DataFrame(titles_matrix.todense(), columns=vectorizer.vocabulary_.keys())

titles_df = pd.concat(objs=[titles_df,auctions['title'],auctions['filtered_titles'],auctions['brand']], axis=1)

titles_df['similarity_score'] = None
titles_df['model_name'] = ''

titles_df.reset_index(inplace=True) # for using the index column to compare results with the auctions df

**Use Cosine Similarity to match title with model**

In [102]:
test_df = titles_df.copy()
print test_df.shape
for test_index in range(1000):
    title_vector = test_df.iloc[test_index, 1:-5].values
    
    similarity_matrix = cosine_similarity(X=title_vector.reshape(1,-1), Y=models_matrix)[0]
    
    match_index = np.argmax(similarity_matrix)
    
    test_df['similarity_score'].iloc[test_index] = np.max(similarity_matrix)
    test_df['model_name'].iloc[test_index] = models[match_index]

(29961, 370)


In [85]:
models

[u'lumix dc-gh5 mirrorless micro four thirds',
 u'alpha a7r ii mirrorless',
 u'alpha a6500 mirrorless',
 u'alpha a5000 mirrorless',
 u'cyber-shot dsc-w800',
 u'eos 5d mark iv',
 u'x100f',
 u'x-t2 mirrorless',
 u'lumix dmc-g85 mirrorless micro four thirds',
 u'd3300     es',
 u'shot g7 x mark ii',
 u'd500',
 u'cyber-shot dsc-rx100 v',
 u'alpha a7s ii mirrorless',
 u'lumix dmc-zs50   silver',
 u'x100f   silver',
 u'alpha a5000 mirrorless     white',
 u'd750    storage',
 u'cyber-shot dsc-rx100 v',
 u'alpha a7s ii mirrorless   rode videomic pro',
 u'eos 7d mark ii    storage',
 u'eos 5d mark iv    storage',
 u'coolpix p900',
 u'theta spherical vr',
 u'alpha a99 ii',
 u'om-d e-m1 mark ii mirrorless micro four thirds',
 u'x-t20 mirrorless     silver',
 u'eos rebel t6     es',
 u'om-d e-m10 mark ii mirrorless micro four thirds    ii r  silver',
 u'dsc-w830   silver',
 u'alpha a6000 mirrorless',
 u'eos 5d mark iii    storage',
 u'lumix dc-fz80',
 u'shot g9 x',
 u'd3400     es',
 u'eos rebel t

In [93]:
for i,model in enumerate(models):
    if "eos" in model:
        print bh_titles[i]
        print model

eos 5d mark iv dslr camera body 
eos 5d mark iv
eos 7d mark ii dslr camera body storage kit
eos 7d mark ii    storage
eos 5d mark iv dslr camera body storage kit
eos 5d mark iv    storage
eos rebel t6 dslr camera 18-55mm 75-300mm lenses kit
eos rebel t6     es
eos 5d mark iii dslr camera body storage kit
eos 5d mark iii    storage
eos rebel t6i dslr camera 18-55mm lens
eos rebel t6i
eos 6d dslr camera 24-105mm f 4l lens storage kit
eos 6d    f 4l  storage
eos rebel t6i dslr camera 18-55mm 55-250mm lenses kit
eos rebel t6i     es
eos rebel t7i dslr camera body 
eos rebel t7i
eos 77d dslr camera body 
eos 77d
eos 80d dslr camera body 18-55mm 55-250mm lenses kit
eos 80d      es
eos 80d dslr camera 18-135mm lens video creator kit
eos 80d     video creator
eos 6d dslr camera body storage kit
eos 6d    storage
eos rebel t6i dslr camera 18-55mm lens video creator kit
eos rebel t6i     video creator
eos rebel t6 dslr camera 18-55mm lens
eos rebel t6
eos-1d x mark ii dslr camera body 
eos-1d x 

In [107]:
similarity_score_threshold = 0.7
test_df[test_df['similarity_score']>similarity_score_threshold].ix[:, ['title','filtered_titles','brand','similarity_score','model_name']]

Unnamed: 0,title,filtered_titles,brand,similarity_score,model_name
0,canon eos rebel sl1 eos 100d 18.0 mp digital slr camera - black kit w stm,eos rebel sl1 eos 100d w stm,canon,0.826132,eos rebel sl1
1,sony cyber-shot dsc-rx100 20.2mp digital camera - black brand new cheap,cyber-shot dsc-rx100 cheap,sony,1,cyber-shot dsc-rx100 v
2,new nib canon eos rebel t6 digital slr camera premium kit 18-55mm 75-300mm,nib eos rebel t6 premium,canon,0.746143,eos rebel t6
3,new canon eos rebel t6 dslr bundle 18-55mm 75-300mm lens bag sd card,eos rebel t6 bundle,canon,0.803468,eos rebel t6
7,panasonic lumix dmc-g7 mirrorless micro four thirds digital camera body,lumix dmc-g7 mirrorless micro four thirds,panasonic,1,lumix dmc-g7 mirrorless micro four thirds
8,sony cyber-shot rx100 ii digital camera - black,cyber-shot rx100 ii,sony,0.897523,cyber-shot dsc-rx100 ii
10,sony cyber-shot dsc-hx300 20.4mp digital camera - black. original box.,cyber-shot dsc-hx300 . .,sony,0.774455,cyber-shot dsc-rx100 v
11,as-is canon ds126491 eos rebel t5 digital slr camera kit w battery charger,as-is ds126491 eos rebel t5 w,canon,1,eos rebel t5
13,sony cyber-shot dsc-h300 digital camera,cyber-shot dsc-h300,sony,1,cyber-shot dsc-h300
16,sony cyber-shot dsc-tx20 16.2 mp digital camera - green,cyber-shot dsc-tx20 green,sony,0.774455,cyber-shot dsc-rx100 v


In [108]:
for model in test_df['filtered_titles'].tolist():
    if "dsc-hx300" in model:
        print model

 cyber-shot dsc-hx300    .  .
 dsc-hx300  
 cyber-shot dsc-hx300    
 dsc-hx300  
 cyber-shot dsc-hx300      w  case
 cyber-shot dsc-hx300     
 cyber-shot dsc-hx300     
 cyber-shot dsc-hx300     1080hd
 cyber-shot dsc-hx300    
 cyber-shot dsc-hx300     
 dsc-hx300
 cyber-shot dsc-hx300     works perfectly
 cyber-shot dsc-hx300     
 cyber-shot dsc-hx300     
 cyber-shot dsc-hx300  50x optical    
 dsc-hx300  
 cyber-shot dsc-hx300    
 cyber-shot dsc-hx300    
 cyber-shot dsc-hx300     
 dsc-hx300
 cyber-shot dsc-hx300    bonus 32gb 64gb  s
 cyber-shot dsc-hx300    
 cyber-shot dsc-hx300      extra 
 cyber-shot dsc-hx300     
 cyber-shot dsc-hx300    
 cyber-shot dsc-hx300     
 as-is  cyber-shot dsc-hx300    
 dsc-hx300 
 cyber-shot dsc-hx300  50x optical   
 cyber-shot dsc-hx300     
 cyber-shot dsc-hx300  


In [142]:
print auctions.ix[test_index,'title']
print titles_df.ix[test_index, ['title','filtered_titles']]

title_vector = titles_df.iloc[test_index, 1:-5].values

similarity_matrix = cosine_similarity(X=title_vector.reshape(1,-1), Y=models_matrix)[0]

match_index = np.argmax(similarity_matrix)

print np.max(similarity_matrix)

models[match_index]

sony dsc-hx300 digital camera
title              sony dsc-hx300 digital camera
filtered_titles                       dc-hx300  
Name: 19, dtype: object
0.707106781187


'dsc-hx300'

In [20]:
for row_num in range(titles_df.shape[0]):
    if row_num+1 % 100 == 0:
        print 'Extracting model name for item #{} out of {}'.format(row_num, titles_df.shape[0])

    title_vector = titles_df.iloc[row_num, 1:-5].values
    
    similarity_matrix = cosine_similarity(X=title_vector.reshape(1,-1), Y=models_matrix)[0]
    
    match_index = np.argmax(similarity_matrix)
    
    titles_df['similarity_score'] = np.max(similarity_matrix)
    
    titles_df['model_name'].iloc[row_num] = models[match_index]

Extracting model name for item #0 out of 29961


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Extracting model name for item #100 out of 29961
Extracting model name for item #200 out of 29961
Extracting model name for item #300 out of 29961
Extracting model name for item #400 out of 29961
Extracting model name for item #500 out of 29961
Extracting model name for item #600 out of 29961
Extracting model name for item #700 out of 29961
Extracting model name for item #800 out of 29961
Extracting model name for item #900 out of 29961
Extracting model name for item #1000 out of 29961
Extracting model name for item #1100 out of 29961
Extracting model name for item #1200 out of 29961
Extracting model name for item #1300 out of 29961
Extracting model name for item #1400 out of 29961
Extracting model name for item #1500 out of 29961
Extracting model name for item #1600 out of 29961
Extracting model name for item #1700 out of 29961
Extracting model name for item #1800 out of 29961
Extracting model name for item #1900 out of 29961
Extracting model name for item #2000 out of 29961
Extractin

**Results of NLP Model Extraction**

In [30]:
titles_df.ix[0:10,['title','filtered_titles','model_name']]

Unnamed: 0,title,filtered_titles,model_name
0,canon eos rebel sl1 eos 100d 18.0 mp digital slr camera - black kit w stm,eo rebel l1 eo 100d lr w tm,sl1 / eos 100d
1,sony cyber-shot dsc-rx100 20.2mp digital camera - black brand new cheap,cyber-hot dc-rx100 cheap,cyber-shot dsc-rx100
2,new nib canon eos rebel t6 digital slr camera premium kit 18-55mm 75-300mm,nib eo rebel t6 lr premium,rebel t6
3,new canon eos rebel t6 dslr bundle 18-55mm 75-300mm lens bag sd card,eo rebel t6 dlr bundle len d,rebel t6
4,sony nex 6 e pz 16-50mm f3.5-5.6 oss power zoom lens kit camera,nex e pz f3..6 o len,nex-f3
5,brand new lumix fz300 4k 24x f2.8 long zoom digital camera,fz300 4k 24x f long,hc-wx970k 4k
6,nikon d2h camera package shutter clicks 30 474,d2h packa hutter click,d2h
7,panasonic lumix dmc-g7 mirrorless micro four thirds digital camera body,dmc-g7 mirrorle micro four third,dmc-g7
8,sony cyber-shot rx100 ii digital camera - black,cyber-hot rx100 ii,dsc-rx100 ii
9,nikon d3300 18-55 vr ii kit new box,d3300 vr ii,d3300


## Find Lens Information

** Use Regex to extract lens information **

In [22]:
def find_lens(title):
    lenses = []
    
    lens = re.findall(r"\d+\-\d+m{2}", title)
    if lens:
        lenses.extend(lens)
        return lenses
    
    lens = re.findall(r"\d+m{2}", title)
    if lens:
        lenses.extend(lens)
        return lenses

    lens = re.findall(r"\d+\-\d+", title)
    if lens:
        lens = [l + 'mm' for l in lens]
        lenses.extend(lens)
        return lenses

**Extract**

In [23]:
titles_df['lens'] = titles_df['title'].apply(find_lens)

## Final Result

In [24]:
titles_df.ix[:50,['title','brand','model_name','similarity_score','lens']]

Unnamed: 0,title,brand,model_name,similarity_score,lens
0,canon eos rebel sl1 eos 100d 18.0 mp digital slr camera - black kit w stm,canon,sl1 / eos 100d,0.450392,
1,sony cyber-shot dsc-rx100 20.2mp digital camera - black brand new cheap,sony,cyber-shot dsc-rx100,0.450392,
2,new nib canon eos rebel t6 digital slr camera premium kit 18-55mm 75-300mm,canon,rebel t6,0.450392,"[18-55mm, 75-300mm]"
3,new canon eos rebel t6 dslr bundle 18-55mm 75-300mm lens bag sd card,canon,rebel t6,0.450392,"[18-55mm, 75-300mm]"
4,sony nex 6 e pz 16-50mm f3.5-5.6 oss power zoom lens kit camera,sony,nex-f3,0.450392,[16-50mm]
5,brand new lumix fz300 4k 24x f2.8 long zoom digital camera,lumix,hc-wx970k 4k,0.450392,
6,nikon d2h camera package shutter clicks 30 474,nikon,d2h,0.450392,
7,panasonic lumix dmc-g7 mirrorless micro four thirds digital camera body,panasonic,dmc-g7,0.450392,
8,sony cyber-shot rx100 ii digital camera - black,sony,dsc-rx100 ii,0.450392,
9,nikon d3300 18-55 vr ii kit new box,nikon,d3300,0.450392,[18-55mm]


In [25]:
models

['5d mark ii',
 '70d',
 'd40',
 'x100t',
 'd3100',
 'a7',
 'a6300',
 'd3000',
 'dmc-gh4',
 'a6000',
 '6d',
 't3 / eos 1100d',
 'd600',
 'l840',
 'd700',
 'd80',
 'xti / eos 400d',
 '40d',
 'xsi / eos 450d',
 'd90',
 'd60',
 '7d',
 'd70',
 't1i / eos 500d',
 'd7200',
 'd7100',
 't2i / eos 550d',
 'd200',
 '60d',
 'd50',
 'xt / eos 350d',
 't6i / eos d750',
 'j1',
 'd5000',
 'rebel xs',
 'd610',
 't3i / eos 600d',
 'd7000',
 'd3200',
 'rebel t3',
 'l340',
 'rx100 v',
 'd3300',
 'l830',
 '7d mark ii',
 'x-t1',
 'd300',
 'sx410 is',
 'xs / eos 1000d',
 '80d',
 'd5500',
 'x-t10',
 's100',
 'd40x',
 'sx50 hs',
 's110',
 '30d',
 'p510',
 'dmc-lx5',
 'g11',
 'sx40 hs',
 '5d',
 'dsc-h300',
 'sx610 hs',
 '50d',
 'sx530 hs',
 'rebel xs / 1000d',
 'd3',
 'd70s',
 '20d',
 'dsc-rx100',
 'b500',
 'p530',
 'xs / 1000d',
 'rebel xsi / 450d',
 'dsc-rx10 ii',
 'p520',
 't5i / eos 700d',
 'a350',
 'wb350f',
 'a7r',
 'd5100',
 'hero 3+ silver edition',
 'sx510 hs',
 'd5300',
 'a100',
 'l820',
 'a77 ii',
 '

**Test**

In [None]:
titles_df.ix[:50,['title','filtered_titles']]

Test Cases:
<br>18-55 
<br>28mm
<br>18-55mm 
<br>75-300mm

In [735]:
test_index = 9
test_title = titles_df['title'].iloc[test_index]
print test_title
lenses = []
# lens = re.findall(r"\d+\-\d+m{2}", test_title)
# lenses.extend(lens)
# print lenses
# print re.findall(r"\d+m{2}", test_title)
lenses = find_lens(test_title)
print lenses


nikon d3300 18-55 vr ii kit new box
[u'18-55']


**Development**

In [711]:
test_title = ' 18-55mm '
# test_title = ',28mm '
# test_title = ' 18-55'

print re.findall(r"\d+\-\d+m{2}", test_title)

print re.findall(r"\d+m{2}", test_title)

print re.findall(r"\d+\-\d+", test_title)






# print re.findall(r"\d+\-\d+mm",test_title)

# print re.findall(r"[^\w-]\d+mm",test_title)

# for match in re.findall(r"[^\w]?\d{2,3}\-\d{2,3}[^\w]?",test_title):
#     print match

['18-55mm']
['55mm']
['18-55']
