In [202]:
from __future__ import division
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime
import numpy as np
from difflib import SequenceMatcher
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import sys
sys.path.insert(0, '/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/data-analysis/utilities/')
from plot_learning_curve import plot_learning_curve
from clean_text import clean_text

## Import Data

**Import cleaned auction data**

In [226]:
auctions = pd.read_pickle('../pickles/auctions.p')

**Clean Title Text**

In [227]:
auctions['title'] = auctions['title'].apply(clean_text)

## Extract Brands,Models from postgres table

In [108]:
dbname='ebay'
user='nathan'
host='localhost'

conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host))
cur = conn.cursor()
# Get Brands
SQL = '''SELECT lower("Brand") as "Brands" FROM category_specifics GROUP BY lower("Brand") ORDER BY COUNT(lower("Brand")) DESC;'''
brands = pd.read_sql_query(sql=SQL, con=conn)
brands = brands.iloc[:-1]
brands = brands[brands['Brands'].apply(lambda x: len(x.split()) == 1)] # only get brands that are composed of 1 word
brands = brands[brands['Brands'].apply(lambda x: x!='na' and not ('-' in x))]
brands = brands['Brands'].tolist()



# Get Models
SQL = '''SELECT lower("Model") as "Model" FROM category_specifics GROUP BY lower("Model") ORDER BY COUNT(lower("Model")) DESC;'''
models = pd.read_sql_query(sql=SQL, con=conn)
models = models.iloc[:-1]
models = models['Model'].tolist()



## Find Brand
---

In [140]:
def find_brand(title):
    for brand in brands:
        if brand in title:
            return brand

In [229]:
auctions['brand'] = auctions['title'].apply(find_brand)

## Find Model 

### use NLP + Cosine Similarity to find listing Model
---

In [366]:
auctions['model'] = ''

** Delete Lens from Title **

In [446]:
test_title = auctions['title'].iloc[592]
test_title = auctions['title'].iloc[22]
test_title = auctions['title'].iloc[12]
print test_title

bad_words = ['digital','camera','mm','lens','bag','sd','card','new',\
             'used','broken','cracked', 'kit','zoom','power','brand',\
            'package','bag','shutter','body','black','box','original',\
             'battery','charger','slr','mp','accessories']

def filter_title(title):
    for brand in brands:
        title = title.replace(brand, '')    
    for w in bad_words:
        title = title.replace(w, '')
    title = re.sub(r"\d+\-\d+","",title)
    title = re.sub(r"\d+\.\d+","",title)
    
    title = re.sub(r"\s\d+\s"," ",title) # deleting numbers
    title = re.sub(r"\s\d+\s"," ",title) # deleting numbers
    title = re.sub(r"\s\-\s"," ",title)  # deleting individual dashes 
    return title

filter_title(test_title)

olympus e-3 10.1 mp digital slr camera - black body - 23 597 activations


u' e-3        activations'

Filter titles for bad words

In [447]:
auctions['filtered_titles'] = auctions['title'].apply(filter_title)

**Extract Some Models through regex**
<br>
specifically, those models that have the regex format \w-\w

In [448]:
# def extract_model_with_regex(title):
#     regex_expr = [r"\w+\-\w+\d+", r"\w+\-\d+", r"\d+\-\w+", r"\d+\-\w+\d+"]
#     for expr in regex_expr:
#         matches = re.findall(expr,title)
#         if matches:
#             return matches[0]

#     return ''

# auctions['model_name'] = auctions['filtered_titles'].apply(extract_model_with_regex)

** Train Vectorizer on titles **

In [459]:
vectorizer = TfidfVectorizer(ngram_range=(1,2),
                            min_df = 10,
                            max_features=5000)

In [460]:
titles_matrix = vectorizer.fit_transform(auctions['filtered_titles'].tolist())

**Create vectorized titles df**

In [461]:
titles_df = pd.DataFrame(titles_matrix.todense(), columns=vectorizer.vocabulary_.keys())
titles_df = pd.concat(objs=[titles_df,auctions['title'],auctions['filtered_titles'],auctions['model_name']], axis=1)

**Create vectorized models df**

In [462]:
models_matrix = vectorizer.transform(models)

models_df = pd.DataFrame(models_matrix.todense(), columns=vectorizer.vocabulary_.keys())

**Calculate cosine similarity between each title and each model **

In [463]:
titles_df['similarity_score'] = np.nan

In [516]:
titles_df.reset_index(inplace=True) # for using the index column to compare results with the auctions df

In [517]:
titles_no_model_name_df = titles_df[titles_df['model_name']=='']
titles_with_model_name_df = titles_df[titles_df['model_name']!='']

In [518]:
titles_no_model_name_df

Unnamed: 0,index,four,excellent cond,7k oss,waterproof oran,a6300,s4100,d5100,cmos smart,slt a33,...,shipping,warranty,freeze,l810 26x,cmos,book,title,filtered_titles,model_name,similarity_score
0,0,0.0,0.0,0.0,0.000000,0.0,0.361782,0.444265,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,canon eos rebel sl1 eos 100d 18.0 mp digital s...,eos rebel sl1 eos 100d w stm,,
2,2,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,new nib canon eos rebel t6 digital slr camera ...,nib eos rebel t6 premium,,
3,3,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,new canon eos rebel t6 dslr bundle 18-55mm 75-...,eos rebel t6 d bundle,,
4,4,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,sony nex 6 e pz 16-50mm f3.5-5.6 oss power zoo...,nex e pz f3..6 oss,,
5,5,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,brand new lumix fz300 4k 24x f2.8 long zoom di...,fz300 4k 24x f long,,
6,6,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,nikon d2h camera package shutter clicks 30 474,d2h packa clicks,,
8,8,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,sony cyber-shot rx100 ii digital camera - black,cyber-shot rx100 ii,,
9,9,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,nikon d3300 18-55 vr ii kit new box,d3300 vr ii,,
11,11,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,as-is canon ds126491 eos rebel t5 digital slr...,as-is ds126491 eos rebel t5 w charr,,
15,15,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,nikon 1 j3 14.2mp digital camera body black,j3,,


**Use Cosine Similarity to match title with model**

In [566]:
for row_num in range(titles_no_model_name_df.shape[0]):
    title_vector = titles_no_model_name_df.iloc[row_num, 1:-4].values
    
    similarity_matrix = cosine_similarity(X=title_vector.reshape(1,-1), Y=models_matrix)[0]
    
    match_index = np.argmax(similarity_matrix)
    
    titles_no_model_name_df['model_name'].iloc[row_num] = models[match_index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


**Results of NLP Model Extraction**

In [569]:
titles_no_model_name_df.ix[:30,['title','filtered_titles','model_name']]

Unnamed: 0,title,filtered_titles,model_name
0,canon eos rebel sl1 eos 100d 18.0 mp digital s...,eos rebel sl1 eos 100d w stm,sl1 / eos 100d
2,new nib canon eos rebel t6 digital slr camera ...,nib eos rebel t6 premium,eos rebel t6
3,new canon eos rebel t6 dslr bundle 18-55mm 75-...,eos rebel t6 d bundle,eos rebel t6
4,sony nex 6 e pz 16-50mm f3.5-5.6 oss power zoo...,nex e pz f3..6 oss,nex-f3
5,brand new lumix fz300 4k 24x f2.8 long zoom di...,fz300 4k 24x f long,d40
6,nikon d2h camera package shutter clicks 30 474,d2h packa clicks,d2h
8,sony cyber-shot rx100 ii digital camera - black,cyber-shot rx100 ii,dsc-rx100 ii
9,nikon d3300 18-55 vr ii kit new box,d3300 vr ii,d3300
11,as-is canon ds126491 eos rebel t5 digital slr...,as-is ds126491 eos rebel t5 w charr,eos rebel t5
15,nikon 1 j3 14.2mp digital camera body black,j3,j3


**Results of Regex Model Extraction**

In [577]:
titles_with_model_name_df.head(40)

Unnamed: 0,index,four,excellent cond,7k oss,waterproof oran,a6300,s4100,d5100,cmos smart,slt a33,...,shipping,warranty,freeze,l810 26x,cmos,book,title,filtered_titles,model_name,similarity_score
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,sony cyber-shot dsc-rx100 20.2mp digital camer...,cyber-shot dsc-rx100 cheap,dsc-rx100,
7,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,panasonic lumix dmc-g7 mirrorless micro four t...,dmc-g7 mirrorless micro four thirds,dmc-g7,
10,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,sony cyber-shot dsc-hx300 20.4mp digital camer...,cyber-shot dsc-hx300 . .,dsc-hx300,
12,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,olympus e-3 10.1 mp digital slr camera - black...,e-3 activations,e-3,
13,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,sony cyber-shot dsc-h300 digital camera,cyber-shot dsc-h300,dsc-h300,
14,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,canon eos 7d 18.0 mp digital slr camera w genu...,eos 7d w nuine bg-e7 grip,bg-e7,
16,16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,sony cyber-shot dsc-tx20 16.2 mp digital camer...,cyber-shot dsc-tx20 green,dsc-tx20,
19,19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,sony dsc-hx300 digital camera,dsc-hx300,dsc-hx300,
22,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,pentax pentax optio wg-3 16.0 mp digital camer...,optio wg-3 lots,wg-3,
33,33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,olympus e-pl1 w 14-40 ex reserve,e-pl1 w ex reserve,e-pl1,


eos 7d w nuine bg-e7 grip -> bg-e7, index=14
<br>
j3 w 1nikor joo loc. z-7 -> z-7, index=139

**Test NLP Model Extraction on regex extracted models df**

In [579]:
titles_with_model_name_df.drop('similarity_score', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [581]:
titles_with_model_name_df['nlp_model_name'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [582]:
for row_num in range(titles_with_model_name_df.shape[0]):
    title_vector = titles_with_model_name_df.iloc[row_num, 1:-4].values
    
    similarity_matrix = cosine_similarity(X=title_vector.reshape(1,-1), Y=models_matrix)[0]
    
    match_index = np.argmax(similarity_matrix)
    
    titles_with_model_name_df['nlp_model_name'].iloc[row_num] = models[match_index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [584]:
titles_with_model_name_df.head(35)

Unnamed: 0,index,four,excellent cond,7k oss,waterproof oran,a6300,s4100,d5100,cmos smart,slt a33,...,shipping,warranty,freeze,l810 26x,cmos,book,title,filtered_titles,model_name,nlp_model_name
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,sony cyber-shot dsc-rx100 20.2mp digital camer...,cyber-shot dsc-rx100 cheap,dsc-rx100,cyber-shot dsc-rx100
7,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,panasonic lumix dmc-g7 mirrorless micro four t...,dmc-g7 mirrorless micro four thirds,dmc-g7,dmc-g7
10,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,sony cyber-shot dsc-hx300 20.4mp digital camer...,cyber-shot dsc-hx300 . .,dsc-hx300,dsc-hx300
12,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,olympus e-3 10.1 mp digital slr camera - black...,e-3 activations,e-3,d40
13,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,sony cyber-shot dsc-h300 digital camera,cyber-shot dsc-h300,dsc-h300,dsc-h300
14,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,canon eos 7d 18.0 mp digital slr camera w genu...,eos 7d w nuine bg-e7 grip,bg-e7,7d
16,16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,sony cyber-shot dsc-tx20 16.2 mp digital camer...,cyber-shot dsc-tx20 green,dsc-tx20,cyber-shot dsc-rx100
19,19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,sony dsc-hx300 digital camera,dsc-hx300,dsc-hx300,dsc-hx300
22,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,pentax pentax optio wg-3 16.0 mp digital camer...,optio wg-3 lots,wg-3,optio wg-2
33,33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,olympus e-pl1 w 14-40 ex reserve,e-pl1 w ex reserve,e-pl1,e-pl1


**Test and Development**

In [520]:
# for title_vector in titles_matrix.todense():     
test_index = 23
title_vector = titles_no_model_name_df.iloc[test_index, 1:-4].values

similarity_matrix = cosine_similarity(X=title_vector.reshape(1,-1), Y=models_matrix)[0]

model_match_index = np.argmax(similarity_matrix)

print np.max(similarity_matrix)
print titles_no_model_name_df['index'].iloc[test_index]
print titles_no_model_name_df['title'].iloc[test_index]
print titles_no_model_name_df['filtered_titles'].iloc[test_index]
print models[model_match_index]

# modelMatch = models[np.argmax(similarity_matrix)]
# modelMatchSimScore = 

0.64911851009
32
canon eos body
 eos 
eos digital rebel


In [524]:
auctions['title'].iloc[32]

u'canon eos body'

In [224]:
print modelMatchIndex

592
