In [24]:
#imports
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
df_vball = pd.read_csv('./richards_vball_products.csv')
df_vball.head()

Unnamed: 0,prodName,prodImage-src,prodPrice,prodBadge
0,P-TEX Ankle Brace With Stabilizers,https://dks.scene7.com/is/image/dkscdn/17PTEUP...,,DicksExclusive
1,Spalding King of the Beach USA Replica Outdoor...,https://dks.scene7.com/is/image/dkscdn/16SPLUS...,$24.99,
2,Wilson Graffiti Outdoor Volleyball,https://dks.scene7.com/is/image/dkscdn/16WILUG...,$19.99,
3,Tandem Round Volleyball Blocking Pads,https://dks.scene7.com/is/image/dkscdn/17TANUR...,$49.99,
4,ASICS Women's Gel-Rocket 9 Volleyball Shoes,https://dks.scene7.com/is/image/dkscdn/19ASIWG...,,


In [26]:
df = df_vball.iloc[:,[0,2,3]]

In [27]:
df.head()

Unnamed: 0,prodName,prodPrice,prodBadge
0,P-TEX Ankle Brace With Stabilizers,,DicksExclusive
1,Spalding King of the Beach USA Replica Outdoor...,$24.99,
2,Wilson Graffiti Outdoor Volleyball,$19.99,
3,Tandem Round Volleyball Blocking Pads,$49.99,
4,ASICS Women's Gel-Rocket 9 Volleyball Shoes,,


In [28]:
prodNames = df.iloc[:,0]
prodNames

0                     P-TEX Ankle Brace With Stabilizers
1      Spalding King of the Beach USA Replica Outdoor...
2                     Wilson Graffiti Outdoor Volleyball
3                  Tandem Round Volleyball Blocking Pads
4            ASICS Women's Gel-Rocket 9 Volleyball Shoes
                             ...                        
371    Nike Women's Dri-Fit Legend Classic Volleyball...
372                       Tandem Volleyball Net Extender
373                           Tandem Volleyball Headband
374              Adidas Women's HiLo Short Sleeve Jersey
375           Mizuno Men's Volleyball Attack T-Shirt 2.0
Name: prodName, Length: 376, dtype: object

In [29]:
# Write a function for cleaning strings and returning an array of ngrams
def ngrams_analyzer(string):
    string = re.sub(r'[,-./]', r'', string)
    ngrams = zip(*[string[i:] for i in range(5)])  # N-Gram length is 5
    return [''.join(ngram) for ngram in ngrams]

In [30]:
# Construct your vectorizer for building the TF-IDF matrix
vectorizer = TfidfVectorizer(analyzer=ngrams_analyzer)

# Build the matrix
tfidf_matrix = vectorizer.fit_transform(prodNames)

In [31]:
# Import IGN's awesome_cossim_topn module
from sparse_dot_topn import awesome_cossim_topn

In [32]:
# The arguments for awesome_cossim_topn are as follows:
### 1. Our TF-IDF matrix
### 2. Our TF-IDF matrix transposed (allowing us to build a pairwise cosine matrix)
### 3. A top_n filter, which allows us to filter the number of matches returned, which isn't useful for our purposes
### 4. This is our similarity threshold. Only values over 0.8 will be returned
cosine_matrix = awesome_cossim_topn(
  tfidf_matrix,
  tfidf_matrix.transpose(),
  prodNames.size,
  0.3
)

In [33]:
# Build a coordinate matrix from a cosine matrix
coo_matrix = cosine_matrix.tocoo()

In [34]:
# Instaniate our lookup hash table
group_lookup = {}

def find_group(row, col):
    # If either the row or the col string have already been given
    # a group, return that group. Otherwise return none
    if row in group_lookup:
        return group_lookup[row]
    elif col in group_lookup:
        return group_lookup[col]
    else:
        return None

In [35]:
def add_vals_to_lookup(group, row, col):
    # Once we know the group name, set it as the value
    # for both strings in the group_lookup
    group_lookup[row] = group
    group_lookup[col] = group


def add_pair_to_lookup(row, col):
    # in this function we'll add both the row and the col to the lookup
    group = find_group(row, col)  # first, see if one has already been added
    if group is not None:
        # if we already know the group, make sure both row and col are in lookup
        add_vals_to_lookup(group, row, col)
    else:
        # if we get here, we need to add a new group.
        # The name is arbitrary, so just make it the row
        add_vals_to_lookup(row, row, col)

In [36]:
# for each row and column in coo_matrix
# if they're not the same string add them to the group lookup
for row, col in zip(coo_matrix.row, coo_matrix.col):
    if row != col:
        # Note that what is passed to add_pair_to_lookup is the string at each index
        # (eg: the names in the legal_name column) not the indices themselves
        add_pair_to_lookup(prodNames[row], prodNames[col])

In [37]:
df['Group'] = df['prodName'].map(group_lookup).fillna(df['prodName'])

In [38]:
df.head()

Unnamed: 0,prodName,prodPrice,prodBadge,Group
0,P-TEX Ankle Brace With Stabilizers,,DicksExclusive,Shock Doctor Ultra Wrap Lace Ankle Brace
1,Spalding King of the Beach USA Replica Outdoor...,$24.99,,Spalding King of the Beach USA Replica Outdoor...
2,Wilson Graffiti Outdoor Volleyball,$19.99,,Wilson Graffiti Outdoor Volleyball
3,Tandem Round Volleyball Blocking Pads,$49.99,,Tandem Round Volleyball Blocking Pads
4,ASICS Women's Gel-Rocket 9 Volleyball Shoes,,,Mizuno Women's Wave Voltage Volleyball Shoes


In [39]:
df['Group'].unique()

array(['Shock Doctor Ultra Wrap Lace Ankle Brace',
       'Spalding King of the Beach USA Replica Outdoor Volleyball',
       'Wilson Graffiti Outdoor Volleyball',
       'Tandem Round Volleyball Blocking Pads',
       "Mizuno Women's Wave Voltage Volleyball Shoes",
       'Champion Volleyball Trainer Set',
       "Nike Women's React Hyperset Volleyball Shoes",
       'Tandem Collapsible Spike Trainer',
       'DSG Solana Indoor/Outdoor Volleyball',
       "DICK'S Sporting Goods Mesh Ball Bag", 'Tandem Volleyball Pal',
       "Nike Women's Swoosh Logo Printed Sports Bra",
       'Tachikara SV-5WM Indoor Volleyball',
       'Mizuno Core Flat Front Vortex Hybrid 3.5" Volleyball Shorts',
       'Wilson Pro Tour Indoor Volleyball',
       'Wilson K1 Silver Indoor Volleyball', "Nike Girls' Fury Headband",
       'Tandem Volleyball Target Challenger',
       'Tandem Volleyball Small Reaction Ball',
       'Wilson AVP Special Edition II Outdoor Volleyball',
       'Tandem Volleyball Ponytail 

In [40]:
df.to_csv('./groups_threshold_10percent.csv')