## Content Based Filtering

Uses similarities between patterns based on the angular distance or Euclidean distance between pattern features. Feature engineering for this recommender can have a large impact on results.


look here wh

https://github.com/JimKing100/strains-live/blob/master/model/tf_knn.ipynb
(above good example)

below numpy
https://towardsdatascience.com/building-a-content-based-recommender-for-data-science-articles-728e5ec7d63d

In [1]:
# import libraries
import pandas as pd
import numpy as np
import ast

# For numerically encoding and preprocessing patterns in order to compare similarity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion

# similarity metrics 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.metrics.pairwise import euclidean_distances

# Visualization of pipeline
from sklearn import set_config

from util_functions import *


In [2]:
# Import data: load the cleaned pattern table from disk.
df = pd.read_csv('data/patterns_cleaned.csv', low_memory=False)
# Turn off pandas' chained-assignment warning for the whole session.
# NOTE(review): this also hides the warning for genuine assign-to-a-copy
# mistakes, so later cells that add columns to slices fail silently if wrong.
pd.options.mode.chained_assignment = None 

In [178]:
# DROP OUTLIERS THAT AFFECT SCALING! 

In [5]:
# Dimensions of the loaded table as (rows, columns).
df.shape

(132843, 24)

In [7]:
# Per-column summary (non-null counts and dtypes); shows which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132843 entries, 0 to 132842
Data columns (total 24 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   pattern_id               132843 non-null  int64  
 1   name                     132843 non-null  object 
 2   name_permalink           132843 non-null  object 
 3   favorites_count          132843 non-null  int64  
 4   projects_count           132843 non-null  int64  
 5   difficulty_average       132843 non-null  float64
 6   difficulty_count         131547 non-null  float64
 7   rating_average           132843 non-null  float64
 8   queued_projects_count    132843 non-null  int64  
 9   rating_count             132841 non-null  float64
 10  pattern_type_names       132842 non-null  object 
 11  pattern_type_clothing    132842 non-null  object 
 12  photos_url               132843 non-null  object 
 13  pattern_needle_sizes     132843 non-null  object 
 14  patt

### Processing Pipeline

In [4]:
# TODO:
#   - weight variables depending on importance
#   - impute NaN from a dictionary, e.g. the average yarn weight / needle size
#     for the pattern type

# Column groups consumed by the ColumnTransformer below.
categorical_features = ['free', 'pattern_type_names',  'coded_year']#,'downloadable', 'coded_month',]
numeric_features = ['yardage', 'difficulty_average','gauge_per_inch', 'yardage_avg',]
# Columns that are imputed but neither scaled nor one-hot encoded.
# Presumably already numeric after the custom helper steps below - TODO confirm
# against util_functions.
custom_function_pre_encoded_features = ['yarn_weight_description', 'needle_sizes']

# Stage 1: DataFrame-level feature engineering. Each helper from util_functions
# is wrapped in DataframeFunctionTransformer and applied in the order listed.
# NOTE(review): the helpers' exact inputs/outputs are not visible here; judging
# by the column lists above they presumably create 'gauge_per_inch',
# 'yardage_avg', 'needle_sizes' and 'coded_year' - confirm in util_functions.
custom_function_transformer = Pipeline(steps=[
                                    ("cosolidate_gauge", DataframeFunctionTransformer(consolidate_gauge)),
                                    ("use_avg_yardage", DataframeFunctionTransformer(use_avg_yardage)),
                                    ("encode_yarn_weights", DataframeFunctionTransformer(encode_yarn_weights)),
                                    ("get_needle_size", DataframeFunctionTransformer(get_needle_size)), 
#                                     ("encode_months",DataframeFunctionTransformer(code_months)), 
                                    ("encode_years",DataframeFunctionTransformer(code_years)), 
                              # NOTE NEED TO DOWNWEIGHT THESE!!
                                    ]) 

# Text branch: get_corpus -> token counts (CountVectorizer) -> ToDenseTransformer.
# It receives the same input as main_pipeline (see the FeatureUnion below).
attributes_transformer = Pipeline(steps=[("get_corpus", FunctionTransformer(get_corpus)),
                                    ('count_vectorize_attributes', CountVectorizer()),
                                    ('to_dense', ToDenseTransformer())]) 


# Numeric columns: fill missing values with the column median, then standardise.
# (The step is named 'impute_mode' but the strategy used is the median.)
numeric_transformer = Pipeline(steps=[('impute_mode', SimpleImputer(strategy='median')), 
                                      ('scaling', StandardScaler())]) 


# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode into a dense array.
# NOTE(review): `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4 - confirm the installed version before upgrading.
categorical_transformer = Pipeline(steps=[('impute_mode', SimpleImputer(strategy='most_frequent')), 
                                          ('one-hot-encode', OneHotEncoder(sparse=False))])

# Pre-encoded columns: median imputation only, no scaling.
# (Again named 'impute_mode' although the strategy is the median.)
pre_encoded_feature_transformer = Pipeline(steps=[
                                    ('impute_mode', SimpleImputer(strategy='median'))])
                        

# Stage 2: route each column group to its transformer. No `remainder` is given,
# so scikit-learn's default applies and columns not listed here are dropped.
preprocessor = ColumnTransformer(
               transformers=[('pre-ecoded_features', pre_encoded_feature_transformer, custom_function_pre_encoded_features),
                             ('numeric', numeric_transformer, numeric_features),
                             ('categorical', categorical_transformer, categorical_features)]) 


# Feature engineering followed by per-column preprocessing.
main_pipeline = Pipeline(steps = [('custom_feature_transform', custom_function_transformer),
                            ('preprocessor', preprocessor)])

# Final feature matrix: output of main_pipeline and of the text branch,
# concatenated column-wise.
pipeline = FeatureUnion([('main_pipeline', main_pipeline),
                            ('attributes', attributes_transformer)])


In [5]:
# Visualize the pipeline: switch scikit-learn's estimator display to the
# diagram representation, then show `pipeline` as the cell output.
set_config(display='diagram')
pipeline

In [6]:
# Fit the pipeline and transform the data into the numeric feature matrix
# used for the similarity comparisons.
# The CSV is read again here, so `df` is a fresh copy of the file contents.
df = pd.read_csv('data/patterns_cleaned.csv', low_memory=False)
# X has one row per row of df, in the same order; the recommender cells rely
# on that when they use the same positions to index both X and df.
X = pipeline.fit_transform(df)
X.shape

(132843, 61)

## Calculating Recommendations:

### a) Euclidean Distance
We want to find the patterns closest distance-wise to the pattern we are using to compare.  This is why scaling is important, as the magnitudes could skew the vectors away from similar patterns, or strengthen important attributes. 

In [26]:
def find_top_eucliedean_recommendations_df(name_permalink):
    """Rank every pattern by Euclidean distance to the given pattern.

    Relies on the notebook-level globals ``X`` (feature matrix whose rows are
    in the same order as ``df``), ``df`` and ``pipeline``.

    Args:
        name_permalink: Permalink slug of the reference pattern,
            e.g. ``'jasmine-the-giraffe'``.

    Returns:
        A copy of ``df`` ordered from closest to farthest, with two extra
        columns: ``distances`` (Euclidean distance to the reference pattern)
        and ``rank`` (rank of that distance; ties share the average rank).

    Side effects:
        Adds or overwrites the ``distances`` column on the global ``df``.
    """
    try:
        pattern_to_compare = X[get_index_from_name_permalink(name_permalink, df)]
    except LookupError:
        # NOTE(review): assumes the helper signals an unknown permalink with a
        # KeyError/IndexError - confirm in util_functions. Anything else now
        # propagates instead of being swallowed by a bare `except:`.
        print("pattern wasn't processed yet - try to process it now")
        # Fetch the raw metadata first and *then* encode it with the fitted
        # pipeline, so the vector compared against X lives in the same feature
        # space. (Previously the encoded vector was overwritten by the raw
        # metadata, and an undefined `pattern_url` made this branch always fail.)
        # TODO: download the single pattern when it is not in `df` at all.
        pattern_metadata = get_metadata_from_name_permalink(name_permalink, df)
        pattern_to_compare = pipeline.transform(pattern_metadata)

    # euclidean_distances expects 2-D input; a plain ndarray row is 1-D.
    if getattr(pattern_to_compare, 'ndim', 2) == 1:
        pattern_to_compare = pattern_to_compare.reshape(1, -1)

    # Distance from every row of X to the reference pattern, flattened to 1-D.
    distances = euclidean_distances(X, pattern_to_compare).reshape(-1)
    df['distances'] = distances

    # Row positions sorted from smallest to largest distance.
    ordered_indices = distances.argsort()

    # Explicit copy so that adding `rank` never touches (or warns about) `df`.
    closest_df = df.iloc[ordered_indices].copy()
    closest_df['rank'] = closest_df['distances'].rank()
    return closest_df

def list_top_euclidean_recommendations(df, n=20):
    """Extract the display fields of the first ``n`` rows of a ranked frame.

    Args:
        df: DataFrame with ``name_permalink``, ``photos_url`` and ``distances``
            columns, already ordered from best to worst match.
        n: Number of leading rows to return. Defaults to 20, the value that
            used to be hard-coded.

    Returns:
        A tuple of four equally long lists:
        ``(name_permalink, image_url, url, distances)``, where ``url`` is the
        Ravelry library URL built from each permalink.
    """
    top = df.iloc[:n]

    # Go through to_numpy() so the elements keep the same scalar types that
    # per-row `.iloc[i]` access used to return.
    name_permalink = list(top['name_permalink'].to_numpy())
    image_url = list(top['photos_url'].to_numpy())
    distances = list(top['distances'].to_numpy())
    url = ['https://www.ravelry.com/patterns/library/' + permalink
           for permalink in name_permalink]
    return name_permalink, image_url, url, distances

def print_top_euclidean_recommendations(name_permalink_list, image_url, url, distances):
    """Print one line per recommendation: permalink, URL and distance.

    The lists are read in parallel by position; the distance is shown with
    four decimal places. ``image_url`` is accepted but not printed.
    """
    for position, permalink in enumerate(name_permalink_list):
        link = url[position]
        gap = distances[position]
        print(f'{permalink},\t {link}, \t {gap:.4f}')

#### Recommend Patterns:

In [28]:
# Permalink slug of the pattern to find neighbours for.
name_permalink='jasmine-the-giraffe'

# Rank all patterns by distance, pull out the display fields of the leading
# rows, and print them one per line.
recommended_df= find_top_eucliedean_recommendations_df(name_permalink)
name_permalink_list, image_url, url, distances = list_top_euclidean_recommendations(recommended_df)
print_top_euclidean_recommendations(name_permalink_list,  image_url,  url, distances)

jasmine-the-giraffe,	 https://www.ravelry.com/patterns/library/jasmine-the-giraffe, 	 0.0000
tea-for-three,	 https://www.ravelry.com/patterns/library/tea-for-three, 	 0.2946
loopy-sheep-2,	 https://www.ravelry.com/patterns/library/loopy-sheep-2, 	 0.9193
cyber-sam,	 https://www.ravelry.com/patterns/library/cyber-sam, 	 1.0420
night-owl-2,	 https://www.ravelry.com/patterns/library/night-owl-2, 	 1.0420
run-run-rudolph,	 https://www.ravelry.com/patterns/library/run-run-rudolph, 	 1.0423
bravo-bunnies,	 https://www.ravelry.com/patterns/library/bravo-bunnies, 	 1.0432
bunny-babes--beach-boys-part-2,	 https://www.ravelry.com/patterns/library/bunny-babes--beach-boys-part-2, 	 1.0456
tooth-trader,	 https://www.ravelry.com/patterns/library/tooth-trader, 	 1.0484
gnome-and-gnomette,	 https://www.ravelry.com/patterns/library/gnome-and-gnomette, 	 1.0505
frog-prince-3,	 https://www.ravelry.com/patterns/library/frog-prince-3, 	 1.0520
artie-the-painter,	 https://www.ravelry.com/patterns/library/ar

In [23]:
# Recommended metadata: first 10 rows of the distance-ordered frame
# (closest matches first).
recommended_df.head(10)

Unnamed: 0,pattern_id,name,name_permalink,favorites_count,projects_count,difficulty_average,difficulty_count,rating_average,queued_projects_count,rating_count,...,gauge,gauge_divisor,free,downloadable,categories,yarn_weight_description,gauge_per_inch,yardage_avg,distances,rank
39027,208922,Lara,lara-7,2691,121,5.048387,62.0,4.312500,600,64.0,...,27.00,4.0,False,True,"['bonnet', 'hat', 'accessories']",DK (11 wpi),6.75,459.0,0.000000,1.0
34451,146458,Wylde Oats Hat,wylde-oats-hat,610,20,3.909091,11.0,4.000000,85,10.0,...,25.00,4.0,False,True,"['other-hat', 'hat', 'accessories']",DK (11 wpi),6.25,,1.073629,2.0
107592,36364,Melissa's Hat,melissas-hat,121,38,4.150000,20.0,4.526316,14,19.0,...,24.00,4.0,False,True,"['earflap', 'hat', 'accessories']",DK (11 wpi),6.00,,1.443996,3.0
33983,140911,Hugo Hat,hugo-hat,155,74,5.393939,33.0,3.571429,40,28.0,...,26.00,4.0,False,True,"['billed', 'hat', 'accessories']",DK (11 wpi),6.50,492.0,1.455614,4.0
110102,135873,Lubushka Hat,lubushka-hat,1790,14,5.666667,3.0,5.000000,168,3.0,...,5.50,1.0,False,True,"['cloche', 'hat', 'accessories']",DK (11 wpi),5.50,175.0,1.515652,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52146,532635,Pathway Home,pathway-home,138,13,1.750000,4.0,4.200000,20,5.0,...,57.00,1.0,True,True,"['beanie-toque', 'hat', 'accessories']",Worsted (9 wpi),57.00,142.0,30.896642,132839.0
73991,884801,JederMANNs Kurt,jedermanns-kurt,166,14,3.250000,4.0,4.800000,23,5.0,...,24.00,4.0,False,True,"['pullover', 'sweater', 'clothing']",Sport (12 wpi),6.00,8913.0,32.625840,132840.0
92541,429626,The Eagle has Landed,the-eagle-has-landed,215,40,1.562500,16.0,4.882353,33,17.0,...,99.99,1.0,True,True,"['washcloth', 'cleaning', 'home']",Worsted (9 wpi),99.99,94.0,57.025225,132841.0
15871,1729,Infant Kimono,infant-kimono,26,11,1.625000,8.0,3.714286,5,7.0,...,99.99,1.0,False,False,"['cardigan', 'sweater', 'clothing']",Aran / Worsted,99.99,250.0,57.049987,132842.0


Let's try another:

In [30]:
# Second example: a sweater pattern.
name_permalink='professor-meow-sweater'

# Same three steps as before: rank by distance, extract the display fields of
# the leading rows, print them.
recommended_df= find_top_eucliedean_recommendations_df(name_permalink)
name_permalink_list, image_url, url, distances = list_top_euclidean_recommendations(recommended_df)
print_top_euclidean_recommendations(name_permalink_list,  image_url,  url, distances)

professor-meow-sweater,	 https://www.ravelry.com/patterns/library/professor-meow-sweater, 	 0.0000
haiyuki,	 https://www.ravelry.com/patterns/library/haiyuki, 	 0.6009
gertrude-8,	 https://www.ravelry.com/patterns/library/gertrude-8, 	 0.6146
julia-yoke-sweater,	 https://www.ravelry.com/patterns/library/julia-yoke-sweater, 	 0.6191
vejr,	 https://www.ravelry.com/patterns/library/vejr, 	 0.6340
soca,	 https://www.ravelry.com/patterns/library/soca, 	 0.6593
aosta-sweater,	 https://www.ravelry.com/patterns/library/aosta-sweater, 	 0.6931
shop-frock,	 https://www.ravelry.com/patterns/library/shop-frock, 	 0.7219
anyday-sweater,	 https://www.ravelry.com/patterns/library/anyday-sweater, 	 0.7654
wildflower-and-moss-sweater,	 https://www.ravelry.com/patterns/library/wildflower-and-moss-sweater, 	 0.7716
novice-sweater---chunky-edition,	 https://www.ravelry.com/patterns/library/novice-sweater---chunky-edition, 	 0.7743
sweater-no-9,	 https://www.ravelry.com/patterns/library/sweater-no-9, 	 0.7

In [31]:
# Recommended metadata: first 10 rows of the distance-ordered frame
# (closest matches first).
recommended_df.head(10)

Unnamed: 0,pattern_id,name,name_permalink,favorites_count,projects_count,difficulty_average,difficulty_count,rating_average,queued_projects_count,rating_count,...,gauge,gauge_divisor,free,downloadable,categories,yarn_weight_description,gauge_per_inch,yardage_avg,distances,rank
56673,745474,Professor Meow Sweater,professor-meow-sweater,6099,526,2.864198,162.0,4.596386,743,166.0,...,14.00,4.0,False,True,"['pullover', 'sweater', 'clothing']",Bulky (7 wpi),3.50,923.0,0.000000,1.0
60075,886340,haiyuki,haiyuki,2804,75,2.636364,33.0,4.914286,212,35.0,...,13.00,4.0,False,True,"['pullover', 'sweater', 'clothing']",Bulky (7 wpi),3.25,840.0,0.600874,2.0
61837,940002,Gertrude,gertrude-8,424,21,2.500000,4.0,4.500000,36,4.0,...,11.00,4.0,False,True,"['pullover', 'sweater', 'clothing']",Bulky (7 wpi),2.75,924.0,0.614629,3.0
60777,1105517,Julia Yoke Sweater,julia-yoke-sweater,94,12,3.000000,4.0,5.000000,10,4.0,...,15.00,4.0,False,True,"['pullover', 'sweater', 'clothing']",Bulky (7 wpi),3.75,780.0,0.619078,4.0
6941,978274,VEJR,vejr,288,12,2.666667,3.0,3.666667,19,3.0,...,12.00,4.0,False,True,"['pullover', 'sweater', 'clothing']",Bulky (7 wpi),3.00,910.0,0.633984,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73991,884801,JederMANNs Kurt,jedermanns-kurt,166,14,3.250000,4.0,4.800000,23,5.0,...,24.00,4.0,False,True,"['pullover', 'sweater', 'clothing']",Sport (12 wpi),6.00,8913.0,32.855066,132839.0
52146,532635,Pathway Home,pathway-home,138,13,1.750000,4.0,4.200000,20,5.0,...,57.00,1.0,True,True,"['beanie-toque', 'hat', 'accessories']",Worsted (9 wpi),57.00,142.0,32.951107,132840.0
15871,1729,Infant Kimono,infant-kimono,26,11,1.625000,8.0,3.714286,5,7.0,...,99.99,1.0,False,False,"['cardigan', 'sweater', 'clothing']",Aran / Worsted,99.99,250.0,59.002514,132841.0
92541,429626,The Eagle has Landed,the-eagle-has-landed,215,40,1.562500,16.0,4.882353,33,17.0,...,99.99,1.0,True,True,"['washcloth', 'cleaning', 'home']",Worsted (9 wpi),99.99,94.0,59.052415,132842.0


Okay - This prediction didn't do so bad; these patterns have the loose chunky knit feel - which is great, but missing the "fun" cat vibe. 

### b) Cosine Similarity

Similarity can be calculated as the cosine of the angle between two vectors.  Cosine similarity ranges from -1 to 1: 1 means the vectors are collinear (identical tastes), 0 means they are orthogonal (no similarity or commonality - they are independent), and -1 means they point in opposite directions (opposite tastes).

In [44]:
X.shape
# Keep only the first 60000 rows so the pairwise similarity matrix stays
# manageable on this machine. Patterns at positions >= 60000 are therefore
# missing from anything computed on X_smaller.
# TODO: rerun on the full matrix in the cloud and save cosine_sim as a pickle?
X_smaller = X[0:60000,:]
X_smaller.shape

(60000, 61)

In [None]:
# Pairwise cosine similarity between all rows of X_smaller: a square matrix
# with one row and one column per pattern in the subset.
# NOTE(review): at 60000 rows that is 3.6e9 entries (about 29 GB if stored as
# float64) - confirm the machine has enough memory before running.
cosine_sim = cosine_similarity(X_smaller)

In [35]:
# Pattern to find neighbours for, and its row position in df.
name_permalink_to_use='jasmine-the-giraffe'
pattern_index = get_index_from_name_permalink(name_permalink_to_use, df)
# Pair every similarity in this pattern's row with its column position:
# [(position, similarity), ...].
# Assumes pattern_index is a single integer position that lies inside
# cosine_sim - TODO confirm against get_index_from_name_permalink.
similar_patterns = list(enumerate(cosine_sim[pattern_index]))

In [36]:
# Order the (position, similarity) pairs from most to least similar ...
sorted_similar_patterns  = sorted(similar_patterns, key=lambda x:x[1], reverse=True)
# ... and keep the first 20. Despite the name, each entry is still a
# (position, similarity) pair, not a bare index.
closest_indices = sorted_similar_patterns[:20]

In [42]:
# Give recommendations for the selected pattern.
# `closest_indices` holds (row_position, cosine_similarity) pairs, most similar
# first; the first entry is used for the header line.
pattern_ids = []  # not used in this cell; kept in case a later cell expects the name
ordered_indices = [row_position for row_position, _ in closest_indices]

for i, (row_position, similarity) in enumerate(closest_indices):
    permalink = df.iloc[row_position]['name_permalink']
    if i == 0:
        print('Recommendations for similar patterns to {0} {1}:\n'.format(row_position, permalink))
    # The score is a cosine similarity (higher means closer), not a distance,
    # so it is labelled as such.
    print(f"{i}: {permalink} with similarity of: {similarity}")

# Metadata of the recommended patterns, in recommendation order.
closest_df = df.iloc[ordered_indices]
closest_df.head(10)

Recommendations for similar patterns to 38902 jasmine-the-giraffe:

0: jasmine-the-giraffe with distance of: 0.9999999999999997
1: night-owl-2 with distance of: 0.9870087323513952
2: cyber-sam with distance of: 0.9870087323513952
3: bunny-babes--beach-boys-part-2 with distance of: 0.9869439781458962
4: tooth-trader with distance of: 0.9868051991658844
5: snuggly-sox with distance of: 0.9866086835808155
6: seahorse with distance of: 0.986397512109007
7: im-canadien-eh with distance of: 0.9863314619140071
8: nudibranch with distance of: 0.9857610703534415
9: baby-buggies with distance of: 0.9855253254320294
10: mr-roboto-5 with distance of: 0.9852862749727247
11: bridal-bunnies with distance of: 0.9852022185149579
12: howdy-pardner with distance of: 0.9846878788511405
13: monkey-13 with distance of: 0.9846579342533351
14: jellyfish-2 with distance of: 0.9834394426918109
15: lola-kitty-kat with distance of: 0.9832542596298476
16: harvest-mice with distance of: 0.9829311187291669
17: eleph

Unnamed: 0,pattern_id,name,name_permalink,favorites_count,projects_count,difficulty_average,difficulty_count,rating_average,queued_projects_count,rating_count,...,gauge,gauge_divisor,free,downloadable,categories,yarn_weight_description,gauge_per_inch,yardage_avg,distances,rank
38902,207707,Jasmine the Giraffe,jasmine-the-giraffe,967,34,4.0,11.0,4.363636,114,11.0,...,25.0,4.0,False,True,"['animal', 'softies', 'toysandhobbies']",DK (11 wpi),6.25,,5.965051,19.0
9583,194839,Night Owl,night-owl-2,1238,174,4.0,75.0,4.486486,161,74.0,...,26.0,4.0,False,True,"['animal', 'softies', 'toysandhobbies']",DK (11 wpi),6.5,,6.066636,103.0
32524,124403,Cyber Sam,cyber-sam,119,31,4.0,22.0,4.318182,13,22.0,...,26.0,4.0,False,True,"['doll', 'softies', 'toysandhobbies']",DK (11 wpi),6.5,,6.066636,103.0
27047,63723,"Bunny Babes & Beach Boys, Part 2",bunny-babes--beach-boys-part-2,99,23,3.9,10.0,4.3,12,10.0,...,26.0,4.0,False,True,"['animal', 'softies', 'toysandhobbies']",DK (11 wpi),6.5,,6.053083,124.0
27680,70660,Tooth Trader,tooth-trader,274,61,4.133333,30.0,4.478261,50,23.0,...,26.0,4.0,False,True,"['doll', 'softies', 'toysandhobbies']",DK (11 wpi),6.5,,6.086592,84.0
35710,161605,Snuggly Sox,snuggly-sox,402,55,3.777778,27.0,4.458333,63,24.0,...,26.0,4.0,False,True,"['animal', 'softies', 'toysandhobbies']",DK (11 wpi),6.5,,6.038179,176.0
20984,20171,Seahorse,seahorse,1517,128,4.880597,67.0,4.453125,269,64.0,...,6.0,1.0,False,True,"['animal', 'softies', 'toysandhobbies']",Worsted (9 wpi),6.0,,5.184655,89.0
32392,122447,"I'm Canadien, eh!",im-canadien-eh,415,12,3.111111,9.0,4.166667,47,6.0,...,22.0,4.0,False,True,"['animal', 'softies', 'toysandhobbies']",Worsted (9 wpi),5.5,,5.130982,1136.0
7747,25999,Nudibranch,nudibranch,1244,101,4.702128,47.0,4.225,213,40.0,...,6.0,1.0,False,True,"['animal', 'softies', 'toysandhobbies']",Worsted (9 wpi),6.0,150.0,5.211893,177.0
32760,127032,Baby Buggies,baby-buggies,79,38,3.588235,17.0,4.294118,15,17.0,...,26.0,4.0,False,True,"['animal', 'softies', 'toysandhobbies']",DK (11 wpi),6.5,,6.018706,262.0


And try the sweater like above to compare:

In [43]:
# Same lookup as for the giraffe, now for the sweater pattern.
name_permalink_to_use='professor-meow-sweater'
pattern_index = get_index_from_name_permalink(name_permalink_to_use, df)
# In the recorded run this line raised IndexError: the pattern's position
# (56673) was beyond the rows included in cosine_sim, which is computed on a
# leading subset of X only.
similar_patterns = list(enumerate(cosine_sim[pattern_index]))

IndexError: index 56673 is out of bounds for axis 0 with size 50000

In [None]:
# Order the (position, similarity) pairs from most to least similar ...
sorted_similar_patterns  = sorted(similar_patterns, key=lambda x:x[1], reverse=True)
# ... and keep the first 20. Despite the name, each entry is still a
# (position, similarity) pair, not a bare index.
closest_indices = sorted_similar_patterns[:20]

In [None]:
# Give recommendations for the selected pattern.
# `closest_indices` holds (row_position, cosine_similarity) pairs, most similar
# first; the first entry is used for the header line.
pattern_ids = []  # not used in this cell; kept in case a later cell expects the name
ordered_indices = [row_position for row_position, _ in closest_indices]

for i, (row_position, similarity) in enumerate(closest_indices):
    permalink = df.iloc[row_position]['name_permalink']
    if i == 0:
        print('Recommendations for similar patterns to {0} {1}:\n'.format(row_position, permalink))
    # The score is a cosine similarity (higher means closer), not a distance,
    # so it is labelled as such.
    print(f"{i}: {permalink} with similarity of: {similarity}")

# Metadata of the recommended patterns, in recommendation order.
closest_df = df.iloc[ordered_indices]
closest_df.head(10)