In [2]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import re

In [3]:
nlp = spacy.load("en_core_web_lg")

In [49]:
data = pd.read_csv("cannabis-working-2.csv")

In [50]:
data['Effects'] = data['Effects'].apply(lambda s: 'Relaxed,Creative,Aroused' if s == 'Dry Mouth' else s)

In [51]:
data[data['Index'] == 1996]

Unnamed: 0,Index,Strain,Type,Rating,Effects,Description,Flavors,Nearest
1996,1996,Mendocino Madness,hybrid,2.73,"Relaxed,Creative,Aroused",Mendocino Madness is the fastest growing hybri...,"Earthy,Chestnut",199624160880117871854


In [52]:
def tokenize(document):
    doc = nlp(document)
    return [
        token.lemma_.strip() for token in doc 
        if not token.is_stop and not token.is_punct
    ]

In [53]:
tfidf = TfidfVectorizer(
    stop_words='english',
    tokenizer=tokenize,
    ngram_range=(1, 3),
    max_df=.97,
    min_df=3,
)
dtm = tfidf.fit_transform(data['Description'] + data['Effects'] + data['Flavors'])
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

In [63]:
dtm.head()

Unnamed: 0,09,1,1 afghani,1 genetic,1 hybrid,1 know,1 northern,1 northern light,1 strain,10,...,zealand,zest,zestful,zesty,zesty lemon,zesty lemon aroma,zion,zombie,zombie og,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
nn = NearestNeighbors(n_neighbors=6, algorithm='kd_tree', n_jobs=8)
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=8, n_neighbors=6, p=2, radius=1.0)

In [56]:
def nearest(idx):
    return ','.join(map(str, nn.kneighbors([dtm.iloc[idx]])[1][0].tolist()))

In [57]:
nearest(440)

'440,896,803,705,730,1338'

In [58]:
data['Nearest'] = data['Index'].apply(nearest)

In [59]:
data.head()

Unnamed: 0,Index,Strain,Type,Rating,Effects,Description,Flavors,Nearest
0,0,Kelly Hill Gold,indica,5.0,"Happy,Energetic,Euphoric,Talkative,Aroused",Cultivated by Joseph Arthur Botanicals in Colo...,"Pepper,Earthy,Coffee",9928974391841877
1,1,Spyder Mon,hybrid,5.0,"Uplifted,Creative,Focused,Happy,Relaxed",Spyder Mon is an uplifting CBD strain with a g...,"Citrus,Earthy,Sweet",12181571627223742
2,2,Mochi,hybrid,5.0,"Sleepy,Happy,Hungry,Relaxed,Tingly",Mochi by Sherbinski is another strain that lea...,"Pungent,Minty,Flowery",2614457208705130
3,3,Molokai Purpz,indica,5.0,"Aroused,Creative,Euphoric,Relaxed,Sleepy",Moloka’i Purpz is a luscious Hawaiian landrace...,"Berry,Grape,Sweet",31478562106350390
4,4,Monolith,indica,5.0,"Relaxed,Sleepy,Tingly,Euphoric,Focused",Monolith is an indica-dominant strain with Afg...,"Pungent,Earthy,Pine",413978621094214141


In [60]:
data.tail()

Unnamed: 0,Index,Strain,Type,Rating,Effects,Description,Flavors,Nearest
2150,2150,True X,indica,0.45,"Uplifted,Focused,Sleepy,Relaxed,Tingly",True X by Royal Choice Farms is an indica-domi...,"Earthy,Woody,Pine",215013511785271968175
2151,2151,White Strawberry,hybrid,0.45,Happy,White Strawberry is the flavorful cross of The...,"Citrus,Sweet",2151652147781975843
2152,2152,Queens Panties,hybrid,0.45,"Talkative,Happy,Energetic,Tingly,Uplifted",Queen’s Panties is a royally stimulating sativ...,"Sweet,Lemon,Citrus",215223814782401605498
2153,2153,Avi,hybrid,0.45,"Relaxed,Focused,Uplifted,Creative,Happy",Avi is a high-CBD strain grown by Canadian LP ...,"Earthy,Pungent",21535352556302991207
2154,2154,Blue Bayou,sativa,0.0,"Giggly,Uplifted,Creative,Happy,Sleepy",Blue Bayou can only be described as a comprehe...,Sweet,2154135021251184857881


In [61]:
data.to_csv("cannabis.csv", index=None)