In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import re
import MonkeyScope

In [2]:
nlp = spacy.load("en_core_web_lg")

In [3]:
data = pd.read_csv("cannabis-working-2.csv")

In [4]:
data['Effects'] = data['Effects'].apply(lambda s: 'Relaxed,Creative,Aroused' if s == 'Dry Mouth' else s)

In [5]:
data = data.rename(columns={'Strain': 'Name'})

In [6]:
data['Type'] = data['Type'].str.title()

In [7]:
data['Flavors'] = data['Flavors'].str.replace('/', ',')

In [8]:
def fix_string(string: str) -> str:
    return string.replace(
        '\u2018', "'",
    ).replace(
        '\u2019', "'",
    ).replace(
        '\u201c', "'",
    ).replace(
        '\u201d', "'",
    ).replace(
        '\u00f1', "n",
    ).replace(
        '\u2013', "-",
    ).replace(
        '\u2014', "-",
    ).replace(
        '\u014d', "o",
    ).replace(
        '\u2026', '-',
    ).replace(
        '\u0101', 'a',
    )

In [10]:
data['Name'] = data['Name'].apply(fix_string)
data['Description'] = data['Description'].apply(fix_string)

In [11]:
def tokenize(document):
    doc = nlp(document)
    return [
        token.lemma_.strip() for token in doc 
        if not token.is_stop and not token.is_punct
    ]

In [12]:
tfidf = TfidfVectorizer(
    stop_words='english',
    tokenizer=tokenize,
    ngram_range=(1, 3),
    max_df=.97,
    min_df=3,
)
dtm = tfidf.fit_transform(data['Description'] + data['Effects'] + data['Flavors'])
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

In [13]:
dtm.head()

Unnamed: 0,09,1,1 afghani,1 genetic,1 hybrid,1 know,1 northern,1 northern light,1 strain,10,...,zealand,zest,zestful,zesty,zesty lemon,zesty lemon aroma,zion,zombie,zombie og,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
nn = NearestNeighbors(n_neighbors=6, algorithm='kd_tree', n_jobs=8)
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=8, n_neighbors=6, p=2, radius=1.0)

In [15]:
def nearest(idx):
    return ','.join(map(str, nn.kneighbors([dtm.iloc[idx]])[1][0][1:].tolist()))

In [16]:
nearest(440)

'896,803,705,730,1338'

In [17]:
data['Nearest'] = data['Index'].apply(nearest)

In [18]:
data.head()

Unnamed: 0,Index,Name,Type,Rating,Effects,Description,Flavors,Nearest
0,0,Kelly Hill Gold,Indica,5.0,"Happy,Energetic,Euphoric,Talkative,Aroused",Cultivated by Joseph Arthur Botanicals in Colo...,"Pepper,Earthy,Coffee",9928974391841877
1,1,Spyder Mon,Hybrid,5.0,"Uplifted,Creative,Focused,Happy,Relaxed",Spyder Mon is an uplifting CBD strain with a g...,"Citrus,Earthy,Sweet",2181571627223742
2,2,Mochi,Hybrid,5.0,"Sleepy,Happy,Hungry,Relaxed,Tingly",Mochi by Sherbinski is another strain that lea...,"Pungent,Minty,Flowery",614457208705130
3,3,Molokai Purpz,Indica,5.0,"Aroused,Creative,Euphoric,Relaxed,Sleepy",Moloka'i Purpz is a luscious Hawaiian landrace...,"Berry,Grape,Sweet",1478562106350390
4,4,Monolith,Indica,5.0,"Relaxed,Sleepy,Tingly,Euphoric,Focused",Monolith is an indica-dominant strain with Afg...,"Pungent,Earthy,Pine",13978621094214141


In [19]:
data.tail()

Unnamed: 0,Index,Name,Type,Rating,Effects,Description,Flavors,Nearest
2150,2150,True X,Indica,0.45,"Uplifted,Focused,Sleepy,Relaxed,Tingly",True X by Royal Choice Farms is an indica-domi...,"Earthy,Woody,Pine",13511785271968175
2151,2151,White Strawberry,Hybrid,0.45,Happy,White Strawberry is the flavorful cross of The...,"Citrus,Sweet",652147781975843
2152,2152,Queens Panties,Hybrid,0.45,"Talkative,Happy,Energetic,Tingly,Uplifted",Queen's Panties is a royally stimulating sativ...,"Sweet,Lemon,Citrus",23814782401605498
2153,2153,Avi,Hybrid,0.45,"Relaxed,Focused,Uplifted,Creative,Happy",Avi is a high-CBD strain grown by Canadian LP ...,"Earthy,Pungent",5352556302991207
2154,2154,Blue Bayou,Sativa,0.0,"Giggly,Uplifted,Creative,Happy,Sleepy",Blue Bayou can only be described as a comprehe...,Sweet,135021251184857881


In [20]:
data[data['Name'] == 'Pink Cookies']

Unnamed: 0,Index,Name,Type,Rating,Effects,Description,Flavors,Nearest
896,896,Pink Cookies,Hybrid,3.86,"Relaxed,Aroused,Creative,Giggly,Happy","Pink Cookies, also known as Wedding Cake, is t...","Sweet,Earthy,Pungent",44080370596730


In [21]:
one_nn = NearestNeighbors(n_neighbors=1, algorithm='kd_tree', n_jobs=8)
one_nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=8, n_neighbors=1, p=2, radius=1.0)

In [22]:
def recommender(user_input):
    recommendations = one_nn.kneighbors(tfidf.transform([user_input]).todense())[1]
    return dict(data.loc[recommendations[0][0]])

In [23]:
user_input = 'I am suffering from headaches. Looking for a sweet flavor and being uplifted and energetic.'

In [24]:
recommender(user_input)

{'Index': 1354,
 'Name': 'Lemon Jack',
 'Type': 'Sativa',
 'Rating': 3.64,
 'Effects': 'Focused,Energetic,Happy,Uplifted,Creative',
 'Description': "While Lemon Jack has yet to earn the reputation of its father, Jack Herer, this sativa deserves some respect. A potent strain featuring a distinct, chemical-like lemon smell, Lemon Jack has strong psychoactive effects. Focused and energizing, this strain's effects draw from both its Jack Herer and Lemon Kush heritage. Like a strong cup of coffee, Lemon Jack is a daytime strain. Patients who suffer from headaches and fatigue tend to find relief with this strain, though it may not be the best choice for those who suffer from anxiety.",
 'Flavors': 'Lemon,Citrus,Sweet',
 'Nearest': '1895,425,824,2122,1707'}

In [25]:
data.to_csv("cannabis-5.csv", index=None)