In [0]:
# Imports
import pandas as pd

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

import pickle

In [0]:
# Build a bare English language object and a tokenizer that shares its vocabulary.
# NOTE(review): `tokenizer` is not referenced by the other cells visible here.
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

In [0]:
# Read the cannabis strain dataset (a CSV hosted on GitHub) into a DataFrame.
# Only the cannabis file is loaded in this cell.
CANNABIS_CSV_URL = "https://raw.githubusercontent.com/JimKing100/strains-live/master/data/cannabis.csv"
df = pd.read_csv(CANNABIS_CSV_URL)

In [4]:
# Preview the first five rows of the cannabis data
df.head(n=5)

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [0]:
# Discard every row with a missing value in any column (46 rows, per the
# original note), then renumber the remaining rows from 0 so the positional
# indices returned by the neighbor search can be used directly as df labels.
df = df.dropna().reset_index(drop=True)

In [0]:
# Join each strain's Effects and Flavor lists into one comma-separated
# 'Criteria' string that is later used as the text to vectorize
df['Criteria'] = df['Effects'].str.cat(df['Flavor'], sep=',')

In [0]:
# Set up the TF-IDF vectorizer. No custom tokenizer is passed, so scikit-learn's
# built-in tokenization is used; English stop words are filtered out.
tf = TfidfVectorizer(stop_words="english")

In [8]:
# Fit the vectorizer on the combined criteria text and build the document-term
# matrix: one row per strain, one column per vocabulary term. The cell values
# are TF-IDF weights, not raw word counts.
tfidf_sparse = tf.fit_transform(df['Criteria'].values.astype('U'))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(tf, 'get_feature_names_out'):
    vocab_terms = tf.get_feature_names_out()
else:
    vocab_terms = tf.get_feature_names()

# toarray() yields a plain ndarray instead of the np.matrix that todense()
# returns; the resulting DataFrame holds the same values.
dtm = pd.DataFrame(tfidf_sparse.toarray(), columns=vocab_terms)
dtm.head()

Unnamed: 0,ammonia,apple,apricot,aroused,berry,blue,blueberry,butter,cheese,chemical,chestnut,citrus,coffee,creative,diesel,dry,earthy,energetic,euphoric,flowery,focused,fruit,giggly,grape,grapefruit,happy,herbal,honey,hungry,lavender,lemon,lime,mango,menthol,mint,minty,mouth,nutty,orange,peach,pear,pepper,pine,pineapple,plum,pungent,relaxed,rose,sage,skunk,sleepy,spicy,strawberry,sweet,talkative,tar,tea,tingly,tobacco,tree,tropical,uplifted,vanilla,violet,woody
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423351,0.0,0.367452,0.0,0.0,0.296013,0.393262,0.231844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.221346,0.0,0.0,0.0,0.0,0.0,0.0,0.304329,0.0,0.0,0.0,0.498108,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.363913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.227118,0.343958,0.0,0.0,0.243071,0.0,0.332218,0.0,0.0,0.0,0.0,0.0,0.128892,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.136812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.702064,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.252281,0.0,0.0,0.0,0.270001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143172,0.38813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151969,0.0,0.591152,0.0,0.0,0.38813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168997,0.0,0.0,0.374899
3,0.0,0.0,0.659475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.250226,0.0,0.217187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.511474,0.0,0.0,0.0,0.262188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.130829,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.294412,0.0,0.0,0.0,0.145488,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.374898,0.0,0.0,0.0,0.0,0.262134,0.0,0.205309,0.0,0.0,0.0,0.0,0.0,0.0,0.184666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.663442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.196013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.434043,0.0,0.0,0.0,0.0,0.0,0.0,0.217976,0.0,0.0,0.0


In [9]:
# Index the TF-IDF rows for nearest-neighbor lookup: each query returns the
# 5 closest strains, searched with a ball tree.
knn_settings = {'n_neighbors': 5, 'algorithm': 'ball_tree'}
nn = NearestNeighbors(**knn_settings)
nn.fit(dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [0]:
# Test query: a single comma-separated criteria string (effects followed by
# flavors), wrapped in a list because the vectorizer expects an iterable of documents
ideal_criteria = 'Creative,Energetic,Tingly,Euphoric,Relaxed,Earthy,Sweet,Citrus'
ideal_strain = [ideal_criteria]

In [0]:
# Vectorize the test case with the already-fitted vocabulary, then look up
# the strains whose TF-IDF rows are closest to it.
new = tf.transform(ideal_strain)

# Pass the query as a DataFrame with the same columns the model was fitted on:
# - todense() returns np.matrix, which scikit-learn >= 1.2 rejects with a TypeError;
# - matching column names avoids the feature-name mismatch warning that
#   scikit-learn >= 1.0 emits when fit and query inputs differ.
query_frame = pd.DataFrame(new.toarray(), columns=dtm.columns)
results = nn.kneighbors(query_frame)

In [12]:
# kneighbors returns a 2-tuple of arrays: the first holds the distances and
# the second the matching row indices. Each array has one row per query
# (a single row here), ordered from nearest to farthest.
results

(array([[0.        , 0.65960709, 0.66413197, 0.70212453, 0.70856062]]),
 array([[   0, 1972,  172,   81, 1256]]))

In [13]:
# Strain name of the closest match: take the index array (second element of
# the results tuple), its only row, and that row's first entry (row 0 of df)
_, neighbor_rows = results
df.loc[neighbor_rows[0][0], 'Strain']

'100-Og'

In [14]:
# Criteria string of the closest match: take the index array (second element
# of the results tuple), its only row, and that row's first entry (row 0 of df)
_, neighbor_rows = results
df.loc[neighbor_rows[0][0], 'Criteria']

'Creative,Energetic,Tingly,Euphoric,Relaxed,Earthy,Sweet,Citrus'

In [15]:
# Strain name of the second-closest match: take the index array (second element
# of the results tuple), its only row, and that row's second entry (row 1972 of df)
_, neighbor_rows = results
df.loc[neighbor_rows[0][1], 'Strain']

'Sunburn'

In [16]:
# Criteria string of the second-closest match: take the index array (second element
# of the results tuple), its only row, and that row's second entry (row 1972 of df)
_, neighbor_rows = results
df.loc[neighbor_rows[0][1], 'Criteria']

'Creative,Euphoric,Uplifted,Happy,Energetic,Citrus,Earthy,Sweet'

In [0]:
# Persist the TF-IDF document-term matrix and the fitted vectorizer so the
# prediction code can reload them. Context managers guarantee each file is
# flushed and closed even if pickling raises.
# NOTE(review): '/content/' is an absolute path (presumably the Colab working
# directory) - confirm before running elsewhere.
with open('/content/dtm.pkl', 'wb') as dtm_file:
    pickle.dump(dtm, dtm_file)
with open('/content/tf.pkl', 'wb') as tf_file:
    pickle.dump(tf, tf_file)