In [0]:
# Imports
import pandas as pd

from pathlib import Path
import pickle
import os

import spacy
from spacy.tokenizer import Tokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [3]:
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz (826.9MB)
[K     |████████████████████████████████| 826.9MB 1.1MB/s 
[?25hBuilding wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.1.0-cp36-none-any.whl size=828255078 sha256=fc0e90e7358f1cdc608d0f05b2a20791ea2ffe4cc32aa79c4731ea3bbc4e3b74
  Stored in directory: /tmp/pip-ephem-wheel-cache-f_gs0w7t/wheels/b4/d7/70/426d313a459f82ed5e06cc36a50e2bb2f0ec5cb31d8e0bdf09
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [4]:
nlp = spacy.load("en_core_web_lg")
tokenizer = Tokenizer(nlp.vocab)

OSError: ignored

In [0]:
# Load initial cannabis data and disease data
df = pd.read_csv("https://raw.githubusercontent.com/med-cab1/ds-api/master/data/cannabis.csv")
disease = pd.read_csv("https://raw.githubusercontent.com/med-cab1/ds-api/master/data/Disease.csv")

In [0]:
# View the cannabis data
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [0]:
# Drop the 46 Nans
df = df.dropna()
df = df.reset_index(drop=True)

In [0]:
# Combine the Effects and Flavors in one column
df['Criteria'] = df['Effects'] + ',' + df['Flavor']

In [0]:
# Function to use spacy tokenizer
def tokenize(document):    
    doc = nlp(document)   
    return [token.lemma_.strip() for token in doc if (token.is_stop != True) and 
                                                     (token.is_punct != True) and
                                                     (token.text != ' ')]

In [0]:
# Instantiate vecorizer object - call tokenize
tf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')

In [0]:
# Create a vocabulary and get word counts 
dtm = tf.fit_transform(df['Criteria'].values.astype('U'))
dtm = pd.DataFrame(dtm.todense(), columns=tf.get_feature_names())
dtm.head()

Unnamed: 0,ammonia,apple,apricot,arouse,aroused,berry,blue,blueberry,butter,cheese,...,tar,tea,tingly,tobacco,tree,tropical,uplifted,vanilla,violet,woody
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.498108,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.370635,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.700063,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.168997,0.0,0.0,0.374899
3,0.0,0.0,0.659475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.294412,0.0,0.0,0.0,0.145488,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.217976,0.0,0.0,0.0


In [0]:
# Fit on TF-IDF Vectors
nn = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [0]:
# Create the test case
ideal_strain = ['Creative,Energetic,Tingly,Euphoric,Relaxed,Earthy,Sweet,Citrus']

In [0]:
# Query for similar strains using the test case
new = tf.transform(ideal_strain)
results = nn.kneighbors(new.todense())

In [0]:
results

(array([[0.        , 0.65960709, 0.66413197, 0.70212453, 0.70856062]]),
 array([[   0, 1972,  172,   81, 1256]]))

In [0]:
df['Strain'][results[1][0][0]]

'100-Og'

In [0]:
df['Criteria'][results[1][0][0]]

'Creative,Energetic,Tingly,Euphoric,Relaxed,Earthy,Sweet,Citrus'

In [0]:
df['Strain'][results[1][0][1]]

'Sunburn'

In [0]:
df['Criteria'][results[1][0][0]]

'Creative,Energetic,Tingly,Euphoric,Relaxed,Earthy,Sweet,Citrus'

In [0]:
parent_directory = Path().resolve().parent

pickle.dump(dtm, open(os.path.join(parent_directory, 'dtm.pkl'), 'wb'))
pickle.dump(tf, open(os.path.join(parent_directory, 'tf.pkl'), 'wb'))