In [113]:
!pip install squarify



In [114]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy
import squarify
import en_core_web_sm
import spacy
from spacy.tokenizer import Tokenizer
import re
from collections import Counter 
# Load spaCy's small English pipeline; get_lemmas() runs each text through it.
nlp = en_core_web_sm.load()
# Stand-alone tokenizer that shares the pipeline's vocabulary.
# NOTE(review): built from the vocab alone it appears to split on whitespace only —
# the df['tokens'] output further down keeps trailing commas on each token.
tokenizer = Tokenizer(nlp.vocab)


In [115]:
def get_lemmas(text):
    """Return the lower-cased lemmas of the content tokens in ``text``.

    ``text`` is parsed with the module-level spaCy pipeline ``nlp``. Stop words,
    punctuation, pronouns (``pos_ == 'PRON'``), whitespace tokens and all-digit
    tokens are discarded; every remaining token contributes
    ``token.lemma_.lower()``, in document order.
    """
    def _keep(token):
        # A token survives only when none of the exclusion checks fires.
        rejected = (
            token.is_stop
            or token.is_punct
            or token.pos_ == 'PRON'
            or token.is_space
            or token.is_digit
        )
        return not rejected

    return [token.lemma_.lower() for token in nlp(text) if _keep(token)]

In [116]:
# Load the strains table into a DataFrame.
# NOTE(review): the absolute path presumably relies on Google Drive being mounted at
# /content/drive (Colab) — confirm, or adjust the path when running elsewhere.
df = pd.read_csv('/content/drive/My Drive/datasets/cannabis_strains.csv')

In [117]:
df.head()

Unnamed: 0,strain,race,flavors,positive,negative,medical,Type,Rating,Description
0,Afpak,hybrid,"['Earthy', 'Chemical', 'Pine']","['Relaxed', 'Hungry', 'Happy', 'Sleepy']",['Dizzy'],"['Depression', 'Insomnia', 'Pain', 'Stress', '...",hybrid,4.2,"Afpak, named for its direct Afghani and Pakist..."
1,African,sativa,"['Spicy/Herbal', 'Pungent', 'Earthy']","['Euphoric', 'Happy', 'Creative', 'Energetic',...",['Dry Mouth'],"['Depression', 'Pain', 'Stress', 'Lack of Appe...",sativa,3.9,African refers to the indigenous varieties of ...
2,Afternoon Delight,hybrid,"['Pepper', 'Flowery', 'Pine']","['Relaxed', 'Hungry', 'Euphoric', 'Uplifted', ...","['Dizzy', 'Dry Mouth', 'Paranoid']","['Depression', 'Insomnia', 'Pain', 'Stress', '...",hybrid,4.8,"Afternoon Delight, created by Colorado Seed In..."
3,Afwreck,hybrid,"['Pine', 'Earthy', 'Flowery']","['Relaxed', 'Happy', 'Creative', 'Uplifted', '...","['Dizzy', 'Dry Mouth', 'Paranoid', 'Dry Eyes']","['Pain', 'Stress', 'Headache', 'Fatigue', 'Hea...",hybrid,4.2,Afwreck is a hybrid cross of Afghani and Train...
4,Agent Orange,hybrid,"['Citrus', 'Orange', 'Sweet']","['Relaxed', 'Euphoric', 'Happy', 'Energetic', ...","['Dizzy', 'Dry Mouth', 'Paranoid', 'Dry Eyes']","['Depression', 'Pain', 'Stress', 'Nausea', 'He...",hybrid,4.2,Don’t let the name scare you! The only herbici...


In [118]:
df['Description'][1]

"African refers to\xa0the indigenous varieties of cannabis (or\xa0landraces) that grow natively in this region of the world. Because of this region's latitude and climate, these native landrace strains tend to be\xa0sativa\xa0in structure and effect."

In [119]:
df['medical'][0]

"['Depression', 'Insomnia', 'Pain', 'Stress', 'Lack of Appetite']"

In [120]:
# Plain string concatenation: both columns hold the *text* of a list (e.g. "['Pain', 'Stress']"),
# so the result looks like "[...][...]" and is cleaned up in a later cell.
df['combined_text'] = df['positive'] + df['medical']

In [121]:
df['combined_text'][0]

"['Relaxed', 'Hungry', 'Happy', 'Sleepy']['Depression', 'Insomnia', 'Pain', 'Stress', 'Lack of Appetite']"

In [122]:
# Turn "['A', 'B']['C', 'D']" into "A, B, C, D".
# Every pattern is a literal, so regex=False is passed explicitly: the default of
# `regex` differs between pandas versions, and with a literal default the old
# '\]\[' pattern would silently stop matching the "][" seam between the two lists.
df['combined_text'] = (
    df['combined_text']
    .str.replace("'", "", regex=False)     # drop the quotes around each item
    .str.replace("][", ", ", regex=False)  # join the two lists with a separator
    .str.replace("[", "", regex=False)     # strip the opening bracket
    .str.replace("]", "", regex=False)     # strip the closing bracket
)

In [123]:
df['combined_text'][0]

'Relaxed, Hungry, Happy, Sleepy, Depression, Insomnia, Pain, Stress, Lack of Appetite'

In [124]:
# Run the stand-alone tokenizer over every combined_text entry (in batches of 500)
# and keep only each token's surface text, one list per row.
tokens = [
    [tok.text for tok in parsed]
    for parsed in tokenizer.pipe(df['combined_text'], batch_size=500)
]

df['tokens'] = tokens

In [125]:
df['tokens']

0       [Relaxed,, Hungry,, Happy,, Sleepy,, Depressio...
1       [Euphoric,, Happy,, Creative,, Energetic,, Tal...
2       [Relaxed,, Hungry,, Euphoric,, Uplifted,, Ting...
3       [Relaxed,, Happy,, Creative,, Uplifted,, Sleep...
4       [Relaxed,, Euphoric,, Happy,, Energetic,, Upli...
                              ...                        
1490    [Relaxed,, Creative,, Energetic,, Focused,, Gi...
1491    [Relaxed,, Happy,, Energetic,, Uplifted,, Focu...
1492    [Relaxed,, Happy,, Energetic,, Uplifted,, Focu...
1493    [Relaxed,, Euphoric,, Happy,, Uplifted,, Sleep...
1494    [Relaxed,, Hungry,, Happy,, Uplifted,, Giggly,...
Name: tokens, Length: 1495, dtype: object

In [126]:
# Lemmatise each row's cleaned text with get_lemmas (one list of lemmas per row).
# The column name 'lemas' (sic) is left as-is because later cells read df['lemas'].
df['lemas'] = df['combined_text'].apply(get_lemmas)

In [127]:
from collections import Counter

# Tally how often each lemma occurs across all rows, visiting rows (and the
# lemmas inside each row) in their stored order.
word_counts = Counter(
    lemma
    for row_lemmas in df['lemas']
    for lemma in row_lemmas
)

# Most frequent lemmas first, capped at 50 entries.
word_counts.most_common(50)

[('stress', 1317),
 ('happy', 1266),
 ('depression', 1209),
 ('pain', 1152),
 ('relaxed', 1147),
 ('euphoric', 1071),
 ('uplifted', 904),
 ('insomnia', 634),
 ('creative', 555),
 ('fatigue', 538),
 ('lack', 532),
 ('appetite', 532),
 ('headache', 474),
 ('energetic', 469),
 ('headaches', 428),
 ('sleepy', 395),
 ('focused', 360),
 ('nausea', 319),
 ('hungry', 303),
 ('inflammation', 280),
 ('muscle', 268),
 ('spasms', 268),
 ('giggly', 209),
 ('talkative', 163),
 ('eye', 134),
 ('pressure', 134),
 ('tingly', 129),
 ('cramps', 112),
 ('aroused', 92),
 ('spasticity', 28),
 ('seizures', 12),
 ('arouse', 12),
 ('focus', 2)]

In [128]:
df['lemas'][1]

['euphoric',
 'happy',
 'creative',
 'energetic',
 'talkative',
 'depression',
 'pain',
 'stress',
 'lack',
 'appetite',
 'nausea',
 'headache']

In [None]:
# Write the augmented frame (original columns plus combined_text, tokens and lemas) to disk
# without the index column.
# NOTE(review): tokens and lemas hold Python lists; CSV will presumably store them as their
# string form, so they would need parsing when read back — confirm if this file is reused.
df.to_csv("vectorizer_dataset.csv", index=False)

In [129]:
# Create our model

# Create our model

In [130]:
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump

In [131]:
# TF-IDF features over the cleaned effect/ailment text, tokenised by get_lemmas.
# NOTE(review): stop_words='english' is applied to get_lemmas' output, which already
# drops spaCy stop words; it is kept unchanged so the fitted vocabulary stays the same.
tfidf = TfidfVectorizer(stop_words='english',
                        tokenizer=get_lemmas)

dtm_sparse = tfidf.fit_transform(df['combined_text'])

# Persist the fitted vectorizer. It is pickled with a reference to get_lemmas (by name),
# so any code that loads 'vectorizer.joblib' has to define get_lemmas beforehand.
dump(tfidf, 'vectorizer.joblib', compress=True)

# get_feature_names() no longer exists in recent scikit-learn releases; use
# get_feature_names_out() when available and fall back for older installs.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# toarray() gives a plain ndarray (todense() returns the legacy np.matrix type).
dtm = pd.DataFrame(dtm_sparse.toarray(), columns=feature_names)


In [132]:
dtm

Unnamed: 0,appetite,arouse,aroused,cramp,creative,depression,energetic,euphoric,eye,fatigue,focus,focused,giggly,happy,headache,hungry,inflammation,insomnia,lack,muscle,nausea,pain,pressure,relaxed,seizure,sleepy,spasm,spasticity,stress,talkative,tingly,uplifted
0,0.364114,0.0,0.0,0.000000,0.000000,0.217207,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.208958,0.000000,0.464727,0.0,0.332738,0.364114,0.000000,0.000000,0.225853,0.0,0.226632,0.0,0.417352,0.000000,0.0,0.201887,0.000000,0.00000,0.000000
1,0.306880,0.0,0.0,0.000000,0.300500,0.183065,0.325877,0.201352,0.0,0.000000,0.0,0.000000,0.000000,0.176113,0.249763,0.000000,0.0,0.000000,0.306880,0.000000,0.383931,0.190352,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.170153,0.484883,0.00000,0.000000
2,0.000000,0.0,0.0,0.519541,0.000000,0.175759,0.000000,0.193317,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.239796,0.376048,0.0,0.269245,0.000000,0.000000,0.000000,0.182756,0.0,0.183386,0.0,0.000000,0.000000,0.0,0.163363,0.000000,0.49922,0.217872
3,0.000000,0.0,0.0,0.000000,0.289515,0.000000,0.000000,0.000000,0.0,0.294033,0.0,0.000000,0.000000,0.169675,0.481266,0.000000,0.0,0.000000,0.000000,0.395157,0.000000,0.183393,0.0,0.184026,0.0,0.338892,0.395157,0.0,0.163933,0.000000,0.00000,0.218632
4,0.000000,0.0,0.0,0.000000,0.000000,0.209801,0.373472,0.230760,0.0,0.000000,0.0,0.000000,0.000000,0.201834,0.572483,0.000000,0.0,0.000000,0.000000,0.000000,0.440005,0.218153,0.0,0.218905,0.0,0.000000,0.000000,0.0,0.195004,0.000000,0.00000,0.260070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1490,0.000000,0.0,0.0,0.000000,0.316055,0.192541,0.342746,0.000000,0.0,0.000000,0.0,0.401832,0.470711,0.000000,0.000000,0.000000,0.0,0.294953,0.000000,0.000000,0.403806,0.200205,0.0,0.200896,0.0,0.000000,0.000000,0.0,0.178961,0.000000,0.00000,0.000000
1491,0.000000,0.0,0.0,0.595746,0.000000,0.201539,0.358765,0.000000,0.0,0.000000,0.0,0.420613,0.000000,0.193886,0.274969,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.209562,0.0,0.210285,0.0,0.000000,0.000000,0.0,0.187325,0.000000,0.00000,0.249829
1492,0.329390,0.0,0.0,0.000000,0.000000,0.196493,0.349781,0.000000,0.0,0.000000,0.0,0.410080,0.000000,0.189031,0.268084,0.000000,0.0,0.000000,0.329390,0.000000,0.412094,0.204314,0.0,0.205019,0.0,0.000000,0.000000,0.0,0.182634,0.000000,0.00000,0.243573
1493,0.000000,0.0,0.0,0.000000,0.000000,0.223583,0.000000,0.245919,0.0,0.000000,0.0,0.000000,0.000000,0.215093,0.305044,0.000000,0.0,0.342506,0.000000,0.000000,0.468909,0.232483,0.0,0.233285,0.0,0.429604,0.000000,0.0,0.207814,0.000000,0.00000,0.277154


In [133]:
df['medical'][3]

"['Pain', 'Stress', 'Headache', 'Fatigue', 'Headaches', 'Muscle Spasms']"

In [134]:
from sklearn.neighbors import NearestNeighbors
from joblib import dump

# Build and fit a 10-nearest-neighbour index (k-d tree) on the TF-IDF matrix.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = NearestNeighbors(n_neighbors=10, algorithm='kd_tree').fit(dtm)

# Persist the fitted model; dump() stays the cell's last expression so its
# return value is what the notebook displays.
dump(model, 'strain_recommender.joblib', compress=True)

['strain_recommender.joblib']

In [135]:
def recommendations(text):
    """Look up the nearest strains for one or more free-text queries.

    Parameters
    ----------
    text : str or iterable of str
        Query document(s), e.g. ``['depression, happy, insomnia']``. A bare
        string is treated as a single query (the vectorizer itself only
        accepts an iterable of documents).

    Returns
    -------
    The value of ``model.kneighbors(...)`` for the vectorised queries — in this
    notebook a ``(distances, indices)`` pair of arrays with one row per query.

    Notes
    -----
    Uses the module-level ``tfidf`` vectorizer and ``model`` fitted in earlier cells.
    """
    if isinstance(text, str):
        text = [text]

    text_transformed = tfidf.transform(text)

    # toarray() produces a regular ndarray; todense() returns np.matrix, which
    # recent scikit-learn estimators refuse as input.
    return model.kneighbors(text_transformed.toarray())

In [136]:
recommendations(['depression, happy, insomnia'])

(array([[0.7872958 , 0.83436716, 0.87748526, 0.92903462, 0.94075118,
         0.94757526, 0.94757526, 0.94757526, 0.95527452, 0.95834636]]),
 array([[1484, 1368,  716, 1132,  894, 1286,  100, 1215,  863, 1107]]))

In [137]:
df.iloc[1484]

strain                                                     Wookies
race                                                        hybrid
flavors                                                  ['Minty']
positive              ['Relaxed', 'Euphoric', 'Happy', 'Uplifted']
negative                                                        []
medical                       ['Depression', 'Insomnia', 'Stress']
Type                                                        hybrid
Rating                                                         4.8
Description      Wookies (not to be confused with the strain “W...
combined_text    Relaxed, Euphoric, Happy, Uplifted, Depression...
tokens           [Relaxed,, Euphoric,, Happy,, Uplifted,, Depre...
lemas            [relaxed, euphoric, happy, uplifted, depressio...
Name: 1484, dtype: object

# Production model

In [138]:
import os

import psycopg2

# Connection settings are read from the environment so that no secret is stored in
# the notebook. The variable names follow libpq's conventions (PGDATABASE, PGUSER,
# PGPASSWORD, PGHOST). Non-secret settings fall back to the values used previously.
# The password that used to be hard-coded here must be considered leaked: rotate it.
pg_conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "spkpknid"),
    user=os.environ.get("PGUSER", "spkpknid"),
    password=os.environ["PGPASSWORD"],  # required — raises KeyError when unset
    host=os.environ.get("PGHOST", "ruby.db.elephantsql.com"),
)
pg_curs = pg_conn.cursor()

In [139]:
import en_core_web_sm
nlp = en_core_web_sm.load()

def get_lemmas(text):
    """Extract lower-cased lemmas from ``text`` using the global spaCy ``nlp`` pipeline.

    Tokens that are stop words, punctuation, pronouns (``pos_ == 'PRON'``),
    whitespace or made up solely of digits are skipped. The lemmas of all other
    tokens are returned as a list, lower-cased and in their original order.
    """
    kept = []

    for tok in nlp(text):
        # Guard clauses: bail out of this token as soon as one filter matches.
        if tok.is_stop or tok.is_punct:
            continue
        if tok.pos_ == 'PRON':
            continue
        if tok.is_space or tok.is_digit:
            continue
        kept.append(tok.lemma_.lower())

    return kept

In [140]:
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import load

def modelservice(features, pg_curs):
    """Recommend strains for a free-text query and fetch their rows from the database.

    Parameters
    ----------
    features : str or iterable of str
        Query document(s), e.g. ``['depression, happiness, insomnia']``. A bare
        string is treated as a single query. Only the neighbours of the FIRST
        query are looked up in the database.
    pg_curs : DB-API cursor
        Open cursor (psycopg2 in this notebook) on a database containing the
        ``strains`` table.

    Returns
    -------
    list
        ``pg_curs.fetchall()`` for the matching ``strains`` rows, ordered by
        ``"Rating"`` descending.

    Notes
    -----
    The vectorizer and model are loaded from 'vectorizer.joblib' and
    'strain_recommender.joblib' on every call. joblib's load() unpickles its
    input, so only load files you produced yourself.
    """
    if isinstance(features, str):
        features = [features]

    tfidf = load('vectorizer.joblib')
    text_transformed = tfidf.transform(features)

    model = load('strain_recommender.joblib')
    # toarray() gives a plain ndarray; todense() returns np.matrix, which recent
    # scikit-learn estimators refuse as input.
    prediction = model.kneighbors(text_transformed.toarray())

    print(prediction)

    # Convert NumPy integers to built-in ints: the database driver cannot adapt
    # NumPy scalars, and their repr is not valid SQL either.
    neighbour_ids = tuple(int(idx) for idx in prediction[1][0])

    # Parameterised query: psycopg2 renders a Python tuple as an SQL IN list, so no
    # value is ever formatted into the SQL text by hand.
    query = 'SELECT * FROM strains WHERE index IN %s ORDER BY "Rating" DESC'
    pg_curs.execute(query, (neighbour_ids,))

    return pg_curs.fetchall()

In [141]:
test = modelservice(features=['depression, happiness, insomnia'], pg_curs=pg_curs)

(array([[0.88229355, 0.91978452, 0.95259214, 0.95463763, 0.99687318,
        1.00655155, 1.01220119, 1.01220119, 1.01220119, 1.01858635]]), array([[1484, 1368, 1213,  716, 1132,  894,  100, 1215, 1286,  863]]))


In [142]:
test

[(716,
  'Ice Princess',
  'hybrid',
  "['Tropical']",
  "['Relaxed', 'Euphoric', 'Happy']",
  "['Dry Mouth', 'Dry Eyes']",
  "['Depression', 'Insomnia', 'Pain', 'Stress', 'Fatigue']",
  'hybrid',
  5.0,
  'Ice Princess by Brothers Grimm Seeds is a hybrid cross between a Cinderella 99 mother and a White Widow father. This combination creates dense, resinous pine tree shaped colas that reek of skunk, spice, and tropical fruit. It fares best indoors and is known to yield larger crops with the addition of extra light. Ice Princess offers consumers cerebral effects similar to Cinderella 99, but with heavier physical potency. \xa0'),
 (1484,
  'Wookies',
  'hybrid',
  "['Minty']",
  "['Relaxed', 'Euphoric', 'Happy', 'Uplifted']",
  '[]',
  "['Depression', 'Insomnia', 'Stress']",
  'hybrid',
  4.8,
  'Wookies (not to be confused with the strain “Wookie” or the enormous, sentient space-bears of Star Wars) is an indica-dominant cross of White 91 (The White x Chemdawg 91) and Girl Scout Cookies

In [143]:
# Release the database resources opened earlier: the cursor first, then its connection.
pg_curs.close()
pg_conn.close()