## **Imports**

In [1]:
""" Import Statements """

# Classics
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import squarify

import re
import spacy
from spacy.tokenizer import Tokenizer
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [2]:
df = pd.read_csv('../data/cannabis.csv')

#change from combined_cannabis.csv to cannabis.csv

In [3]:
print('Shape:', df.shape)
df.head()

Shape: (2155, 8)


Unnamed: 0,Index,Name,Type,Rating,Effects,Description,Flavors,Nearest
0,0,Kelly Hill Gold,Indica,5.0,"Happy,Energetic,Euphoric,Talkative,Aroused",Cultivated by Joseph Arthur Botanicals in Col...,"Pepper,Earthy,Coffee",9928974391841877
1,1,Spyder Mon,Hybrid,5.0,"Uplifted,Creative,Focused,Happy,Relaxed",Spyder Mon is an uplifting CBD strain with a g...,"Citrus,Earthy,Sweet",2181571627223742
2,2,Mochi,Hybrid,5.0,"Sleepy,Happy,Hungry,Relaxed,Tingly",Mochi by Sherbinski is another strain that lea...,"Pungent,Minty,Flowery",614457208705130
3,3,Molokai Purpz,Indica,5.0,"Aroused,Creative,Euphoric,Relaxed,Sleepy",Moloka'i Purpz is a luscious Hawaiian landrace...,"Berry,Grape,Sweet",1478562106350390
4,4,Monolith,Indica,5.0,"Relaxed,Sleepy,Tingly,Euphoric,Focused",Monolith is an indica-dominant strain with Afg...,"Pungent,Earthy,Pine",13978621094214141


### spaCy s

In [4]:
nlp = spacy.load("en_core_web_md")

In [5]:
STOP_WORDS = nlp.Defaults.stop_words.union({"$", '-', '', ' ',
                                            'bred','breed', 'breeds','call', 'calls',
                                            'combine', 'combines','consumer', 'contains','containing',
                                            "don't", 'effect', 'effects','especially','explanations',
                                            'flavor', 'flavors','flower','give', 'gives','got',  'high',
                                            'i', "i'm", "i've",'including','it.', "it's",
                                            'like', 'match', 'matches','making',
                                            'offer', 'offers','pack', 'packs','price', 'probably',
                                            'produce', 'produces', 'really',
                                            'refers', 'report', 'reports', "'s", 's',
                                            'seed', 'seeds','showing',
                                            'smell','start', 'started', 'stem', 'stems',
                                            'strain', 'strains','supposedly',
                                            'technique', 'techniques','tend', 'tends',
                                            'unavailable', 'unkown', 'user', 'users', 'utlizing',
                                            'weed', 'week', None})

In [6]:
def tokenize(text):
  doc = nlp(text) #casting as text
  return [token.lemma_.strip() for token in doc if not token.is_stop and not token.is_punct]

# ML Model

## Vectorization

### TFIDF

In [7]:
# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words='english', 
                        ngram_range=(1,3),
                        max_df=.97,
                        min_df=3,
                        tokenizer=tokenize,
                        max_features = 5000)

# Create a vocabulary and get word counts per document
dtm = tfidf.fit_transform(df['Type'] + df['Description'] + df['Effects'] + df['Flavors']) # Similiar to fit_predict

# Print word counts

# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
dtm.head()

Unnamed: 0,1,10,10 11,10 11 week,10 week,10 week flower,10 week produce,100,11,11 week,...,yield indoor,yield plant,yield potency,yield thc,yields.happy,yoda,zest,zesty,zesty lemon,zombie
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Cosine Similarity Model

In [31]:
user_input = ['big buds']

In [32]:
user_dtm = pd.DataFrame(tfidf.transform(user_input).todense(), columns=tfidf.get_feature_names())

In [33]:
rec_dtm = dtm.append(user_dtm).reset_index(drop=True)

In [34]:
rec_dtm

Unnamed: 0,1,10,10 11,10 11 week,10 week,10 week flower,10 week produce,100,11,11 week,...,yield indoor,yield plant,yield potency,yield thc,yields.happy,yoda,zest,zesty,zesty lemon,zombie
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
cosine_df = pd.DataFrame(cosine_similarity(rec_dtm))

In [39]:
recommendations = cosine_df[cosine_df[0] < 1][0].sort_values(ascending=False)[:5]

In [40]:
recommendations

992     0.332275
439     0.192266
184     0.173853
1117    0.169363
45      0.156640
Name: 0, dtype: float64

In [15]:
df.iloc[992]

Index                                                        992
Name                                               Acapulco Gold
Type                                                      Sativa
Rating                                                      3.86
Effects                Happy,Uplifted,Euphoric,Energetic,Relaxed
Description    One of the most well-known strains, Acapulco G...
Flavors                                     Earthy,Sweet,Pungent
Nearest                                       0,961,1201,984,439
Name: 992, dtype: object

In [16]:
df.iloc[439]

Index                                                        439
Name                                                  White Gold
Type                                                      Hybrid
Rating                                                      4.32
Effects                Relaxed,Happy,Euphoric,Talkative,Uplifted
Description    White Gold from Sensi Seeds is an indica-domin...
Flavors                                  Earthy,Pungent,Tropical
Nearest                                  1413,1935,992,1723,1795
Name: 439, dtype: object

In [17]:
df.iloc[184]

Index                                                        184
Name                                         Orange Hill Special
Type                                                      Hybrid
Rating                                                      4.77
Effects                    Happy,Relaxed,Sleepy,Euphoric,Focused
Description    Orange Hill Special is a select cross from the...
Flavors                                      Orange,Citrus,Sweet
Nearest                                 1634,1512,2149,1658,1887
Name: 184, dtype: object

In [18]:
df.iloc[1117]

Index                                                       1117
Name                                                  Chemdawg 4
Type                                                      Hybrid
Rating                                                      3.64
Effects                   Relaxed,Euphoric,Happy,Hungry,Uplifted
Description    Part of the prominent line of Chemdawg strains...
Flavors                                        Pine,Diesel,Lemon
Nearest                                   948,1523,897,1723,1674
Name: 1117, dtype: object

In [19]:
df.iloc[45]

Index                                                         45
Name                                                    Ape Shit
Type                                                      Hybrid
Rating                                                         5
Effects                  Happy,Relaxed,Uplifted,Giggly,Talkative
Description    Ape Shit is the genetic offspring of three qua...
Flavors                                     Sweet,Grape,Lavender
Nearest                                     381,992,1771,49,1066
Name: 45, dtype: object

# Functions

In [70]:
# Create cosine_similarity function

import json

def cosine_recommender(user_input):
    user_dtm = pd.DataFrame(tfidf.transform(user_input).todense(), columns=tfidf.get_feature_names())
    rec_dtm = dtm.append(user_dtm).reset_index(drop=True)
    cosine_df = pd.DataFrame(cosine_similarity(rec_dtm))
    longdf = len(cosine_df)-1
    print("Len df -1", longdf)
    values1 = cosine_df[cosine_df[0]<1][len(cosine_df)-1]
    print('values', values1)
    recommendations = cosine_df[cosine_df[0] < 1][len(cosine_df)-1].sort_values(ascending=False)[1:6]
    return recommendations

#Does not print all 5 results? 

In [71]:
cosine_recommender(user_input)

Len df -1 2155
values 1       0.000000
2       0.015447
3       0.000000
4       0.000000
5       0.000000
          ...   
2151    0.000000
2152    0.000000
2153    0.000000
2154    0.000000
2155    1.000000
Name: 2155, Length: 2155, dtype: float64


1978    0.747218
861     0.498436
2078    0.431750
39      0.428138
75      0.307618
Name: 2155, dtype: float64

In [57]:
user_input2 = ['energetic']

In [58]:
cosine_recommender(user_input2)

1460    0.186523
1529    0.178320
1182    0.175643
1817    0.173626
1487    0.168550
Name: 2155, dtype: float64

## Pickle Model

In [None]:
# Create pickles of the model and the transformer for web deployment

# # Imports
import pickle

pickle.dump(tfidf, open('tfidf.pkl', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(dtm, open('dtm.pkl', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
user_input1 =["I am feeling sluggish. I am looking for an ammonia flavored strain that will have me feeling happy and energetic"]

In [None]:
#Loading the pickled models
tfidf_model = pickle.load(open('tfidf.pkl', 'rb'))
dtm_model = pickle.load(open('dtm.pkl', 'rb'))

In [None]:
tfidf_model

In [None]:
dtm_model

## Test

In [None]:
# Create cosine_similarity function

import json

def cosine_recommender(user_input):
    
    user_dtm1 = pd.DataFrame(tfidf_model.transform(user_input).todense(), columns=tfidf_model.get_feature_names())
    rec_dtm1 = dtm_model.append(user_dtm1).reset_index(drop=True)
    
    cosine_df1 = pd.DataFrame(cosine_similarity(rec_dtm1))
    
    recommendations5 = (cosine_df1[cosine_df1[0] < 1][[len(cosine_df)-1]].sort_values(ascending=False)[:5]).reset_index()
    
    print(recommendations5)

#Does not print all 5 results? 

In [None]:
cosine_recommender(user_input1)