In [1]:

# Path and File Libraries
import os
import pickle

# Data Transformation Libraries
import pandas as pd
import numpy as np
import spacy

from spacy.tokenizer import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors


## load data

In [3]:
# Location of the merged strain dataset (CSV) that the rest of the notebook works on.
url = 'https://raw.githubusercontent.com/DNason1999/simple_repository/master/df_merged.csv'

# Read the CSV straight from the URL into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

In [4]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,Strain,Type,Rating,Description,flavors,positive,negative,medical
0,Afpak,hybrid,4.2,"Afpak, named for its direct Afghani and Pakist...","['Earthy', 'Chemical', 'Pine', 'Spicy/Herbal']","['Relaxed', 'Hungry', 'Happy', 'Sleepy', 'Crea...",['Dizzy'],"['Depression', 'Insomnia', 'Pain', 'Stress', '..."
1,African,sativa,3.9,African refers to the indigenous varieties of ...,"['Spicy/Herbal', 'Pungent', 'Earthy', 'Pepper']","['Euphoric', 'Happy', 'Creative', 'Energetic',...",['Dry Mouth'],"['Depression', 'Pain', 'Stress', 'Lack of Appe..."
2,Afternoon Delight,hybrid,4.8,"Afternoon Delight, created by Colorado Seed In...","['Pepper', 'Flowery', 'Pine', 'Pungent', 'Citr...","['Relaxed', 'Hungry', 'Euphoric', 'Uplifted', ...","['Dizzy', 'Dry Mouth', 'Paranoid']","['Depression', 'Insomnia', 'Pain', 'Stress', '..."
3,Afwreck,hybrid,4.2,Afwreck is a hybrid cross of Afghani and Train...,"['Pine', 'Earthy', 'Flowery', 'Pungent']","['Relaxed', 'Happy', 'Creative', 'Uplifted', '...","['Dizzy', 'Dry Mouth', 'Paranoid', 'Dry Eyes']","['Pain', 'Stress', 'Headache', 'Fatigue', 'Hea..."
4,Agent Orange,hybrid,4.2,Don’t let the name scare you! The only herbici...,"['Citrus', 'Orange', 'Sweet', 'Earthy']","['Relaxed', 'Euphoric', 'Happy', 'Energetic', ...","['Dizzy', 'Dry Mouth', 'Paranoid', 'Dry Eyes']","['Depression', 'Pain', 'Stress', 'Nausea', 'He..."


In [5]:
## cleaning up data

In [6]:
# Build one free-text field per strain out of every descriptive column; this is
# the text the TF-IDF vectorizer is fitted on further down.
df['combined_text'] = (
    df.Strain + ' ' + df.Type + ' ' + df.flavors + ' ' + df.Description
    + ' ' + df.positive + ' ' + df.negative + ' ' + df.medical
)

# Replace every punctuation character with a space.
# regex=True is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave the text unchanged.
# The pattern is a raw string so `\w` / `\s` are not read as string escapes.
df['combined_text'] = df['combined_text'].str.replace(r'[^\w\s]', ' ', regex=True)

# Creating an index: move the current index into a regular 'index' column.
df.reset_index(level=0, inplace=True)

# Turn the placeholder text 'None' into a real missing value so that dropna()
# below removes it. (The earlier `for desc in ...: desc = np.nan` loop only
# rebound its loop variable and never modified the DataFrame.)
df['combined_text'] = df['combined_text'].mask(df['combined_text'].eq('None'))

# Drop every row that has a missing value in any column.
df = df.dropna()

In [7]:
# (rows, columns) of the frame after cleaning.
df.shape

(1473, 10)

In [8]:
# Spot-check the merged, punctuation-free text column.
df['combined_text'].head()

0    Afpak hybrid   Earthy    Chemical    Pine    S...
1    African sativa   Spicy Herbal    Pungent    Ea...
2    Afternoon Delight hybrid   Pepper    Flowery  ...
3    Afwreck hybrid   Pine    Earthy    Flowery    ...
4    Agent Orange hybrid   Citrus    Orange    Swee...
Name: combined_text, dtype: object

## split data as features and target

In [9]:
# Model input: the single merged text column. Label: the strain name.
features = ['combined_text']  # expanding the features medical + flavors
target = 'Strain'

# Both selections use a list of column names, so X and y are DataFrames
# (one column each) rather than Series.
X = df.loc[:, features]
y = df.loc[:, [target]]

In [11]:
# Create the nlp object: load the `en_core_web_md` spaCy pipeline.
# It is used by get_lemmas() for stop-word, punctuation, part-of-speech and
# lemma information. The model must already be installed in the environment.
nlp = spacy.load("en_core_web_md")

# Create a standalone tokenizer object that shares the pipeline's vocabulary;
# it is used by tokenize().
tokenizer = Tokenizer(nlp.vocab)

In [12]:
def tokenize(doc):
        """Split ``doc`` with the module-level ``tokenizer`` and return the token texts.

        Returns a list holding the ``.text`` of every token, in order.
        """
        texts = []
        for piece in tokenizer(doc):
            texts.append(piece.text)
        return texts

In [13]:
def get_lemmas(text):
        """Lemmatize ``text`` with the module-level ``nlp`` pipeline.

        Stop words, punctuation tokens and pronouns (``pos_ == 'PRON'``) are
        skipped; the lemma of every remaining token is returned, in document
        order, as a list of strings.
        """
        return [
            tok.lemma_
            for tok in nlp(text)
            if not tok.is_stop and not tok.is_punct and tok.pos_ != 'PRON'
        ]
    

## transform and fit

In [15]:
text = df["combined_text"]

a = get_lemmas  # alias of the tokenizer function; not used in this cell

# Instantiate vectorizer object.
# get_lemmas is the tokenizer; terms must occur in at least 2.5% and at most
# 98% of the documents; unigrams and bigrams are both extracted.
tfidf = TfidfVectorizer(tokenizer=get_lemmas, min_df=0.025, max_df=.98, ngram_range=(1,2))

# Learn the vocabulary and compute the TF-IDF weights in one pass.
# The result is a sparse document-term matrix (one row per strain).
dtm_sparse = tfidf.fit_transform(text)

# Get feature names to use as dataframe column headers.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (added in 1.0); support both so the cell runs on
# old and new installations.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# toarray() returns a plain ndarray; todense() returns the deprecated np.matrix.
dtm = pd.DataFrame(dtm_sparse.toarray(), columns=feature_names)




In [16]:
# View Feature Matrix as DataFrame
# View Feature Matrix as DataFrame: one row per strain, one column per
# vocabulary term.
print(dtm.shape)
dtm.head()

(1473, 663)


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,depression,depression.1,dizzy,dry,insomnia,pain,anxious,...,woody,woody.1,woody.2,x,yield,yield.1,Unnamed: 18,euphoric,relax,relaxed
0,0.087861,0.0,0.0,0.0,0.051514,0.068274,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.151819,0.0,0.0,0.0,0.059343,0.0,0.084843,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.091437,0.0,0.0,0.0,0.053611,0.071053,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.101672,0.0,0.0,0.120963
3,0.113882,0.0,0.0,0.0,0.0,0.088495,0.0,0.0,0.193981,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.076075,0.0,0.0,0.0,0.044604,0.059116,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Fit on TF-IDF Vectors
# Number of neighbours returned per query unless overridden at query time.
size = 5

# Ball-tree nearest-neighbour index over the TF-IDF rows.
nn = NearestNeighbors(
    algorithm='ball_tree',
    n_neighbors=size,
)

# Fit on TF-IDF Vectors (the fitted estimator is the cell's displayed output).
nn.fit(dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [18]:
# Example free-text request.
# NOTE(review): "insominia" is misspelled, so it cannot match the "insomnia"
# term in the fitted vocabulary and is ignored by the vectorizer — confirm
# whether the typo is intentional before relying on these results.
user_input = ["""I have insominia and need something to help me fall asleep"""]

# Project the request into the fitted TF-IDF space (sparse, one row).
vec_user_input = tfidf.transform(user_input)

# toarray() yields a plain ndarray. todense() yields np.matrix, which recent
# scikit-learn releases reject with a TypeError.
# dist / strain_index each hold one row of `size` neighbours, closest first.
dist, strain_index = nn.kneighbors(vec_user_input.toarray())
        


In [19]:
# Row positions (in the fitted matrix) of the nearest strains for the query.
print(strain_index)

[[ 867  146 1217  997  642]]


In [20]:
#recommended_strains = [df[['Strain','Type','flavors','medical']].iloc[n] for n in strain_index]

#print(recommended_strains)  

In [21]:
# Show the closest match for the query.
# The row position is read from the kneighbors() result (first query, first
# neighbour) instead of a number copied from an earlier printout, so the cell
# stays correct when the data, the model or the query changes.
df[['Strain','Type','flavors','medical']].iloc[strain_index[0][0]]

Strain                                            Madagascar
Type                                                  indica
flavors                         ['Earthy', 'Skunk', 'Sweet']
medical    ['Insomnia', 'Pain', 'Stress', 'Headaches', 'M...
Name: 876, dtype: object

## create a pickle

In [None]:
# Export Pickle File: serialize the fitted nearest-neighbour model.
filename = 'ball_tree_1.pkl'
# `with` guarantees the file is flushed and closed even if dump() raises.
with open(filename, 'wb') as pkl_file:
    pickle.dump(nn, pkl_file)

In [None]:
# Export Pickle File: serialize the fitted nearest-neighbour model.
filename = 'nn_1.pkl'
# `with` guarantees the file is flushed and closed even if dump() raises.
with open(filename, 'wb') as pkl_file:
    pickle.dump(nn, pkl_file)

In [None]:
# Export Pickle File: serialize the TF-IDF document-term DataFrame.
filename = 'vect_1.pkl'
# `with` guarantees the file is flushed and closed even if dump() raises.
with open(filename, 'wb') as pkl_file:
    pickle.dump(dtm, pkl_file)

In [None]:
# Export Pickle File: serialize the fitted TF-IDF vectorizer.
# Pickle stores the vectorizer's tokenizer function by name only, so whoever
# loads this file must have a compatible `get_lemmas` defined under the same
# module name.
filename = 'tfidf_1.pkl'
# `with` guarantees the file is flushed and closed even if dump() raises.
with open(filename, 'wb') as pkl_file:
    pickle.dump(tfidf, pkl_file)

In [None]:
import pickle
def get_lemmas(self, text=None):
        """Return the lemmas of a text, skipping stop words, punctuation and pronouns.

        This module-level function is the tokenizer that the pickled
        ``TfidfVectorizer`` resolves by name, and the vectorizer calls its
        tokenizer with a single positional argument (the document). With the
        former ``(self, text)`` signature that call raised ``TypeError``.

        Both call shapes are accepted:

        * ``get_lemmas(text)`` - the only argument is the document.
        * ``get_lemmas(obj, text)`` - legacy two-argument form; ``obj`` is ignored.

        Relies on a module-level ``nlp`` spaCy pipeline being defined.

        Returns:
            list of str: lemma of every token that is not a stop word, not
            punctuation and not tagged ``'PRON'``, in document order.
        """
        if text is None:
            # Single-argument call: the first positional argument is the text.
            text = self

        return [
            token.lemma_
            for token in nlp(text)
            if not token.is_stop and not token.is_punct and token.pos_ != 'PRON'
        ]

In [None]:
# Load the saved nearest-neighbour model and TF-IDF vectorizer.
# NOTE: pickle.load can execute arbitrary code; only load files produced by a
# trusted run of this notebook.
# `with` closes each file handle as soon as the object has been read.
with open("./StrainAPI/models/nn_1.pkl", "rb") as nn_file:
    nn_pkl = pickle.load(nn_file)

with open("./StrainAPI/models/tfidf_1.pkl", "rb") as tfidf_file:
    tfidf_pkl = pickle.load(tfidf_file)

In [None]:
# Example free-text request.
# NOTE(review): "insominia" is misspelled, so a vocabulary containing
# "insomnia" will not match it — confirm whether the typo is intentional.
user_input = ["I have insominia and need something to help me fall asleep"]

# Number of neighbours to return for this query.
size = 3

# Project the request into the TF-IDF space of the loaded vectorizer.
vec_user_input = tfidf_pkl.transform(user_input)

# toarray() yields a plain ndarray. todense() yields np.matrix, which recent
# scikit-learn releases reject with a TypeError.
dist, strain_index = nn_pkl.kneighbors(vec_user_input.toarray(), n_neighbors=size)


In [None]:
# Row positions of the nearest strains returned by the loaded model.
print(strain_index)