In [25]:
from __future__ import annotations
import pandas as pd
import numpy as np
import spacy

In [26]:
# load in en_core_web_sm model
nlp = spacy.load('../model')

# load in cannabis data
df: pd.DataFrame = pd.read_csv('../data/cannabis.csv')

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2351 entries, 0 to 2350
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Strain       2351 non-null   object 
 1   Type         2351 non-null   object 
 2   Rating       2351 non-null   float64
 3   Effects      2351 non-null   object 
 4   Flavor       2305 non-null   object 
 5   Description  2318 non-null   object 
dtypes: float64(1), object(5)
memory usage: 110.3+ KB


In [28]:
def tokenize(s: str) -> list[str]:
    return [
        token.lemma_ for token in nlp(s)
            if not token.is_punct
            if not token.is_space
            if not token.is_stop
    ]

def tokenize_col(col: str) -> None:
    train[col] = (
        df[col]
            .astype(str)
            .apply(tokenize)
    )
    return

# create a new dataframe to store tokenized fields
train = df[['Strain']].copy()
    
for c in ['Flavor', 'Description', 'Type', 'Effects']:
    tokenize_col(c)

In [29]:
def tokenize_more(s: str) -> list[str]:
    return [
        token.lemma_ for token in nlp(s)
            if not token.is_punct
            if not token.is_space
            if not token.is_stop
            if token.pos_ in ['ADJ', 'VERB', 'NOUN']
    ]

# Use alternative tokenization on description
# this will only include nouns, adjectives, and verbs
train['Description_'] = (
    df['Description']
        .astype(str)
        .apply(tokenize_more)
)

In [30]:
# example of what the tokenization looks like
train['Description_'].iloc[100], train['Effects'].iloc[100]

(['american',
  'indica',
  'dominant',
  'strain',
  'blend',
  'jamaican',
  'hawaiian',
  'genetic',
  'charge',
  'meaning',
  'indica',
  'represent',
  'honest',
  'toil',
  'personal',
  'improvement',
  'enjoy',
  'fruit',
  'labor',
  'put',
  'skunky',
  'flavor',
  'help',
  'pursue',
  'happiness',
  'balanced',
  'uplifting',
  'social',
  'effect',
  'appropriate',
  'day',
  'evening',
  'use'],
 ['focus', 'Euphoric', 'tingly', 'energetic', 'relaxed'])

In [31]:
train['Lemmas'] = (
      train['Flavor'] 
    # + train['Description']
    + train['Description_']
    + train['Effects']
    + train['Type']
)

train['Lemmas'].iloc[0]

['Earthy',
 'Sweet',
 'Citrus',
 'og',
 'hybrid',
 'strain',
 'pack',
 'strong',
 'punch',
 'refer',
 'strength',
 'high',
 'price',
 'start',
 'show',
 'plant',
 'og',
 'tend',
 'produce',
 'large',
 'dark',
 'green',
 'bud',
 'stem',
 'user',
 'report',
 'strong',
 'body',
 'effect',
 'indica',
 'pain',
 'relief',
 'alert',
 'cerebral',
 'feel',
 'thank',
 'sativa',
 'creative',
 'energetic',
 'Tingly',
 'Euphoric',
 'Relaxed',
 'hybrid']

In [32]:
def vectorize(t: list[str]) -> list[float]:
    return nlp(" ".join(t)).vector

# turn the lemmas into vector form
train['Vectors'] = train['Lemmas'].apply(vectorize)

In [33]:
# now turn the vectors into a dataframe matrix
col_nums = train['Vectors'][0].shape[0]
as_list = list(train['Vectors'].values)

# this will let us easily use the vectors in a model
X = pd.DataFrame(as_list, columns=range(col_nums))

In [34]:
X.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,1.746879,1.35692,0.592955,-0.848038,1.752234,0.171649,-0.973718,0.881931,-1.215223,1.673004,...,0.497011,0.046771,0.989567,-0.62327,0.639326,-0.262327,-1.053344,0.226037,0.259191,0.898924
1,1.757072,1.648197,0.828961,-0.399773,1.977969,0.143871,-0.307127,0.811944,-1.378438,2.049792,...,0.07639,-0.322173,1.088978,-0.233739,0.206538,0.001452,-0.548018,-0.027551,-0.340776,0.662717
2,1.97259,0.876835,0.536456,-0.56239,2.514453,-0.119602,-0.691016,0.810648,-1.344577,1.991738,...,0.625878,-0.483538,1.087123,-0.746523,0.189003,0.011752,-0.442292,-0.060571,-0.504641,0.711542
3,1.99352,1.355428,0.872156,-0.705299,1.981034,0.063621,-0.638667,1.049635,-0.624738,1.661174,...,0.259322,-0.191294,1.035414,-0.365564,0.493251,0.004212,-0.853548,-0.343372,-0.193963,0.553127
4,1.505685,1.456757,0.430312,-0.605696,1.849197,-0.073539,-0.865339,0.587997,-1.440432,1.651082,...,0.661791,-0.193317,0.943361,-0.26831,0.948093,-0.020193,-0.593853,-0.346339,-0.231206,0.558517


# Nearest Neighbors Classifier

In [35]:
from sklearn.neighbors import KNeighborsClassifier

nn = KNeighborsClassifier(n_neighbors=4,
                          weights='distance',
                          algorithm='ball_tree')

nn.fit(X, train['Strain'])

KNeighborsClassifier(algorithm='ball_tree', n_neighbors=4, weights='distance')

In [36]:
nn.score(X, train['Strain'])

0.9991492981709911

In [37]:
def predict(desc: list[str]) -> pd.DataFrame:
    if len(desc) == 0:
        return 'N/A'
    
    vector = nlp(" ".join(desc)).vector.reshape(1, -1)

    result = nn.kneighbors(vector, 5)
    
    return df.iloc[result[1][0]]

In [50]:
example = tokenize(
    """I want something that'll make me sleepy and calm my anxiety. I like citrus flavors."""
)

# prints the top five nearest neighbors of the above example in vector form...
predict(example)

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
1615,Pink-Kush,hybrid,4.5,"Relaxed,Happy,Sleepy,Euphoric,Hungry","Sweet,Earthy,Flowery","Pink Kush, as coveted as its OG Kush relative,..."
268,Blucifer,sativa,4.3,"Happy,Uplifted,Relaxed,Talkative,Focused","Blueberry,Spicy/Herbal,Berry",Created in 2013 by Terraform Genetics who cros...
107,Amnesia-Haze,sativa,4.3,"Happy,Euphoric,Uplifted,Energetic,Creative","Earthy,Citrus,Lemon","With earthy flavors of lemons and citrus, Amne..."
1666,Purple-Arrow,hybrid,4.2,"Happy,Uplifted,Focused,Energetic,Relaxed","Sweet,Citrus,Earthy","When it comes to knocking out pain, no medical..."
1228,La-Nina,sativa,4.7,"Uplifted,Creative,Energetic,Talkative,Euphoric","Sweet,Earthy","La Niña, a close relative of El Niño, is a sat..."


# Let's try a different approach...

In [39]:
# reset
df = pd.read_csv('../data/cannabis.csv')

df['Flavor'] = df['Flavor'].replace('None', np.nan)
df['Effects'] = df['Effects'].replace('None', np.nan)

df = df.dropna(subset=['Flavor', 'Effects'])

In [40]:
effectss = df['Effects'].astype(str).apply(lambda s: s.split(','))
effects = [
    effect 
    for effects in effectss
    for effect in effects
]

effects = set(effects)

flavorss = df['Flavor'].astype(str).apply(lambda s: s.split(','))
flavors = [
    flavor
    for flavors in flavorss
    for flavor in flavors
]

flavors = set(flavors)

types = df['Type'].values
types = set(types)

In [41]:
# One hot encode...

cols = (
    flavors
        .union(effects)
        .union(types)
)

pd.DataFrame(columns=cols)

Unnamed: 0,Nutty,Apple,Minty,Relaxed,Menthol,Honey,Orange,Blueberry,Aroused,indica,...,Sage,Happy,Berry,Tree,Hungry,Violet,Tropical,sativa,Citrus,Strawberry


In [42]:
flavors

{'Ammonia',
 'Apple',
 'Apricot',
 'Berry',
 'Blue',
 'Blueberry',
 'Butter',
 'Cheese',
 'Chemical',
 'Chestnut',
 'Citrus',
 'Coffee',
 'Diesel',
 'Earthy',
 'Flowery',
 'Fruit',
 'Grape',
 'Grapefruit',
 'Honey',
 'Lavender',
 'Lemon',
 'Lime',
 'Mango',
 'Menthol',
 'Mint',
 'Minty',
 'Nutty',
 'Orange',
 'Peach',
 'Pear',
 'Pepper',
 'Pine',
 'Pineapple',
 'Plum',
 'Pungent',
 'Rose',
 'Sage',
 'Skunk',
 'Spicy/Herbal',
 'Strawberry',
 'Sweet',
 'Tar',
 'Tea',
 'Tobacco',
 'Tree',
 'Tropical',
 'Vanilla',
 'Violet',
 'Woody'}

In [43]:
X.to_csv('X.csv', index=False, float_format='%.4f')

In [44]:
len(effects)

15

In [45]:
len(flavors)

49