# MediZen

## DS

- Tobias Reaper
- Vera Mendes
- Alex Gerwer

## ML

- Maxime Vacher-Materno

---

![Kate Russel](../images/kate.png)

![Desktop Landing](../screenshots/Desktop-1-Landing.png)

---

## The DS Problem: Strain Recommendations

In [1]:
# Load the strain dataset and take a first look at it
import pandas as pd
# pyjanitor is imported for its side effect only: the `janitor` name is never
# referenced directly, but the feature engineering cell calls pyjanitor
# DataFrame methods (clean_names, concatenate_columns, remove_columns).
import janitor

# Relative path, resolved against the notebook's working directory
datapath = "../../data/cannabis.csv"

df1 = pd.read_csv(datapath)

# Print the (rows, columns) shape, then preview the first five rows
print(df1.shape)
df1.head()

(2351, 6)


Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


![Desktop Filters](../screenshots/Desktop-4-Filters.png)

### Feature Engineering

In [3]:
# Use pyjanitor to wrangle the data and engineer a single text feature.
# Each step below returns a dataframe, so `df1` itself is left untouched.

# Step 1: normalize the column names (in this case, fixes Title Case)
df2 = df1.clean_names()

# Step 2: join type, effects and flavor into one comma-separated string;
# this combined column is the single feature used for NLP analysis
df2 = df2.concatenate_columns(
    column_names=["type", "effects", "flavor"],
    new_column_name="type_effects_flavor",
    sep=",",
)

# Step 3: drop the source columns and everything else the model won't use
df2 = df2.remove_columns(
    column_names=["rating", "description", "type", "effects", "flavor"]
)

In [4]:
# Configure pandas to display (almost) the entire text of each column so the
# concatenated feature string is readable in the preview below.
# The fully qualified option name is used on purpose: abbreviated names such
# as "max_colwidth" are resolved by pattern matching and can become ambiguous
# if pandas ever adds another option with a similar name.
pd.set_option("display.max_colwidth", 200)

df2.head()

Unnamed: 0,strain,type_effects_flavor
0,100-Og,"hybrid,Creative,Energetic,Tingly,Euphoric,Relaxed,Earthy,Sweet,Citrus"
1,98-White-Widow,"hybrid,Relaxed,Aroused,Creative,Happy,Energetic,Flowery,Violet,Diesel"
2,1024,"sativa,Uplifted,Happy,Relaxed,Energetic,Creative,Spicy/Herbal,Sage,Woody"
3,13-Dawgs,"hybrid,Tingly,Creative,Hungry,Relaxed,Uplifted,Apricot,Citrus,Grapefruit"
4,24K-Gold,"hybrid,Happy,Relaxed,Euphoric,Uplifted,Talkative,Citrus,Earthy,Orange"


### Vectorization with TF-IDF

In [7]:
# Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate the vectorizer object
tfidf = TfidfVectorizer(stop_words="english")

# Learn the vocabulary (and IDF weights) from the new feature.
# NOTE: fit() returns the fitted vectorizer itself, so `dtm` refers to the
# same object as `tfidf` -- it is not a document-term matrix.
dtm = tfidf.fit(df2["type_effects_flavor"])

# This fitted vectorizer is what we want to pickle and use in the app
import pickle
with open("vector_vocab.pkl", "wb") as p:
    pickle.dump(dtm, p)

# Create vectorized version of the concatenated feature
sparse = tfidf.transform(df2["type_effects_flavor"])

# Column labels for the dense frame. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2 in favour of get_feature_names_out();
# fall back to the old name so the cell runs on either side of that change.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# The result is a sparse matrix, which can be converted back to a dataframe.
# toarray() yields a plain ndarray; todense() would yield the legacy
# np.matrix type, which NumPy discourages.
vdtm = pd.DataFrame(sparse.toarray(), columns=feature_names)

vdtm

Unnamed: 0,ammonia,apple,apricot,aroused,berry,blue,blueberry,butter,cheese,chemical,...,tar,tea,tingly,tobacco,tree,tropical,uplifted,vanilla,violet,woody
0,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.477579,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000
1,0.0,0.0,0.000000,0.35866,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.691871,0.000000
2,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.160800,0.0,0.000000,0.358211
3,0.0,0.0,0.645008,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.290903,0.0,0.0,0.0,0.144217,0.0,0.000000,0.000000
4,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.210917,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2346,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.234488,0.0,0.000000,0.522364
2347,0.0,0.0,0.000000,0.00000,0.437075,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.218604,0.0,0.000000,0.000000
2348,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000
2349,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000


### Finding the Nearest Neighbors

In [8]:
# Recommendation Model
from sklearn.neighbors import NearestNeighbors

In [16]:
# Instantiate the knn model.
# n_neighbors=10 is only the default neighbor count; the kneighbors() calls
# further down pass their own n_neighbors explicitly.
nn = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')

# Fit (train) the model on the TF-IDF vector dataframe created above.
# Row i of `vdtm` is the vector for row i of the strain data, so the indices
# the model returns later are row positions.
nn.fit(vdtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                 radius=1.0)

In [15]:
# This trained nearest-neighbors model is what we want to pickle and use in
# the app (the fitted vectorizer is pickled separately in the TF-IDF cell).
# NOTE(review): this cell's execution count (15) is lower than that of the
# cell that fits `nn` (16), so the file on disk may predate the latest fit --
# re-run this cell after fitting.
import pickle
with open("nn_rec_model.pkl", "wb") as p:
    pickle.dump(nn, p)

In [17]:
# Example request: desired type, effects and flavors as one comma-separated
# string, mirroring the format of the engineered `type_effects_flavor` feature
input1 = "sativa,happy,energetic,focused,euphoric,earthy,woody,flowery"
num_recs = 10

# Create vector using the vocab that was fit above
input_vector = tfidf.transform([input1])

# Use NN model to calculate the top n similar strains.
# kneighbors() returns (distances, indices); [1] selects the indices and [0]
# the row belonging to our single query vector.
# toarray() is used instead of todense(): todense() returns np.matrix, which
# scikit-learn deprecated as estimator input in 1.0 and rejects from 1.2 on.
top_id = nn.kneighbors(input_vector.toarray(), n_neighbors=num_recs)[1][0]

In [18]:
# Narrow the column display again so the long description text is truncated.
# Fully qualified option name: abbreviated names are resolved by pattern
# matching and may become ambiguous in future pandas versions.
pd.set_option("display.max_colwidth", 60)

# Index-locate the neighbors in original dataframe.
# `top_id` holds row positions, hence the positional .iloc lookup.
top_df = df1.iloc[top_id]

top_df

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
2335,Y-Griega,sativa,4.8,"Happy,Energetic,Uplifted,Focused,Euphoric","Earthy,Woody,Flowery","Also known as simply “Y,” the 80% sativa Y Griega is an ..."
2129,Thai-Tanic,sativa,4.0,"Energetic,Uplifted,Happy,Focused,Euphoric","Sweet,Earthy,Woody",Thai-Tanic is a very compact sativa variety with that cl...
8,3D-Cbd,sativa,4.6,"Uplifted,Focused,Happy,Talkative,Relaxed","Earthy,Woody,Flowery",3D CBD from Snoop Dogg’s branded line of cannabis strain...
987,Harlequin,sativa,4.3,"Relaxed,Focused,Happy,Uplifted,Energetic","Earthy,Sweet,Woody",Harlequin is a 75/25 sativa-dominant strain renowned for...
2125,Thai,sativa,4.2,"Happy,Relaxed,Focused,Uplifted,Energetic","Earthy,Flowery,Sweet",Thai refers to a cannabis variety that grows natively in...
475,Charlottes-Web,sativa,4.5,"Relaxed,Uplifted,Focused,Happy,Energetic","Earthy,Flowery,Sweet",Charlotte’s Web is a cultivar with less than 0.3% THC th...
1100,Jack-Herer,sativa,4.4,"Happy,Uplifted,Energetic,Focused,Euphoric","Earthy,Pine,Woody",Jack Herer is a sativa-dominant cannabis strain that has...
948,Green-Haze,sativa,3.8,"Happy,Talkative,Creative,Focused,Hungry","Woody,Flowery,Earthy",Green Haze by A.C.E. Seeds is another version of their s...
1164,Kali-Mist,sativa,4.1,"Energetic,Focused,Uplifted,Euphoric,Creative","Woody,Earthy,Citrus","Kali Mist is known to deliver clear-headed, energetic ef..."
2047,Super-Green-Crack,sativa,4.5,"Happy,Giggly,Energetic,Focused,Euphoric","Earthy,Flowery,Pungent",Super Green Crack is a true sativa. Like a cup of strong...


![Desktop Discover](../screenshots/Desktop-3-Discover.png)

---

## The FuncZone

In [13]:
def recommend(req, n=10):
    """Recommend the top n strains for a request.

    Parameters
    ----------
    req : str
        Desired type, effects and flavors joined into a single
        comma-separated string.
    n : int, optional
        Number of recommendations to return, by default 10.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df1`` at the positions of the n nearest neighbors,
        in the order returned by the model.
    """
    # Vectorize the request with the already-fitted TF-IDF vocabulary
    req_vec = tfidf.transform([req])

    # kneighbors() returns (distances, indices); keep the indices of the
    # single query row. toarray() gives a plain ndarray, whereas todense()
    # gives np.matrix, which scikit-learn 1.2+ refuses as estimator input.
    top_id = nn.kneighbors(req_vec.toarray(), n_neighbors=n)[1][0]

    # Index-locate the neighbors in the original dataframe
    return df1.iloc[top_id]

### JSON Version

In [14]:
# The API should return a JSON object with only the ids
# Here's a slightly modified version to accomplish that
def recommend_json(req, n=10):
    """Return the row ids of the top n recommended strains as JSON.

    Parameters
    ----------
    req : str
        Desired type, effects and flavors joined into a single
        comma-separated string.
    n : int, optional
        Number of recommendations to return, by default 10.

    Returns
    -------
    str
        JSON array (serialized as a string) holding the integer row
        positions of the n nearest neighbors.
    """
    # Vectorize the request with the already-fitted TF-IDF vocabulary
    req_vec = tfidf.transform([req])

    # kneighbors() returns (distances, indices); keep the indices of the
    # single query row. toarray() gives a plain ndarray, whereas todense()
    # gives np.matrix, which scikit-learn 1.2+ refuses as estimator input.
    rec_id = nn.kneighbors(req_vec.toarray(), n_neighbors=n)[1][0]

    # Convert np.ndarray to pd.Series then to JSON
    rec_json = pd.Series(rec_id).to_json(orient="records")

    return rec_json

---

## Flask API

In [19]:
# Example request from back end, in the form "rec/<n>/<comma-separated terms>"
# (the same shape as the "/rec/<int:n>/<effects>" route of the Flask app below)
request1 = "rec/10/sativa,happy,energetic,focused,euphoric,earthy,woody,flowery"

In [None]:
from flask import Flask
import pickle
import pandas as pd

import os
from dotenv import load_dotenv

# Load environment variables (presumably from a local .env file -- confirm
# which variables the deployment actually relies on)
load_dotenv()

app = Flask(__name__)


# Load in the dataset (path is relative to the app's working directory)
df = pd.read_csv("data/cannabis.csv")

# Extract index as column, so every record served by /strains carries the
# row id that the recommendation route returns
df2 = df.reset_index()

# Load in the pickled vectorizer and knn model.
# SECURITY: pickle.load can execute arbitrary code -- only load artifacts
# that were produced by this project and are shipped with the app.
# NOTE(review): the notebook cells above write "vector_vocab.pkl" and
# "nn_rec_model.pkl"; confirm these two files are the same artifacts.
with open("data/vect_02.pkl", "rb") as p:
    tfidf = pickle.load(p)

with open("data/knn_02.pkl", "rb") as p:
    nn = pickle.load(p)


def recommend(request, n=10):
    """
    Finds the top n recommended strains and returns their ids as JSON.

    Parameters
    ----------
    request : string
        List of user's desired effects, concatenated into a single string.
        Separated by commas.
    n : int, optional
        Number of recommendations to return, by default 10.

    Returns
    -------
    rec_json : string
        JSON array (serialized as a string) of the integer indices of the
        n nearest neighbors, in the order returned by the knn model.
        Presumably these are row positions in the strain dataset the model
        was fit on -- the model is loaded from a pickle, so confirm.
    """

    # Create vector from request string
    request_vec = tfidf.transform([request])

    # Use knn model to calculate the top n strains
    # The recommendations are the top n nearest points (vectors) to the
    # vectorized request, based on the vectorized dataset (vocab).
    # kneighbors() returns (distances, indices); [1][0] keeps the indices of
    # the single query row. toarray() is used because todense() returns
    # np.matrix, which scikit-learn 1.2+ refuses as estimator input.
    rec_id = nn.kneighbors(request_vec.toarray(), n_neighbors=n)[1][0]

    # Convert np.ndarray to pd.Series then to JSON
    rec_json = pd.Series(rec_id).to_json(orient="records")

    return rec_json


@app.route("/rec/<int:n>/<effects>")
def rec(effects, n=10):
    """
    Primary recommendation route.

    Parameters
    ----------
    n : int, optional
        Number of recommendations to return, by default 10.
        The only registered URL rule always supplies ``n``, so the default
        applies only when the function is called directly.
    effects : string
        List of desired effects, comma-delimited.

    Returns
    -------
    top : string
        JSON array of top n recommendations, serialized as a string.
    """

    # Any exception raised here propagates to Flask unchanged; the previous
    # try/except only re-raised it, so the handler was dropped.
    # recommend() already returns a JSON string, so no str() conversion is
    # needed before handing it back as the response body.
    # NOTE(review): the body is JSON but is sent with Flask's default
    # text/html content type -- consider an explicit JSON mimetype once
    # clients are confirmed to cope with it.
    return recommend(effects, n)


@app.route("/strains")
def strains():
    """
    Endpoint that returns a list of all available strains.

    Returns
    -------
    strains_json : string
        JSON array of all available strains, one object per row of ``df2``,
        serialized as a string.
    """

    # Any exception raised here propagates to Flask unchanged; the previous
    # try/except only re-raised it, so the handler was dropped. The local is
    # named so that it no longer shadows this view function's own name.
    strains_json = df2.to_json(orient="records")

    return strains_json
