In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
def join_columns(df, columns, prefix='_'):
    if prefix == '_':
        prefix = ''
    else:
        prefix += '_'
    value_set = set()
    df_row_values = []
    
    for row in range(len(df)):
        row_values = set()
        for col in columns:
            try:
                val = re.sub('\s+', '_', df.iloc[row][col].strip())
                
                for value in val.split(','):
                    value_set.add(value.lower().strip('_'))
                    row_values.add(value.lower().strip('_'))
            except AttributeError as e:
                pass
        df_row_values.append(row_values)
    
    df_copy = df.copy()
    df_copy[[prefix+x for x in value_set]] = 0
    
    for i, row in enumerate(df_row_values):
        for col in row:
            df_copy.loc[i, prefix+col] = 1
    
    print(columns)
    print(value_set)            
    df_copy = df_copy.drop(columns=columns)
    
    return df_copy
                
                

In [6]:
drop_columns = ['id', 'slug', 'thc', 'cbd']

products = pd.read_csv("./Data/kushy_datasets/Products/products-kushy_api.2017-11-14.csv")
products['strain'] = products['strain'].str.replace('-', ' ')
products = products.drop(columns=drop_columns)

strains = pd.read_csv("./Data/kushy_datasets/Strains/strains-kushy_api.2017-11-14.csv")
strains = strains.rename(columns={"name": "strain"})
strains['strain'] = strains['strain'].str.replace('-', ' ')
strains = strains.drop(columns=drop_columns)

df = products.merge(strains, how='inner', on='strain', suffixes=('_prod_db', '_strain_db'))

ratings = pd.read_csv("./Data/cannabis.csv", na_values=[0, '0'])
ratings['Strain'] = ratings['Strain'].str.replace('-', ' ')
ratings['Strain'] = ratings['Strain'].str.replace('98 white widow', 'white widow')
ratings = ratings.rename(columns={"Strain": "strain"})

df = df.merge(ratings, how='left', on='strain', suffixes=('_', '_ratings_db'))

df = join_columns(df, ['Flavor', 'flavor'], 'flavor')
df['flavor_tree_fruit'] = df['flavor_tree'] & df['flavor_fruit']
df = df.drop(columns=['flavor_tree', 'flavor_fruit'])
df['flavor_blue_cheese'] = df['flavor_blue'] & df['flavor_cheese']
df = df.drop(columns=['flavor_blue'])

df = join_columns(df, ['effects', 'Effects'], 'effect')
df = join_columns(df, ['ailment'], 'ailment')
df = join_columns(df, ['category'], 'category')
df = join_columns(df, ['Type', 'type'], 'type')

if 'none' in df.columns:
    df = df.drop(columns=['none'])
df = df.drop(columns=['crosses', 'image', 'description', 'status', 'sort', 
                 'lab_test', 'breeder', 'location', 'terpenes', 'thca', 'thcv', 
                 'cbda', 'cbdv', 'cbn', 'cbg', 'cbgm', 'cbgv', 'cbc', 
                 'cbcv', 'cbv', 'cbe', 'cbt', 'cbl', 'category_'])

df = df.rename(columns={
                'type_outdoo':'type_outdoor', 
                'type_organi':'type_organic', 
                'type_inorga':'type_inorganic', 
                'Rating':'rating', 
                'Description':'description', 
                'flavor_spicy/herbal':'flavor_spicy_herbal'})


['Flavor', 'flavor']
{'earthy', 'tree', 'pine', 'vanilla', 'sweet', 'chestnut', 'apricot', 'apple', 'skunk', 'grape', 'none', 'diesel', 'lime', 'honey', 'pungent', 'fruit', 'violet', 'menthol', 'chemical', 'cheese', 'mango', 'orange', 'lavender', 'plum', 'citrus', 'tar', 'spicy/herbal', 'minty', 'butter', 'peach', 'nutty', 'mint', 'berry', 'ammonia', 'pear', 'rose', 'pineapple', 'blue', 'tea', 'flowery', 'strawberry', 'grapefruit', 'sage', 'blueberry', 'woody', 'tobacco', 'tropical', 'pepper', 'coffee', 'lemon'}
['effects', 'Effects']
{'giggly', 'paranoid', 'uplifted', 'mouth', 'aroused', 'horny', 'sleepy', 'dry', 'talkative', 'none', 'hungry', 'tingly', 'energetic', 'relaxed', 'focused', 'dry_mouth', 'creative', 'euphoric', 'happy', 'anxious'}
['ailment']
{'insomnia', 'lack_of_appetite', 'stress', 'nausea', 'pain', 'inflammation', 'depression', 'muscle_spasms'}
['category']
{'', 'crumble', 'pre-roll', 'drink', 'bath', 'soup', 'spread', 'wax', 'vape_cartidge', 'snack', 'kief', 'candy',

In [7]:
df

Unnamed: 0,name,brand,strain,rating,description,flavor_earthy,flavor_pine,flavor_vanilla,flavor_sweet,flavor_chestnut,...,category_oil,category_concentrate,category_rso,category_shatter,category_topical,category_edibles,category_flowers,type_indica,type_hybrid,type_sativa
0,Pre-Roll Package - Pre-roll,The Humboldt Cure,,,,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,ounce of cannabis,The Humboldt Cure,,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1/2 ounce of cannabis,The Humboldt Cure,,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1/4 ounce of cannabis,The Humboldt Cure,,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1/8,The Humboldt Cure,,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16702,Red Bull - Oil,Unknown Producer,Red Bull,,,0,0,0,0,0,...,1,1,0,0,0,0,0,0,1,0
16703,Blue Hawaiian - Wax,Platinum Extracts,Blue Hawaiian,4.3,The Blue Hawaiian strain is known for being qu...,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
16704,Durban Dream - Water Hash,Unknown Producer,Durban Dream,,,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
16705,Chem3 OG - Wax,Unknown Producer,Chem3 OG,,,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


In [17]:
df = df.set_index('name')

In [18]:
df = df.drop_duplicates()

In [19]:
df.shape

(11678, 110)

In [20]:
df

Unnamed: 0_level_0,brand,strain,rating,description,flavor_earthy,flavor_pine,flavor_vanilla,flavor_sweet,flavor_chestnut,flavor_apricot,...,category_oil,category_concentrate,category_rso,category_shatter,category_topical,category_edibles,category_flowers,type_indica,type_hybrid,type_sativa
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Pre-Roll Package - Pre-roll,The Humboldt Cure,,,,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
ounce of cannabis,The Humboldt Cure,,,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
CBD-Rich Recovery Salve - 5ml Sample,Turtle Bud,,,,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
Lift Tickets Gucci OG Rosin,Lift Tickets,,,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Salve CBD,Innovative Extrations,,,,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Red Bull - Oil,Unknown Producer,Red Bull,,,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,1,0
Blue Hawaiian - Wax,Platinum Extracts,Blue Hawaiian,4.3,The Blue Hawaiian strain is known for being qu...,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
Durban Dream - Water Hash,Unknown Producer,Durban Dream,,,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
Chem3 OG - Wax,Unknown Producer,Chem3 OG,,,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


In [27]:
df = df.dropna(subset=['strain'])

In [28]:
df.to_csv('./Data/weed.csv')