In [284]:
import pandas as pd
import numpy as np
import re

In [285]:
drop_columns = ['id', 'slug', 'thc', 'cbd']

In [286]:
products = pd.read_csv("./Data/kushy_datasets/Products/products-kushy_api.2017-11-14.csv")

In [287]:
products['strain'] = products['strain'].str.replace('-', ' ')

In [288]:
products = products.drop(columns=drop_columns)

In [289]:
strains = pd.read_csv("./Data/kushy_datasets/Strains/strains-kushy_api.2017-11-14.csv")

In [290]:
strains = strains.rename(columns={"name": "strain"})

In [291]:
strains['strain'] = strains['strain'].str.replace('-', ' ')

In [292]:
strains = strains.drop(columns=drop_columns)

In [293]:
df = products.merge(strains, how='outer', on='strain', suffixes=('_prod_db', '_strain_db'))

In [294]:
ratings = pd.read_csv("./Data/cannabis.csv", na_values=[0, '0'])

In [295]:
ratings['Strain'] = ratings['Strain'].str.replace('-', ' ')

In [296]:
ratings = ratings.rename(columns={"Strain": "strain"})

In [297]:
df = df.merge(ratings, how='left', on='strain', suffixes=('_', '_ratings_db'))

In [298]:
def join_columns(df, columns, prefix='_'):
    if prefix == '_':
        prefix = ''
    else:
        prefix += '_'
    value_set = set()
    df_row_values = []
    
    for row in range(len(df)):
        row_values = set()
        for col in columns:
            try:
                val = re.sub('\s+', '_', df.iloc[row][col].strip())
                
                for value in val.split(','):
                    value_set.add(value.lower().strip('_'))
                    row_values.add(value.lower().strip('_'))
            except AttributeError as e:
                pass
        df_row_values.append(row_values)
    
    df_copy = df.copy()
    df_copy[[prefix+x for x in value_set]] = 0
    
    for i, row in enumerate(df_row_values):
        for col in row:
            df_copy.loc[i, prefix+col] = 1
    
    print(columns)
    print(value_set)            
    df_copy = df_copy.drop(columns=columns)
    
    return df_copy
                
                

In [299]:
df = join_columns(df, ['effects', 'Effects'], 'effect')

['effects', 'Effects']
{'sleepy', 'happy', 'dry_mouth', 'relaxed', 'creative', 'tingly', 'uplifted', 'euphoric', 'paranoid', 'aroused', 'hungry', 'energetic', 'mouth', 'giggly', 'talkative', 'dry', 'focused', 'anxious', 'none', 'horny'}


In [300]:
df = join_columns(df, ['Flavor', 'flavor'], 'flavor')

['Flavor', 'flavor']
{'skunk', 'lavender', 'orange', 'fruit', 'tea', 'coffee', 'blueberry', 'butter', 'mint', 'pungent', 'tree', 'strawberry', 'honey', 'pine', 'minty', 'nutty', 'tropical', 'earthy', 'cheese', 'flowery', 'mango', 'tobacco', 'citrus', 'violet', 'grape', 'sweet', 'rose', 'lemon', 'vanilla', 'menthol', 'sage', 'apricot', 'none', 'pineapple', 'berry', 'plum', 'chestnut', 'pear', 'chemical', 'ammonia', 'spicy/herbal', 'lime', 'peach', 'tar', 'pepper', 'apple', 'woody', 'grapefruit', 'blue', 'diesel'}


In [301]:
df['flavor_tree_fruit'] = df['flavor_tree'] & df['flavor_fruit']
df = df.drop(columns=['flavor_tree', 'flavor_fruit'])

In [302]:
df['flavor_blue_cheese'] = df['flavor_blue'] & df['flavor_cheese']
df = df.drop(columns=['flavor_blue'])

In [303]:
df = join_columns(df, ['ailment'], 'ailment')

['ailment']
{'inflammation', 'lack_of_appetite', 'nausea', 'pain', 'seizures', 'stress', 'insomnia', 'muscle_spasms', 'depression'}


In [304]:
df = join_columns(df, ['category'], 'category')

['category']
{'', 'edibles', 'kief', 'shatter', 'spread', 'wax', 'candy', 'vape_cartidge', 'disposable_vape', 'soup', 'oil', 'crumble', 'pill', 'salt', 'rso', 'dressing', 'pre-roll', 'snack', 'flowers', 'tincture', 'bath', 'bubble_hash', 'vapes', 'concentrate', 'drink', 'topical', 'chocolate'}


In [305]:
df = join_columns(df, ['Type', 'type'], 'type')

['Type', 'type']
{'outdoo', 'organi', 'indoor', 'inorga', 'sativa', 'indica', 'hybrid'}


In [306]:
if 'none' in df.columns:
    df = df.drop(columns=['none'])

In [307]:
df = df.drop(columns=['crosses', 'image', 'description', 'status', 'sort', 
                 'lab_test', 'breeder', 'location', 'terpenes', 'thca', 'thcv', 
                 'cbda', 'cbdv', 'cbn', 'cbg', 'cbgm', 'cbgv', 'cbc', 
                 'cbcv', 'cbv', 'cbe', 'cbt', 'cbl', 'category_'])

In [308]:
df = df.rename(columns={'type_outdoo':'type_outdoor', 'type_organi':'type_organic', 'type_inorga':'type_inorganic', 'Rating':'rating', 'Description':'description', 'flavor_spicy/herbal':'flavor_spicy_herbal'})

In [309]:
df.head()

Unnamed: 0,name,brand,strain,rating,description,effect_sleepy,effect_happy,effect_dry_mouth,effect_relaxed,effect_creative,...,category_drink,category_topical,category_chocolate,type_outdoor,type_organic,type_indoor,type_inorganic,type_sativa,type_indica,type_hybrid
0,Pre-Roll Package - Pre-roll,The Humboldt Cure,,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,ounce of cannabis,The Humboldt Cure,,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1/2 ounce of cannabis,The Humboldt Cure,,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1/4 ounce of cannabis,The Humboldt Cure,,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1/8,The Humboldt Cure,,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [310]:
df.to_csv('weed.csv', index=False)