In [4]:
# Load required packages

import numpy as np
import pandas as pd
from efficient_apriori import apriori

In [7]:
# Read the two raw input files.
# `plants` keeps every line of the plants file as one unsplit string in column 'col'
# (read_table's default tab separator is used and the lines are split on commas later).
plants = pd.read_table(
    "data/plants.data",
    encoding='latin-1',
    names=['col'],
)
# NOTE(review): rows 0, 2 and 58 are skipped - presumably header/separator lines
# in the abbreviation file; confirm against the file itself.
states = pd.read_csv(
    "data/stateabbr.txt",
    encoding='latin-1',
    skiprows=[0, 2, 58],
    names=['States'],
)
# First space-delimited token of every entry, as a NumPy array.  Might not be needed.
all_states = states['States'].str.split(' ').str.get(0).to_numpy()

In [8]:
# Light cleaning to turn each record into a transaction: split the line on commas
# and keep everything after the first field as a tuple.
temp = plants['col'].str.split(',').to_numpy()
transactions = [tuple(fields[1:]) for fields in temp]

# Run the apriori algorithm with explicit thresholds for min_support and
# min_confidence, then show the first rule that was found.
itemsets, rules = apriori(
    transactions=transactions,
    min_support=0.1,
    min_confidence=0.95,
)
print(rules[0])

{fl, ms} -> {al} (conf: 0.955, supp: 0.103, lift: 5.825, conv: 18.536)


In [9]:
# A more involved cleaning step for the later machine-learning tasks.
# Each record is a comma-separated line: the first field is the plant specie and the
# remaining fields are state codes.  `expand=True` splits every line into columns in
# one vectorised call (instead of building a separate Series per row); rows with
# fewer fields are padded with missing values, which are dropped further down.
# The columns are numbered 0..n-1, so column 0 is renamed to 'specie'.
df = (
    plants['col']
    .str.split(',', expand=True)
    .rename(columns={0: 'specie'})
)

In [10]:
# Reshape the wide table into long format (one row per specie/state pair) so that it
# can be one-hot encoded afterwards.
df_temp = (
    df.melt(id_vars=['specie'])
    .drop(columns=['variable'])          # 'variable' only holds the former column names
    .rename(columns={'value': 'state'})  # melt's 'value' column holds the state code
    .dropna()                            # discard rows that contain missing values
    .reset_index(drop=True)              # dropna leaves gaps in the index; renumber from 0
)

# Specie name of every remaining row, in order of appearance, for use later on.
specie = df_temp['specie']

In [11]:
# Step 1 of the one-hot encoding: one indicator column per state code.
state_dummies = pd.get_dummies(df_temp['state'])

# Because the dataset was melted into long format, each row of `state_dummies`
# encodes a single (specie, state) pair.  The form we want is one row per specie
# that covers all of its states.


def add_ohe(df):
    """Collapse the rows of ``df`` into one row by summing every column.

    Args:
        df: DataFrame whose rows are to be combined.

    Returns:
        A Series indexed by the columns of ``df`` holding the column-wise sums.
    """
    return df.sum(axis=0)


# Group the indicator rows by specie and add them together to get the final result.
# The specie names are used as an external grouping key rather than being attached as
# a column: this keeps the string column out of the summation, and avoids depending on
# `groupby.apply` operating on the grouping column (deprecated in recent pandas).
# The built-in `sum` performs the same column-wise addition as `add_ohe`, which is
# no longer called here and is kept only in case another cell still references it.
df_ohe = state_dummies.groupby(specie).sum()

In [None]:
# Split the encoded table into NumPy arrays: the cell values become the feature
# matrix and the row index becomes the matching labels.
features = df_ohe.to_numpy()
labels = df_ohe.index.to_numpy()