In [1]:
import pandas as pd
import os
from spmf import Spmf

In [2]:
# Preprocessing
data = pd.read_csv(r'../initial-data/Herbals and preperations.csv', encoding='latin-1')
data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
data=data.dropna(subset=['bot_name'])
data['bot_name'] = data['bot_name'].str.strip()

data['author'] = data['author'].dropna(axis=0).str.strip()
data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
data=data.dropna(subset=['bot_name'])
data['bot_name'] = data['bot_name'].str.strip()
columns_to_remove = ['herb_part', 'taste', 'potency', 'ultimate_taste', 'inherent_action']
data = data.drop(columns=columns_to_remove)

data.loc[data.bot_name.isin(['terminalia chebula', 'terminalia bellarica', 'phyllanthus emblica']), 'bot_name'] = 'triphala'
data.loc[data.bot_name.isin(['piper nigrum', 'piper longum', 'zingiber officinale']), 'bot_name'] = 'trikatu'
data.loc[(data.disease_category == 'diabetes') & (data.bot_name == 'saccharum officinarum'), 'bot_name'] = 'tinospora cordifolia'

In [3]:
# Convert all unique values in bot_name to a new series to be used as id
bot_name = data['bot_name'].unique()
bot_name = pd.Series(bot_name)
bot_name = bot_name.reset_index()
bot_name.columns = ['id', 'bot_name']
bot_name['id'] = bot_name['id'] + 1
if not os.path.exists(f'./split-data'):
    os.makedirs(f'./split-data')
bot_name.to_csv(r'./split-data/ids.csv', index=False)

In [4]:
# Splitting

data_dia = data.loc[data['disease_category'].apply(lambda x: x in ['diab/ tb ', 'diabetes'])].copy()
data_tub =  data.loc[data['disease_category'].apply(lambda x: x in ['diab/ tb ', 'tuberculosis '])].copy()
data_dia_aga = data_dia.loc[data_dia['author'].apply(lambda x: x in ['agathiyar'])]
data_dia_the = data_dia.loc[data_dia['author'].apply(lambda x: x in ['therayar'])]
data_tub_aga = data_tub.loc[data_tub['author'].apply(lambda x: x in ['agathiyar'])]
data_tub_the = data_tub.loc[data_tub['author'].apply(lambda x: x in ['therayar'])]

In [5]:
def write_split_data(data, name):
    # group using the id instead of name
    data = pd.merge(data, bot_name, on='bot_name', how='left')
    # Groupby drug and output id of all bot_names instead of name in the text file
    def remove_duplicates(row):
        numbers = row.split()
        unique_numbers = list(set(numbers))
        return ' '.join(unique_numbers)
    grouped_data = data.groupby('drug')['id'].apply(lambda x: ' '.join(x.astype(str))).reset_index()
    grouped_data['id'] = grouped_data['id'].apply(remove_duplicates)
    # grouped_data = data.groupby('drug')['bot_name'].apply(lambda x: ' '.join(x)).reset_index()
    # grouped_data
    # # Save the result to a text file
    with open(f'./split-data/{name}.txt', 'w') as f:
        for index, row in grouped_data.iterrows():
            f.write(row['id'] + '\n')

In [6]:
all_data = {
    'Diabetic-Data-Overall': data_dia,
    'Diabetic-Data-Agathiyar': data_dia_aga,
    'Diabetic-Data-Therayar': data_dia_the,
    'Tuberculosis-Data-Overall': data_tub,
    'Tuberculosis-Data-Agathiyar': data_tub_aga,
    'Tuberculosis-Data-Therayar': data_tub_the
    }
for name, data in all_data.items():
    write_split_data(data, name)

In [7]:
def runSPMF(algorithm, data, name):
    # Create algorithm folder if not exists
    if not os.path.exists(f'./output-data/{algorithm}'):
        os.makedirs(f'./output-data/{algorithm}')
    # Run SPMF
    os.system( f"java -jar spmf.jar run {algorithm} ./split-data/{name}.txt ./output-data/{algorithm}/{name}.txt 15%")
    # Read the result from the output txt file
    with open(f'./output-data/{algorithm}/{name}.txt', 'r') as f:
        # Read output in format => 52 53  #SUP: 
        result = f.read()
    # Split the result into lines
    result = result.split('\n')
    out_dic = {}
    for row in result:
        # Get the bot_name and support value
        if not row:
            continue
        values = row.split(' #SUP: ')
        out_dic[str(values[0])] = values[1]
    # Out_dic to df
    out_df = pd.DataFrame.from_dict(out_dic, orient='index').reset_index()
    # Rename columns
    out_df.columns = ['bot_name', 'support']
    
    def to_bot_name(row):
        ids = row.split()
        bot_names = []
        for id in ids:
            bot_names.append(bot_name.loc[bot_name['id'] == int(id), 'bot_name'].values[0])
        return bot_names
    
    out_df['bot_name'] = out_df['bot_name'].apply(to_bot_name)
    return out_df

In [9]:
all_data_itemset = {}
for name, data in all_data.items():
    all_data_itemset['Apriori_'+ name] = runSPMF('Apriori', data, name)
    all_data_itemset['FPGrowth_itemsets_' + name] = runSPMF('FPGrowth_itemsets', data, name)
    all_data_itemset['Apriori_TID_' + name] = runSPMF('Apriori_TID', data, name)
    all_data_itemset['Relim_' + name] = runSPMF('Relim', data, name)
    all_data_itemset['Eclat_' + name] = runSPMF('Eclat', data, name)

>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Candidates count : 10
 The algorithm stopped at size 2
 Frequent itemsets count : 4
 Maximum memory usage : 7.812141418457031 mb
 Total time ~ 8 ms
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Transactions count from database : 58
 Max memory usage: 7.813987731933594 mb 
 Frequent itemsets count : 4
 Total time ~ 16 ms
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Transactions count from database : 58
 Frequent itemsets count : 4
 Maximum memory usage : 7.8110809326171875 mb
 Total time ~ 21 ms
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Number of frequent  itemsets: 4
 Total time ~: 12 ms
 Max memory:7.813987731933594
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Transactions count from database : 58
 Frequent itemsets c

In [33]:
import pandas as pd

def generate_association_rules(data, min_support=2, min_confidence=0.7):
  """
  Generates association rules from a DataFrame of frequent itemsets.

  Args:
    data: A pandas DataFrame with columns "bot_name" (list of items) and "support" (frequency).
    min_support: Minimum support threshold.
    min_confidence: Minimum confidence threshold.

  Returns:
    A pandas DataFrame with columns "antecedent", "consequent", "support_antecedent", "support_union", "confidence".
  """

  rules = []
  for i in range(1, len(data)):
    for j in range(i):
      antecedent = set(data.loc[i, "bot_name"])
      consequent = set(data.loc[j, "bot_name"]) - antecedent
      if len(consequent) > 0:
        support_antecedent = data.loc[i, "support"]
        support_union = data.loc[i, "support"] + data.loc[j, "support"]
        confidence = support_union / support_antecedent
        if confidence >= min_confidence:
          rules.append((antecedent, consequent, support_antecedent, support_union, confidence))

  rules = [rule for rule in rules if rule[3] >= min_support]

  return pd.DataFrame(rules, columns=["antecedent", "consequent", "support_antecedent", "support_union", "confidence"])

# Example usage
association_rules = generate_association_rules(all_data_itemset['Relim_Tuberculosis-Data-Therayar'])


In [32]:
association_rules

Unnamed: 0,antecedent,consequent,support_antecedent,support_union,confidence
0,"{saccharum officinarum, cuminum cyminum, myris...",{abies spectabilis},2,4,2.0
1,"{saccharum officinarum, cuminum cyminum, myris...",{abies spectabilis},2,4,2.0
2,"{saccharum officinarum, cuminum cyminum, myris...",{plectranthus vettiveroides},2,4,2.0
3,"{saccharum officinarum, cuminum cyminum, myris...",{plectranthus vettiveroides},2,4,2.0
4,"{saccharum officinarum, cuminum cyminum, myris...","{plectranthus vettiveroides, trikatu}",2,4,2.0
...,...,...,...,...,...
613453,{costus speciosus},{abies spectabilis},5,9,1.8
613454,{costus speciosus},"{trikatu, abies spectabilis}",5,7,1.4
613455,{costus speciosus},{abies spectabilis},5,7,1.4
613456,{costus speciosus},{trikatu},5,10,2.0


In [None]:
from mlxtend.frequent_patterns import association_rules
import pandas as pd

df = all_data_itemset['Relim_Tuberculosis-Data-Therayar']

# Transform the data into a binary format for association rule mining
def encode_units(x):
    return 1 if x else 0

# Apply one-hot encoding to the 'bot_name' column
basket_sets = df['bot_name'].apply(lambda x: pd.Series({item: encode_units(item in x) for item in x}))

# Generate frequent itemsets using Apriori algorithm
frequent_itemsets = basket_sets.dropna().astype(int)

# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

# Display the association rules
print(rules)