In [1]:
import pandas as pd
import os
from mlxtend.frequent_patterns import association_rules

In [2]:
# Preprocessing
data = pd.read_csv(r'../initial-data/Herbals and preperations.csv', encoding='latin-1')
data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
data=data.dropna(subset=['bot_name'])
data['bot_name'] = data['bot_name'].str.strip()

data['author'] = data['author'].dropna(axis=0).str.strip()
data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
data=data.dropna(subset=['bot_name'])
data['bot_name'] = data['bot_name'].str.strip()
columns_to_remove = ['herb_part', 'taste', 'potency', 'ultimate_taste', 'inherent_action']
data = data.drop(columns=columns_to_remove)

data.loc[data.bot_name.isin(['terminalia chebula', 'terminalia bellarica', 'phyllanthus emblica']), 'bot_name'] = 'triphala'
data.loc[data.bot_name.isin(['piper nigrum', 'piper longum', 'zingiber officinale']), 'bot_name'] = 'trikatu'
data.loc[(data.disease_category == 'diabetes') & (data.bot_name == 'saccharum officinarum'), 'bot_name'] = 'tinospora cordifolia'

In [3]:
# Convert all unique values in bot_name to a new series to be used as id
bot_name = data['bot_name'].unique()
bot_name = pd.Series(bot_name)
bot_name = bot_name.reset_index()
bot_name.columns = ['id', 'bot_name']
bot_name['id'] = bot_name['id'] + 1
if not os.path.exists(f'./split-data'):
    os.makedirs(f'./split-data')
bot_name.to_csv(r'./split-data/ids.csv', index=False)

In [4]:
# Splitting

data_dia = data.loc[data['disease_category'].apply(lambda x: x in ['diab/ tb ', 'diabetes'])].copy()
data_tub =  data.loc[data['disease_category'].apply(lambda x: x in ['diab/ tb ', 'tuberculosis '])].copy()
data_dia_aga = data_dia.loc[data_dia['author'].apply(lambda x: x in ['agathiyar'])]
data_dia_the = data_dia.loc[data_dia['author'].apply(lambda x: x in ['therayar'])]
data_tub_aga = data_tub.loc[data_tub['author'].apply(lambda x: x in ['agathiyar'])]
data_tub_the = data_tub.loc[data_tub['author'].apply(lambda x: x in ['therayar'])]

In [5]:
def write_split_data(data, name):
    # group using the id instead of name
    data = pd.merge(data, bot_name, on='bot_name', how='left')
    # Groupby drug and output id of all bot_names instead of name in the text file
    def remove_duplicates(row):
        numbers = row.split()
        unique_numbers = list(set(numbers))
        return ' '.join(unique_numbers)
    grouped_data = data.groupby('drug')['id'].apply(lambda x: ' '.join(x.astype(str))).reset_index()
    grouped_data['id'] = grouped_data['id'].apply(remove_duplicates)
    # grouped_data = data.groupby('drug')['bot_name'].apply(lambda x: ' '.join(x)).reset_index()
    # grouped_data
    # # Save the result to a text file
    with open(f'./split-data/{name}.txt', 'w') as f:
        for index, row in grouped_data.iterrows():
            f.write(row['id'] + '\n')

In [6]:
all_data = {
    'Diabetic-Data-Overall': data_dia,
    'Diabetic-Data-Agathiyar': data_dia_aga,
    'Diabetic-Data-Therayar': data_dia_the,
    'Tuberculosis-Data-Overall': data_tub,
    'Tuberculosis-Data-Agathiyar': data_tub_aga,
    'Tuberculosis-Data-Therayar': data_tub_the
    }
for name, data in all_data.items():
    write_split_data(data, name)

In [11]:
def runSPMF(algorithm, data, name):
    # Create algorithm folder if not exists
    if not os.path.exists(f'./output-data/{algorithm}'):
        os.makedirs(f'./output-data/{algorithm}')
    # Run SPMF
    os.system( f"java -jar spmf.jar run {algorithm} ./split-data/{name}.txt ./output-data/{algorithm}/{name}.txt 5%")
    # Read the result from the output txt file
    with open(f'./output-data/{algorithm}/{name}.txt', 'r') as f:
        # Read output in format => 52 53  #SUP: 
        result = f.read()
    # Split the result into lines
    result = result.split('\n')
    out_dic = {}
    for row in result:
        # Get the bot_name and support value
        if not row:
            continue
        values = row.split(' #SUP: ')
        out_dic[str(values[0])] = values[1]
    # Out_dic to df
    out_df = pd.DataFrame.from_dict(out_dic, orient='index').reset_index()
    # Rename columns
    out_df.columns = ['bot_name', 'support']
    
    def to_bot_name(row):
        ids = row.split()
        bot_names = []
        for id in ids:
            bot_names.append(bot_name.loc[bot_name['id'] == int(id), 'bot_name'].values[0])
        return bot_names
    
    # out_df['bot_name'] = out_df['bot_name'].apply(to_bot_name)
    
    return out_df

In [12]:
all_data_itemset = {}
for name, data in all_data.items():
    all_data_itemset['Apriori_'+ name] = runSPMF('Apriori', data, name)
    all_data_itemset['FPGrowth_itemsets_' + name] = runSPMF('FPGrowth_itemsets', data, name)
    all_data_itemset['Apriori_TID_' + name] = runSPMF('Apriori_TID', data, name)
    all_data_itemset['Relim_' + name] = runSPMF('Relim', data, name)
    all_data_itemset['Eclat_' + name] = runSPMF('Eclat', data, name)

>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Candidates count : 903
 The algorithm stopped at size 3
 Frequent itemsets count : 44
 Maximum memory usage : 7.8136444091796875 mb
 Total time ~ 22 ms
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Transactions count from database : 58
 Max memory usage: 8.028984069824219 mb 
 Frequent itemsets count : 278
 Total time ~ 25 ms
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Transactions count from database : 58
 Frequent itemsets count : 278
 Maximum memory usage : 7.8110809326171875 mb
 Total time ~ 22 ms
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Number of frequent  itemsets: 278
 Total time ~: 738 ms
 Max memory:92.00166320800781
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Transactions count from database : 58
 Frequent

In [None]:
def create_association_rules(df: pd.DataFrame, name: str, min_threshold: float = 0.7):
    df = df.copy()
    df['bot_name'] = df['bot_name'].str.split()

    # Convert the 'support' column to integers
    df['support'] = df['support'].astype(int)

    # Convert the 'bot_name' lists to frozensets
    df['bot_name'] = df['bot_name'].apply(frozenset)
    df.rename(columns={'bot_name': 'itemsets'}, inplace=True)
    # Generate frequent itemsets using the 'bot_name' column
    # Create association rules
    rules = association_rules(df, metric='confidence', min_threshold=min_threshold)
    while rules.empty and min_threshold > 0:
        min_threshold = max(min_threshold - 0.3, 0)
        rules = association_rules(df, metric='confidence', min_threshold=min_threshold)
        
    # If still empty print it in red color
    if rules.empty:
        print('\033[91m' + f'No rules found for {name} with min_threshold {min_threshold}' + '\033[0m')
    # Display the generated association rules
    return rules


In [None]:
association_rules_dic = {}
for name, df in all_data_itemset.items():
    association_rules_dic[name] = create_association_rules(df, name=name)

[91mNo rules found for Apriori_Diabetic-Data-Overall with min_threshold 0[0m
[91mNo rules found for Apriori_Diabetic-Data-Agathiyar with min_threshold 0[0m
[91mNo rules found for Apriori_Diabetic-Data-Therayar with min_threshold 0[0m
[91mNo rules found for Apriori_Tuberculosis-Data-Overall with min_threshold 0[0m
[91mNo rules found for Apriori_Tuberculosis-Data-Agathiyar with min_threshold 0[0m


In [50]:
association_rules_dic

{'Apriori_Diabetic-Data-Overall': Empty DataFrame
 Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction, zhangs_metric]
 Index: [],
 'FPGrowth_itemsets_Diabetic-Data-Overall': Empty DataFrame
 Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction, zhangs_metric]
 Index: [],
 'Apriori_TID_Diabetic-Data-Overall': Empty DataFrame
 Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction, zhangs_metric]
 Index: [],
 'Relim_Diabetic-Data-Overall': Empty DataFrame
 Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction, zhangs_metric]
 Index: [],
 'Eclat_Diabetic-Data-Overall': Empty DataFrame
 Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction, zh