In [7]:
import os
import subprocess

import numpy
import pandas as pd
from mlxtend.frequent_patterns import association_rules

In [8]:
# Preprocessing
data = pd.read_csv(r'../initial-data/Herbals and preperations.csv', encoding='latin-1')

# Lower-case every text column so the string comparisons below are case-insensitive.
data = data.apply(lambda col: col.str.lower() if col.dtype == "object" else col)

# Rows without a botanical name carry no item to mine, so drop them up front.
data = data.dropna(subset=['bot_name'])
data['bot_name'] = data['bot_name'].str.strip()

# .str.strip() leaves missing authors as NaN, so no dropna is needed here.
data['author'] = data['author'].str.strip()

columns_to_remove = ['herb_part', 'taste', 'potency', 'ultimate_taste', 'inherent_action']
data = data.drop(columns=columns_to_remove)

# Relabel each group of three species with a single shared item name.
# NOTE(review): presumably these are the ingredients of the compound
# formulations 'triphala' and 'trikatu' - confirm with the domain owner.
data.loc[data.bot_name.isin(['terminalia chebula', 'terminalia bellarica', 'phyllanthus emblica']), 'bot_name'] = 'triphala'
data.loc[data.bot_name.isin(['piper nigrum', 'piper longum', 'zingiber officinale']), 'bot_name'] = 'trikatu'

# NOTE(review): in diabetes rows 'saccharum officinarum' is rewritten to
# 'tinospora cordifolia'; the reason is not recorded in this notebook.
# disease_category is NOT stripped, so only the exact value 'diabetes' matches.
data.loc[(data.disease_category == 'diabetes') & (data.bot_name == 'saccharum officinarum'), 'bot_name'] = 'tinospora cordifolia'

In [9]:
# Build the lookup table that maps every distinct botanical name to a 1-based
# integer id (ids follow order of first appearance in `data`), and persist it
# so ids found in the mining output can be mapped back to names.
unique_bot_names = data['bot_name'].unique()
bot_name = pd.DataFrame({
    'id': range(1, len(unique_bot_names) + 1),
    'bot_name': unique_bot_names,
})

# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs('./split-data', exist_ok=True)
bot_name.to_csv(r'./split-data/ids.csv', index=False)

In [10]:
# Splitting
#
# One frame per disease, then one per (disease, author) pair.
# The category labels are matched exactly as written here, trailing spaces
# included: preprocessing strips only 'bot_name' and 'author', so
# 'disease_category' presumably still carries the raw file's whitespace.
# Rows labelled 'diab/ tb ' belong to both the diabetes and tuberculosis splits.
in_diabetes = data['disease_category'].isin(['diab/ tb ', 'diabetes'])
in_tuberculosis = data['disease_category'].isin(['diab/ tb ', 'tuberculosis '])

data_dia = data[in_diabetes].copy()
data_tub = data[in_tuberculosis].copy()

data_dia_aga = data_dia[data_dia['author'].eq('agathiyar')]
data_dia_the = data_dia[data_dia['author'].eq('therayar')]
data_tub_aga = data_tub[data_tub['author'].eq('agathiyar')]
data_tub_the = data_tub[data_tub['author'].eq('therayar')]

In [11]:
def write_split_data(data, name):
    """Write one transaction file for a subset of the herb data.

    Every distinct ``drug`` in ``data`` becomes one line of
    ``./split-data/{name}.txt``. The line holds the ids (taken from the
    module-level ``bot_name`` lookup table) of the botanical names listed for
    that drug, de-duplicated, sorted ascending and separated by single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'drug'`` and ``'bot_name'``.
    name : str
        File stem of the transaction file to write.

    Raises
    ------
    ValueError
        If a botanical name in ``data`` has no id in the lookup table. Writing
        it out would otherwise put a non-numeric token in the file.
    """
    # Attach the integer id of each botanical name.
    merged = pd.merge(data, bot_name, on='bot_name', how='left')

    unmapped = merged.loc[merged['id'].isna(), 'bot_name'].unique()
    if len(unmapped):
        raise ValueError(
            f'No id found for bot_name value(s) {list(unmapped)}; '
            'rebuild the id lookup from the full dataset before splitting.'
        )

    os.makedirs('./split-data', exist_ok=True)
    with open(f'./split-data/{name}.txt', 'w') as f:
        for _, drug_ids in merged.groupby('drug')['id']:
            # Sorting makes the file reproducible across runs (plain set order
            # is arbitrary) and gives each transaction a fixed ascending item
            # order with no repeated item.
            item_ids = sorted(set(drug_ids.astype(int)))
            f.write(' '.join(str(item_id) for item_id in item_ids) + '\n')

In [12]:
# Every split that gets its own transaction file, keyed by the file stem.
all_data = {
    'Diabetic-Data-Overall': data_dia,
    'Diabetic-Data-Agathiyar': data_dia_aga,
    'Diabetic-Data-Therayar': data_dia_the,
    'Tuberculosis-Data-Overall': data_tub,
    'Tuberculosis-Data-Agathiyar': data_tub_aga,
    'Tuberculosis-Data-Therayar': data_tub_the
    }

# The loop variables deliberately avoid the name `data`: reusing it would
# overwrite the preprocessed DataFrame that earlier cells read on a re-run.
for split_name, split_df in all_data.items():
    write_split_data(split_df, split_name)

In [47]:
def _parse_spmf_itemsets(text: str) -> pd.DataFrame:
    """Parse SPMF output lines of the form ``'<ids> #SUP: <count>'`` into a DataFrame."""
    supports = {}
    for line in text.splitlines():
        if not line.strip():
            continue
        items, marker, support = line.partition('#SUP:')
        if not marker:
            raise ValueError(f'Unexpected line in SPMF output: {line!r}')
        supports[items.strip()] = support.strip()
    # An empty mapping still yields a frame with both columns present.
    return pd.DataFrame(list(supports.items()), columns=['bot_name', 'support'])


def runSPMF(algorithm:str, name:str, support:int = 15):
    """Run one SPMF algorithm on a split file and return its itemsets.

    Executes ``spmf.jar`` (looked up in the current working directory) on
    ``./split-data/{name}.txt`` and writes the result to
    ``./output-data/{algorithm}/{name}.txt``, creating the folder if needed.

    Parameters
    ----------
    algorithm : str
        Algorithm name passed to ``spmf.jar run``.
    name : str
        File stem of the transaction file, also used for the output file.
    support : int, default 15
        Support threshold, handed to SPMF as a percentage (15 -> ``'15%'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``'bot_name'`` (space-separated item ids, as a string) and
        ``'support'`` (the text after ``#SUP:``, as a string). The frame is
        empty, with both columns present, when SPMF reports no itemsets.

    Raises
    ------
    subprocess.CalledProcessError
        If the Java process exits with a non-zero status.
    FileNotFoundError
        If ``java`` cannot be started or SPMF produced no output file.
    ValueError
        If an output line does not contain ``#SUP:``.
    """
    input_path = f'./split-data/{name}.txt'
    output_dir = f'./output-data/{algorithm}'
    output_path = f'{output_dir}/{name}.txt'

    os.makedirs(output_dir, exist_ok=True)

    # Drop any result left by an earlier run so a failed SPMF invocation
    # cannot be mistaken for a fresh one.
    if os.path.exists(output_path):
        os.remove(output_path)

    # An argument list avoids shell quoting problems with spaces in names,
    # and check=True surfaces a failing Java process instead of ignoring it.
    subprocess.run(
        ['java', '-jar', 'spmf.jar', 'run', algorithm, input_path, output_path, f'{support}%'],
        check=True,
    )

    with open(output_path, 'r') as f:
        return _parse_spmf_itemsets(f.read())


In [56]:
def create_association_rules(df: pd.DataFrame, name: str, min_threshold: float = 0.7, num_transactions=None):
    """Derive association rules from a table of frequent itemsets.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns ``'bot_name'`` (whitespace-separated item ids as one string)
        and ``'support'`` (integer-like). The frame is not modified.
    name : str
        Label used only in the "no rules found" message.
    min_threshold : float, default 0.7
        Minimum confidence a rule must reach to be kept.
    num_transactions : int, optional
        When given, every support value is divided by it before the rules are
        computed. NOTE(review): the support values read from the SPMF output
        look like absolute counts; confidence is a ratio of supports and so is
        unaffected, but the lift, leverage and conviction columns presumably
        need relative supports to be meaningful - pass the number of
        transactions if those columns are used.

    Returns
    -------
    pandas.DataFrame
        The rules produced by ``association_rules``; an empty frame with the
        usual rule columns when ``df`` is empty.
    """
    if df.empty:
        # association_rules is not called on an empty frame; return an empty
        # result with the expected columns instead.
        rules = pd.DataFrame(columns=['antecedents', 'consequents', 'antecedent support', 'consequent support', 'support', 'confidence', 'lift', 'leverage', 'conviction'])
    else:
        support = df['support'].astype(int)
        if num_transactions is not None:
            support = support / num_transactions
        # Build a fresh frame rather than renaming in place, so the caller's
        # DataFrame is left untouched.
        itemsets = pd.DataFrame({
            # '52 53' -> frozenset({'52', '53'})
            'itemsets': df['bot_name'].str.split().apply(frozenset),
            'support': support,
        })
        rules = association_rules(itemsets, metric='confidence', min_threshold=min_threshold)

    # Flag an empty result in red so it stands out among the SPMF log lines.
    if rules.empty:
        print('\033[91m' + f'No rules found for {name} with min_threshold {min_threshold}' + '\033[0m')
    return rules


In [57]:
def mean_performance(algorithm:str, name:str, support_list=(5, 10, 15, 20, 25), confidence_list=(0.4, 0.5, 0.6, 0.7, 0.8)):
    """Average number of association rules over a support x confidence grid.

    Runs ``runSPMF`` once per support value, then counts the rules that
    ``create_association_rules`` returns for every confidence value.

    Parameters
    ----------
    algorithm : str
        Algorithm name forwarded to ``runSPMF``.
    name : str
        Dataset name forwarded to ``runSPMF`` and ``create_association_rules``.
    support_list : sequence of int, default (5, 10, 15, 20, 25)
        Support percentages to try.
    confidence_list : sequence of float, default (0.4, 0.5, 0.6, 0.7, 0.8)
        Confidence thresholds to try.

    Returns
    -------
    float
        Mean rule count over all (support, confidence) pairs; 0.0 when the
        grid is empty.
    """
    rule_counts = []
    for support in support_list:
        # The itemsets depend only on the support value, so mine them once
        # and reuse them for every confidence threshold.
        itemsets = runSPMF(algorithm, name, support=support)
        for confidence in confidence_list:
            rules = create_association_rules(itemsets, name, min_threshold=confidence)
            rule_counts.append(len(rules))

    if not rule_counts:
        return 0.0
    # Average over the individual runs; taking the mean of an already summed
    # scalar would just return the total.
    return numpy.mean(rule_counts)

In [58]:
def get_algorithms_scores(name:str):
    """Score every mining algorithm on one dataset.

    Returns a dict that maps each algorithm name to the value
    ``mean_performance`` reports for it on the dataset ``name``, in the order
    the algorithms are listed below.
    """
    algorithm_names = (
        'Apriori',
        'FPGrowth_itemsets',
        'Apriori_TID',
        'Relim',
        'Eclat',
    )
    return {
        algorithm: mean_performance(algorithm, name=name)
        for algorithm in algorithm_names
    }

In [59]:
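# Dataset names that can be passed to get_algorithms_scores (the keys of all_data):
# 'Diabetic-Data-Overall'
# 'Diabetic-Data-Agathiyar'
# 'Diabetic-Data-Therayar'
# 'Tuberculosis-Data-Overall'
# 'Tuberculosis-Data-Agathiyar'
# 'Tuberculosis-Data-Therayar'

# Example: score all algorithms on a single dataset.
# get_algorithms_scores('Diabetic-Data-Agathiyar')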

>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Candidates count : 595
 The algorithm stopped at size 3
 Frequent itemsets count : 35
 Maximum memory usage : 7.8110809326171875 mb
 Total time ~ 13 ms
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Candidates count : 595
 The algorithm stopped at size 3
 Frequent itemsets count : 35
 Maximum memory usage : 7.8110809326171875 mb
 Total time ~ 16 ms
[91mNo rules found for Diabetic-Data-Agathiyar with min_threshold 0.5[0m
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Candidates count : 595
 The algorithm stopped at size 3
 Frequent itemsets count : 35
 Maximum memory usage : 7.8110809326171875 mb
 Total time ~ 12 ms
[91mNo rules found for Diabetic-Data-Agathiyar with min_threshold 0.6[0m
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Candidates count : 595
 The algori

In [None]:
# Score every algorithm on every split. Only the split names are needed, so
# iterate over the keys; this also avoids rebinding the global `data` frame.
data_scores = {
    split_name: get_algorithms_scores(name=split_name)
    for split_name in all_data
}

In [None]:
# Display the nested {dataset name: {algorithm: score}} mapping built in the previous cell.
data_scores