In [1]:
import pandas as pd
import os
import numpy
from mlxtend.frequent_patterns import association_rules
from FIM import apriori, fpgrowth, eclat, hmine
from FIM import association_rules
from FIM.utils import TransactionEncoder

In [2]:
# Preprocessing
data = pd.read_csv(r'../initial-data/Herbals and preperations.csv', encoding='latin-1')
data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
data=data.dropna(subset=['bot_name'])
data['bot_name'] = data['bot_name'].str.strip()

data['author'] = data['author'].dropna(axis=0).str.strip()
data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
data=data.dropna(subset=['bot_name'])
data['bot_name'] = data['bot_name'].str.strip()
columns_to_remove = ['herb_part', 'taste', 'potency', 'ultimate_taste', 'inherent_action']
data = data.drop(columns=columns_to_remove)

data.loc[data.bot_name.isin(['terminalia chebula', 'terminalia bellarica', 'phyllanthus emblica']), 'bot_name'] = 'triphala'
data.loc[data.bot_name.isin(['piper nigrum', 'piper longum', 'zingiber officinale']), 'bot_name'] = 'trikatu'
data.loc[(data.disease_category == 'diabetes') & (data.bot_name == 'saccharum officinarum'), 'bot_name'] = 'tinospora cordifolia'

In [3]:
# Convert all unique values in bot_name to a new series to be used as id
bot_name = data['bot_name'].unique()
bot_name = pd.Series(bot_name)
bot_name = bot_name.reset_index()
bot_name.columns = ['id', 'bot_name']
bot_name['id'] = bot_name['id'] + 1
if not os.path.exists(f'./split-data'):
    os.makedirs(f'./split-data')
bot_name.to_csv(r'./split-data/ids.csv', index=False)

In [4]:
# Splitting

data_dia = data.loc[data['disease_category'].apply(lambda x: x in ['diab/ tb ', 'diabetes'])].copy()
data_tub =  data.loc[data['disease_category'].apply(lambda x: x in ['diab/ tb ', 'tuberculosis '])].copy()
data_dia_aga = data_dia.loc[data_dia['author'].apply(lambda x: x in ['agathiyar'])]
data_dia_the = data_dia.loc[data_dia['author'].apply(lambda x: x in ['therayar'])]
data_tub_aga = data_tub.loc[data_tub['author'].apply(lambda x: x in ['agathiyar'])]
data_tub_the = data_tub.loc[data_tub['author'].apply(lambda x: x in ['therayar'])]

In [5]:
data_dia

Unnamed: 0,drug,author,bot_name,disease_category
0,parangipattai choornam,agathiyar,smilax china,diabetes
1,parangipattai choornam,agathiyar,tinospora cordifolia,diabetes
2,aavarai kiyalam,agathiyar,cassia auriculata,diabetes
3,aavarai kiyalam,agathiyar,kalamadham,diabetes
4,aavarai kiyalam,agathiyar,cassis fistula,diabetes
...,...,...,...,...
725,kodasalak kulikai,therayar,coccinia grandis,diabetes
726,kodasalak kulikai,therayar,curculigo orchioides,diabetes
727,kodasalak kulikai,therayar,cyperus rotundus,diabetes
728,kodasalak kulikai,therayar,andrographis echioides,diabetes


In [6]:
def enc(data, name):
    def remove_duplicates(row):
        # numbers = row.split()
        unique_numbers = list(set(row))
        return unique_numbers
    # groupby drug and gather all bot_name with similar drug to an array
    grouped_data = data.groupby('drug')['bot_name'].apply(lambda x: x.tolist()).reset_index()
    # convert to series
    return pd.Series(grouped_data['bot_name']).apply(remove_duplicates).tolist()
    # return grouped_data


In [7]:
def apply_apriori(data, name:str, support:float=0.3, confidence:float=0.3):
    te = TransactionEncoder()
    da=enc(data, name)
    df = te.fit_transform(da, set_pandas=True)
    freq_items = apriori(df, min_support=support)
    if freq_items.empty:
        print(f'Frequent Itemsets for {name} is empty with support {support}')
    else:
        rules = association_rules(freq_items, metric="confidence", min_threshold=confidence)
    if rules.empty:
        print(f'Rules for {name} is empty with support {support} and confidence {confidence}')
        rules = pd.DataFrame(columns=['antecedents','consequents','antecedent','support','consequent','support','support','confidence',	'lift',	'kulc'])
    return rules

In [8]:
def apply_apriori(data, name:str, support:float=0.3, confidence:float=0.3):
    te = TransactionEncoder()
    da=enc(data, name)
    df = te.fit_transform(da, set_pandas=True)
    freq_items = apriori(df, min_support=support)
    if freq_items.empty:
        print(f'Frequent Itemsets for {name} is empty with support {support}')
    else:
        rules = association_rules(freq_items, metric="confidence", min_threshold=confidence)
    if rules.empty:
        print(f'Rules for {name} is empty with support {support} and confidence {confidence}')
        rules = pd.DataFrame(columns=['antecedents','consequents','antecedent','support','consequent','support','support','confidence',	'lift',	'kulc'])
    return rules

In [9]:
def apply_fpgrowth(data, name:str, support:float=0.3, confidence:float=0.3):
    te = TransactionEncoder()
    da=enc(data, name)
    df = te.fit_transform(da, set_pandas=True)
    freq_items = fpgrowth(df, min_support=support)
    if freq_items.empty:
        print(f'Frequent Itemsets for {name} is empty with support {support}')
    else:
        rules = association_rules(freq_items, metric="confidence", min_threshold=confidence)
    if rules.empty:
        print(f'Rules for {name} is empty with support {support} and confidence {confidence}')
        rules = pd.DataFrame(columns=['antecedents','consequents','antecedent','support','consequent','support','support','confidence',	'lift',	'kulc'])
    return rules

In [12]:
def apply_eclat(data, name:str, support:float=0.3, confidence:float=0.3):
    te = TransactionEncoder()
    da=enc(data, name)
    df = te.fit_transform(da, set_pandas=True)
    freq_items = eclat(df, min_support=support)
    if freq_items.empty:
        print(f'Frequent Itemsets for {name} is empty with support {support}')
    else:
        rules = association_rules(freq_items, metric="confidence", min_threshold=confidence)
    if rules.empty:
        print(f'Rules for {name} is empty with support {support} and confidence {confidence}')
        rules = pd.DataFrame(columns=['antecedents','consequents','antecedent','support','consequent','support','support','confidence',	'lift',	'kulc'])
    return rules

In [13]:
def apply_hmine(data, name:str, support:float=0.3, confidence:float=0.3):
    te = TransactionEncoder()
    da=enc(data, name)
    df = te.fit_transform(da, set_pandas=True)
    freq_items = hmine(df, min_support=support)
    if freq_items.empty:
        print(f'Frequent Itemsets for {name} is empty with support {support}')
    else:
        rules = association_rules(freq_items, metric="confidence", min_threshold=confidence)
    if rules.empty:
        print(f'Rules for {name} is empty with support {support} and confidence {confidence}')
        rules = pd.DataFrame(columns=['antecedents','consequents','antecedent','support','consequent','support','support','confidence',	'lift',	'kulc'])
    return rules

In [14]:
apply_hmine(data_dia, 'diabetes', support=0.1, confidence=0.3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,kulc
0,(trikatu),(costus speciosus),0.310345,0.155172,0.103448,0.333333,2.148148,0.5
1,(costus speciosus),(trikatu),0.155172,0.310345,0.103448,0.666667,2.148148,0.5
2,(triphala),(costus speciosus),0.241379,0.155172,0.103448,0.428571,2.761905,0.547619
3,(costus speciosus),(triphala),0.155172,0.241379,0.103448,0.666667,2.761905,0.547619
4,(elettaria cardamomum),(syzygium aromaticum),0.103448,0.137931,0.103448,1.0,7.25,0.875
5,(syzygium aromaticum),(elettaria cardamomum),0.137931,0.103448,0.103448,0.75,7.25,0.875
6,(trikatu),(syzygium aromaticum),0.310345,0.137931,0.103448,0.333333,2.416667,0.541667
7,(syzygium aromaticum),(trikatu),0.137931,0.310345,0.103448,0.75,2.416667,0.541667
8,(trikatu),(terminalia bellirica),0.310345,0.155172,0.137931,0.444444,2.864198,0.666667
9,(terminalia bellirica),(trikatu),0.155172,0.310345,0.137931,0.888889,2.864198,0.666667


In [15]:
apply_fpgrowth(data_dia, 'diabetes', support=0.1, confidence=0.3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,kulc
0,(trikatu),(costus speciosus),0.310345,0.155172,0.103448,0.333333,2.148148,0.5
1,(costus speciosus),(trikatu),0.155172,0.310345,0.103448,0.666667,2.148148,0.5
2,(triphala),(costus speciosus),0.241379,0.155172,0.103448,0.428571,2.761905,0.547619
3,(costus speciosus),(triphala),0.155172,0.241379,0.103448,0.666667,2.761905,0.547619
4,(trikatu),(syzygium aromaticum),0.310345,0.137931,0.103448,0.333333,2.416667,0.541667
5,(syzygium aromaticum),(trikatu),0.137931,0.310345,0.103448,0.75,2.416667,0.541667
6,(trikatu),(triphala),0.310345,0.241379,0.137931,0.444444,1.84127,0.507937
7,(triphala),(trikatu),0.241379,0.310345,0.137931,0.571429,1.84127,0.507937
8,(triphala),(terminalia bellirica),0.241379,0.155172,0.137931,0.571429,3.68254,0.730159
9,(terminalia bellirica),(triphala),0.155172,0.241379,0.137931,0.888889,3.68254,0.730159


In [15]:
all_data = {
    'Diabetic-Data-Overall': data_dia,
    'Diabetic-Data-Agathiyar': data_dia_aga,
    'Diabetic-Data-Therayar': data_dia_the,
    'Tuberculosis-Data-Overall': data_tub,
    'Tuberculosis-Data-Agathiyar': data_tub_aga,
    'Tuberculosis-Data-Therayar': data_tub_the
    }

for name, data in all_data.items():
    write_split_data(data, name)

NameError: name 'write_split_data' is not defined

In [7]:

def runSPMF(algorithm:str, name:str, support:int = 15):
    # Create algorithm folder if not exists
    if not os.path.exists(f'./output-data/{algorithm}'):
        os.makedirs(f'./output-data/{algorithm}')
    # if file does not exist create an empty file ./output-data/{algorithm}/{name}.txt 
    if not os.path.exists(f'./output-data/{algorithm}/{name}.txt'):
        open(f'./output-data/{algorithm}/{name}.txt', 'w').close()
    # Run SPMF
    os.system( f"java -jar spmf.jar run {algorithm} ./split-data/{name}.txt ./output-data/{algorithm}/{name}.txt {support}%")
    # Read the result from the output txt file
    with open(f'./output-data/{algorithm}/{name}.txt', 'r') as f:
        # Read output in format => 52 53  #SUP: 
        result = f.read()
    # Split the result into lines
    result = result.split('\n')
    out_dic = {}
    for row in result:
        # Get the bot_name and support value
        if not row:
            continue
        values = row.split(' #SUP: ')
        out_dic[str(values[0])] = values[1]
    out_df = pd.DataFrame.from_dict(out_dic, orient='index').reset_index()
    # if out_df is empty create aan empty dataframe with columns ['bot_name', 'support']
    if out_df.empty:
        out_df = pd.DataFrame(columns=['bot_name', 'support'])

    # Out_dic to df
    out_df.columns = ['bot_name', 'support']
    # Rename columns
    
    def to_bot_name(row):
        ids = row.split()
        bot_names = []
        for id in ids:
            bot_names.append(bot_name.loc[bot_name['id'] == int(id), 'bot_name'].values[0])
        return bot_names
    
    # out_df['bot_name'] = out_df['bot_name'].apply(to_bot_name)
    
    return out_df


In [8]:
def create_association_rules(df: pd.DataFrame, name: str, min_threshold: float = 0.7):
    
    df = df.copy()

    # Convert the 'support' column to integers
    try:
        df['bot_name'] = df['bot_name'].str.split()
    # Convert the 'bot_name' lists to frozensets
    except Exception as e:
        print(df)
        print(name)
        print(min_threshold)
        raise e
    df['support'] = df['support'].astype(int)
    df['bot_name'] = df['bot_name'].apply(frozenset)
    df.rename(columns={'bot_name': 'itemsets'}, inplace=True)
    # Create association rules
    if df.empty:
        rules = pd.DataFrame(columns=['antecedents', 'consequents', 'antecedent support', 'consequent support', 'support', 'confidence', 'lift', 'leverage', 'conviction'])
    else:
        rules = association_rules(df, metric='confidence', min_threshold=min_threshold)
        
    # If still empty print it in red color
    if rules.empty:
        print('\033[91m' + f'No rules found for {name} with min_threshold {min_threshold}' + '\033[0m')
    # Display the generated association rules
    return rules


In [31]:
# A function that calls run spmf and then create_association_rules to get association runs
import time
def mean_performance(algorithm:str, name:str):
    support_list = [10, 15, 20]
    confidence_list = [0.4, 0.5, 0.6]
    score_dic = {}
    performance_score = 0
    for support in support_list:
        supports = runSPMF(algorithm, name, support=support)
        for confidence in confidence_list:
            count = len(create_association_rules(supports, name, min_threshold=confidence))
            score_dic[str(support)+'_'+str(confidence)] = [len(supports), count]
            performance_score += count
            time.sleep(1)
    return score_dic
    return numpy.mean(performance_score/25)

In [48]:
algorithms = [
        'Apriori', 
        'FPGrowth_itemsets', 
        'Apriori_TID', 
        'Relim', 
        'Eclat'
        ]
ss = []
for algorithm in algorithms:
    ss.append(mean_performance(algorithm, 'Diabetic-Data-Overall').values())


>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Candidates count : 136
 The algorithm stopped at size 2
 Frequent itemsets count : 16
 Maximum memory usage : 7.8111724853515625 mb
 Total time ~ 15 ms
[91mNo rules found for Diabetic-Data-Overall with min_threshold 0.4[0m
[91mNo rules found for Diabetic-Data-Overall with min_threshold 0.5[0m
[91mNo rules found for Diabetic-Data-Overall with min_threshold 0.6[0m
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Candidates count : 10
 The algorithm stopped at size 2
 Frequent itemsets count : 4
 Maximum memory usage : 7.812141418457031 mb
 Total time ~ 19 ms
[91mNo rules found for Diabetic-Data-Overall with min_threshold 0.4[0m
[91mNo rules found for Diabetic-Data-Overall with min_threshold 0.5[0m
[91mNo rules found for Diabetic-Data-Overall with min_threshold 0.6[0m
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/s

In [49]:
pd.DataFrame(ss, index =algorithms ).T

Unnamed: 0,Apriori,FPGrowth_itemsets,Apriori_TID,Relim,Eclat
0,"[16, 0]","[24, 17]","[24, 17]","[24, 17]","[24, 17]"
1,"[16, 0]","[24, 14]","[24, 14]","[24, 14]","[24, 14]"
2,"[16, 0]","[24, 11]","[24, 11]","[24, 11]","[24, 11]"
3,"[4, 0]","[4, 0]","[4, 0]","[4, 0]","[4, 0]"
4,"[4, 0]","[4, 0]","[4, 0]","[4, 0]","[4, 0]"
5,"[4, 0]","[4, 0]","[4, 0]","[4, 0]","[4, 0]"
6,"[2, 0]","[2, 0]","[2, 0]","[2, 0]","[2, 0]"
7,"[2, 0]","[2, 0]","[2, 0]","[2, 0]","[2, 0]"
8,"[2, 0]","[2, 0]","[2, 0]","[2, 0]","[2, 0]"


In [34]:
apttemp = mean_performance('Apriori_TID', 'Diabetic-Data-Overall')

>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Transactions count from database : 58
 Frequent itemsets count : 24
 Maximum memory usage : 7.5539703369140625 mb
 Total time ~ 31 ms
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Transactions count from database : 58
 Frequent itemsets count : 4
 Maximum memory usage : 7.8110809326171875 mb
 Total time ~ 8 ms
[91mNo rules found for Diabetic-Data-Overall with min_threshold 0.4[0m
[91mNo rules found for Diabetic-Data-Overall with min_threshold 0.5[0m
[91mNo rules found for Diabetic-Data-Overall with min_threshold 0.6[0m
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Transactions count from database : 58
 Frequent itemsets count : 2
 Maximum memory usage : 7.8110809326171875 mb
 Total time ~ 7 ms
[91mNo rules found for Diabetic-Data-Overall with min_threshold 0.4[0m
[91mNo rules found for Diabetic-Data-Ov

In [37]:
for key, value in apttemp.items():
    key = key.split('_')
    #
    print(key, value)

['10', '0.4'] [24, 17]
['10', '0.5'] [24, 14]
['10', '0.6'] [24, 11]
['15', '0.4'] [4, 0]
['15', '0.5'] [4, 0]
['15', '0.6'] [4, 0]
['20', '0.4'] [2, 0]
['20', '0.5'] [2, 0]
['20', '0.6'] [2, 0]


In [10]:
def get_algorithms_scores(name:str):
    algorithms = [
        'Apriori', 
        'FPGrowth_itemsets', 
        'Apriori_TID', 
        'Relim', 
        'Eclat'
        ]
    
    avg_scores = {}
    for algorithm in algorithms:
        avg_scores[algorithm] = mean_performance(algorithm, name=name)
    return avg_scores

In [11]:
# 'Diabetic-Data-Overall'
# 'Diabetic-Data-Agathiyar'
# 'Diabetic-Data-Therayar'
# 'Tuberculosis-Data-Overall'
# 'Tuberculosis-Data-Agathiyar'
# 'Tuberculosis-Data-Therayar'

# get_algorithms_scores('Diabetic-Data-Agathiyar')

In [12]:
data_scores = {}
for name, data in all_data.items():
    data_scores[name] = get_algorithms_scores(name=name)

>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Candidates count : 903
 The algorithm stopped at size 2
 Frequent itemsets count : 42
 Maximum memory usage : 7.8136444091796875 mb
 Total time ~ 34 ms
[91mNo rules found for Diabetic-Data-Overall with min_threshold 0.4[0m
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Candidates count : 903
 The algorithm stopped at size 2
 Frequent itemsets count : 42
 Maximum memory usage : 7.8136444091796875 mb
 Total time ~ 22 ms
[91mNo rules found for Diabetic-Data-Overall with min_threshold 0.5[0m
>/media/harsh/Docs Volume/Documents/Data Science Community/herbal-analysis/spmf/spmf.jar
 Candidates count : 903
 The algorithm stopped at size 2
 Frequent itemsets count : 42
 Maximum memory usage : 7.8136444091796875 mb
 Total time ~ 22 ms
[91mNo rules found for Diabetic-Data-Overall with min_threshold 0.6[0m
>/media/harsh/Docs Volume/Documents/Data Science Commun

In [None]:
data_scores