In [4]:
import pandas as pd

# Assuming 'your_dataset.csv' is the name of your dataset CSV file
dataset = pd.read_csv(r'Herbals and preperations.csv', encoding='latin-1')  # Change the separator if needed

# Filter rows with disease 'Diabetes'
diabetes_data = dataset[dataset['disease_category'] == 'Diabetes']

# Remove specified columns
columns_to_remove = ['author', 'herb_part', 'disease_category', 'taste', 'potency', 'ultimate_taste', 'inherent_action']
diabetes_data_filtered = diabetes_data.drop(columns=columns_to_remove)

# Save the filtered data to a new CSV file
diabetes_data_filtered.to_csv('diabetes_filtered_data.csv', index=False)


In [5]:
# Change the separator if needed

# Filter rows with disease 'Diabetes'
diabetes_data = dataset[dataset['disease_category'] == 'Diabetes']

# Group by common drugs and concatenate the 'bot_name' values
grouped_data = diabetes_data.groupby('drug')['bot_name'].apply(lambda x: ' '.join(x)).reset_index()

# Save the result to a text file
with open('output.txt', 'w') as f:
    for index, row in grouped_data.iterrows():
        f.write(row['bot_name'] + '\n')


In [4]:
class Relim:
    def __init__(self, min_support):
        self.min_support = min_support
        self.freq_itemsets = []

    def run_relim(self, transactions, prefix=None):
        if prefix is None:
            prefix = set()

        items = self.get_items(transactions)

        for item in items:
            new_prefix = prefix.copy()
            new_prefix.add(item)

            # Count the support of the new itemset
            support = self.count_support(transactions, new_prefix)

            if support >= self.min_support:
                # Add the frequent itemset to the result
                self.freq_itemsets.append(new_prefix)

                # Generate conditional database for the next recursion
                conditional_database = self.generate_conditional_database(transactions, new_prefix)

                if conditional_database:
                    # Recursively run the algorithm on the conditional database
                    self.run_relim(conditional_database, new_prefix)

    def get_items(self, transactions):
        items = set()
        for transaction in transactions:
            items.update(transaction)
        return items

    def count_support(self, transactions, itemset):
        count = 0
        for transaction in transactions:
            if itemset.issubset(transaction):
                count += 1
        return count

    def generate_conditional_database(self, transactions, itemset):
        conditional_database = []
        for transaction in transactions:
            if itemset.issubset(transaction):
                # Remove items in itemset from the transaction
                new_transaction = transaction - itemset
                conditional_database.append(new_transaction)
        return conditional_database


# Preprocess the dataset
import csv

dataset = []

with open(r'initial-data/Herbals and preperations.csv', newline='', encoding='latin-1') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Consider specific columns as items in each transaction
        transaction = set([row['taste'], row['potency'], row['ultimate_taste']])
        dataset.append(transaction)

# Set the minimum support threshold
min_support = 2

# Run the Relim algorithm
relim = Relim(min_support)
relim.run_relim(dataset)

# Display frequent itemsets
print("Frequent Itemsets:")
for itemset in relim.freq_itemsets:
    print(itemset)


Frequent Itemsets:
{'Bitter, Astringent, Pungent'}
{'Sweet '}
{'Hot'}
{'Bitter, karakarapu'}
{'Sweet, spicy'}
{'Bitter'}
{'Astringent, Bitter '}
{'Bitter, Astringent '}
{'mildBitter , Astringent'}
{'sweet, Astringent '}
{'Bitter , Pungent'}
{'sweet, mildBitter '}
{'Pungent , sweet'}
{'pungent , Sweet'}
{'Pungent , Sweet'}
{'Pungent, Bitter'}
{'Pungent '}
{'Bitter , Viruvirupu'}
{'Bitter , mildAstringent '}
{'Astringent, sweet'}
{'Astringent '}
{'Astringent, viruvirupu '}
{'Bitter , Arpapungent'}
{'Bitter , Pungent '}
{'sweet'}
{'Sour, Astringent, sweet'}
{'Bitter , Astringent'}
{'Bitter(veguttal)'}
{'Bitter (kumatal)'}
{'Pungent'}
{'Astringent , mildBitter '}
{'pungent , Viruvirupu'}
{'null'}
{'Bitter , Astringent '}
{'Bitter, Pungent '}
{'Pungent, viruvirupu'}
{'Cold '}
{'sweet, Pungent'}
{'Bitter, Astringent'}
{'mildBitter, Pungent'}
{'Sweet , Pungent'}
{'Sour'}
{'Astringent , Bitter'}
{'Astringent'}
{'pungent '}
{'hot'}
{'Astringent , Pungent'}
{'sweet, mildBitter'}
{'Sweet'}
{'Spic

In [4]:
class Relim:
    def __init__(self, min_support, min_confidence):
        self.min_support = min_support
        self.min_confidence = min_confidence
        self.freq_itemsets = []
        self.association_rules = []
        self.transactions = []

    def run_relim(self, transactions, prefix=None):
        if prefix is None:
            prefix = set()

        items = self.get_items(transactions)

        for item in items:
            new_prefix = prefix.copy()
            new_prefix.add(item)

            # Count the support of the new itemset
            support = self.count_support(transactions, new_prefix)

            if support >= self.min_support:
                # Add the frequent itemset to the result
                self.freq_itemsets.append((new_prefix, support))

                # Generate conditional database for the next recursion
                conditional_database = self.generate_conditional_database(transactions, new_prefix)

                if conditional_database:
                    # Recursively run the algorithm on the conditional database
                    self.run_relim(conditional_database, new_prefix)

    def generate_association_rules(self):
        for itemset, support in self.freq_itemsets:
            if len(itemset) > 1:
                self.generate_rules_from_itemset(itemset, support)

    def generate_rules_from_itemset(self, itemset, support):
        for i in range(1, len(itemset)):
            antecedent = set(itemset[:i])
            consequent = set(itemset[i:])

            confidence = support / self.count_support(self.transactions, antecedent)

            if confidence >= self.min_confidence:
                rule = (antecedent, consequent, confidence)
                self.association_rules.append(rule)

    def get_items(self, transactions):
        items = set()
        for transaction in transactions:
            items.update(transaction)
        return items

    def count_support(self, transactions, itemset):
        count = 0
        for transaction in transactions:
            if itemset.issubset(transaction):
                count += 1
        return count

    def generate_conditional_database(self, transactions, itemset):
        conditional_database = []
        for transaction in transactions:
            if itemset.issubset(transaction):
                # Remove items in itemset from the transaction
                new_transaction = transaction - itemset
                conditional_database.append(new_transaction)
        return conditional_database


# Preprocess the dataset
import csv

dataset = []

with open(r'initial-data/Herbals and preperations.csv', newline='', encoding='latin-1') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Consider specific columns as items in each transaction
        transaction = set([row['taste'], row['potency'], row['ultimate_taste']])
        dataset.append(transaction)

# Set the minimum support and confidence thresholds
min_support = 0.1
min_confidence = 0.7

# Run the Relim algorithm
relim = Relim(min_support, min_confidence)
relim.transactions = dataset
relim.run_relim(dataset)
relim.generate_association_rules()

# Extract unique disease categories
disease_categories = set()
for row in dataset:
    disease_categories.add(row['disease_category'])

# Display frequent itemsets and their support for each disease
for disease in disease_categories:
    print(f"\nFrequent Itemsets for {disease}:")
    print(",support,itemsets")
    index = 0
    for itemset, support in relim.freq_itemsets:
        if any(keyword.lower() in disease.lower() for keyword in itemset):
            print(f"{index},{support:.6f},{itemset}")
            index += 1


TypeError: 'set' object is not subscriptable

In [10]:
from mlxtend.preprocessing import TransactionEncoder
from pymining import itemmining
import pandas as pd
data = pd.read_csv(r'initial-data/Herbals and preperations.csv', encoding='latin-1')
data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
data=data.dropna(subset=['bot_name'])
data['bot_name'] = data['bot_name'].str.strip()
data = data.join(pd.get_dummies(data['author'].str.lower().str.strip(), prefix='Author'))
data['author'] = data['author'].dropna(axis=0).str.strip()
data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
data=data.dropna(subset=['bot_name'])
data['bot_name'] = data['bot_name'].str.strip()
data.loc[data.bot_name.isin(['terminalia chebula', 'terminalia bellarica', 'phyllanthus emblica']), 'bot_name'] = 'triphala'
data.loc[data.bot_name.isin(['piper nigrum', 'piper longum', 'zingiber officinale']), 'bot_name'] = 'trikatu'
data.loc[(data.disease_category == 'diabetes') & (data.bot_name == 'saccharum officinarum'), 'bot_name'] = 'tinospora cordifolia'
data_dia = data.loc[data['disease_category'].apply(lambda x: x in ['diab/ tb ', 'diabetes'])].copy()
data_tub =  data.loc[data['disease_category'].apply(lambda x: x in ['diab/ tb ', 'tuberculosis '])].copy()
data_dia_aga = data_dia.loc[data_dia['author'].apply(lambda x: x in ['agathiyar'])]
data_dia_the = data_dia.loc[data_dia['author'].apply(lambda x: x in ['therayar'])]
data_tub_aga = data_tub.loc[data_tub['author'].apply(lambda x: x in ['agathiyar'])]
data_tub_the = data_tub.loc[data_tub['author'].apply(lambda x: x in ['therayar'])]
def apply_relim(df):
    # Convert the dataset to the format expected by Relim
    transactions = df.groupby(['drug', 'bot_name']).size().unstack(fill_value=0)
    transactions = transactions.applymap(lambda x: 1 if x > 0 else 0)
    transaction_list = transactions.values.tolist()

    # Perform Relim
    relim_results = itemmining.relim(transaction_list, (0.1, len(transaction_list)))

    # Convert the result to a DataFrame
    itemsets_df = pd.DataFrame([(frozenset(k), v) for k, v in relim_results.items()], columns=['itemsets', 'support'])
    return itemsets_df

def display_relim_results(itemsets_df, count=3, measure='support'):
    itemsets_drugs = itemsets_df[itemsets_df['itemsets'].apply(lambda x: len(x) == count)]
    itemsets_drugs = itemsets_drugs.sort_values(by=measure, ascending=False)
    return itemsets_drugs

# Apply Relim
frequent_itemset_relim = apply_relim(data_dia_aga)

# Display results for different group counts
display_relim_results(frequent_itemset_relim, 3, 'support')
display_relim_results(frequent_itemset_relim, 4, 'support')
display_relim_results(frequent_itemset_relim, 5, 'support')

# Note: Since pymining does not provide direct support for association rules, you may need to implement that part separately.


  transactions = transactions.applymap(lambda x: 1 if x > 0 else 0)


ValueError: too many values to unpack (expected 2)

In [9]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth

# Load your dataset
data = pd.read_csv(r'initial-data/Herbals and preperations.csv', encoding='latin-1')

# Preprocess the data
data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
data = data.dropna(subset=['bot_name'])
data['bot_name'] = data['bot_name'].str.strip()
data = data.join(pd.get_dummies(data['author'].str.lower().str.strip(), prefix='Author'))
data['author'] = data['author'].dropna(axis=0).str.strip()
data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
data = data.dropna(subset=['bot_name'])
data['bot_name'] = data['bot_name'].str.strip()

# Filter data for author "Agathiyar" and disease category "Diabetes"
data_dia_aga = data[(data['author'] == 'agathiyar') & (data['disease_category'].str.lower().str.strip() == 'diabetes')]

# Extract relevant columns for itemset mining
columns_of_interest = ['drug', 'taste', 'potency', 'ultimate_taste', 'inherent_action']
data_for_mining = data_dia_aga[columns_of_interest]

# Convert NaN values to a placeholder (e.g., 'unknown')
data_for_mining = data_for_mining.fillna('unknown')

# Convert the dataset to the format expected by mlxtend
te = TransactionEncoder()
te_ary = te.fit(data_for_mining.values).transform(data_for_mining.values)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Task 1: Compute Frequent Item Set using mlxtend.frequent_patterns.fpgrowth
frequent_itemsets = fpgrowth(df, min_support=0.1, use_colnames=True)

# Task 2: Find Closed frequent itemset using frequent itemset found in Task 1
def closed_frequent_itemsets(frequent_itemsets):
    closed_itemsets = []
    for index, row in frequent_itemsets.iterrows():
        is_closed = True
        itemset = row['itemsets']
        support = row['support']
        for _, other_row in frequent_itemsets.iterrows():
            if index != _:
                other_itemset = other_row['itemsets']
                other_support = other_row['support']
                if itemset.issubset(other_itemset) and support == other_support:
                    is_closed = False
                    break
        if is_closed:
            closed_itemsets.append({'itemsets': list(itemset), 'support': support})
    return closed_itemsets

# Task 3: Display the closed frequent itemsets
closed_itemsets = closed_frequent_itemsets(frequent_itemsets)
closed_df = pd.DataFrame(closed_itemsets)
print('Closed Frequent Itemsets:')
print(closed_df)


Closed Frequent Itemsets:
                      itemsets   support
0                      [sweet]  0.341176
1                      [cold ]  0.335294
2                    [unknown]  0.329412
3                        [hot]  0.458824
4                     [spicy ]  0.447059
5                     [bitter]  0.111765
6                    [legyam ]  0.205882
7            [muyal kirutham ]  0.105882
8         [kool panda legyam ]  0.141176
9   [thalisapathiri choornam ]  0.152941
10              [sweet, cold ]  0.288235
11               [spicy , hot]  0.405882
12       [spicy , bitter, hot]  0.105882
13           [spicy , legyam ]  0.117647
14              [legyam , hot]  0.105882
15      [spicy , legyam , hot]  0.100000


In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules
import os
import matplotlib.pyplot as plt

# Load your dataset
data = pd.read_csv(r'initial-data/Herbals and preperations.csv', encoding='latin-1')

# Preprocess the data
data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
data = data.dropna(subset=['bot_name'])
data['bot_name'] = data['bot_name'].str.strip()
data = data.join(pd.get_dummies(data['author'].str.lower().str.strip(), prefix='Author'))
data['author'] = data['author'].dropna(axis=0).str.strip()
data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
data = data.dropna(subset=['bot_name'])
data['bot_name'] = data['bot_name'].str.strip()

# Create a directory to store outputs
output_dir = 'association_rules_outputs'
os.makedirs(output_dir, exist_ok=True)

# Define categories based on 'author' and 'disease_category'
categories = data.groupby(['author', 'disease_category'])

# Process each category
for category_name, category_data in categories:
    author, disease_category = category_name

    # Extract relevant columns for itemset mining
    columns_of_interest = ['drug', 'taste', 'potency', 'ultimate_taste', 'inherent_action']
    data_for_mining = category_data[columns_of_interest]

    # Convert NaN values to a placeholder (e.g., 'unknown')
    data_for_mining = data_for_mining.fillna('unknown')

    # Convert the dataset to the format expected by mlxtend
    te = TransactionEncoder()
    te_ary = te.fit(data_for_mining.values).transform(data_for_mining.values)
    df = pd.DataFrame(te_ary, columns=te.columns_)

    # Task 1: Compute Frequent Item Set using mlxtend.frequent_patterns.fpgrowth
    frequent_itemsets = fpgrowth(df, min_support=0.1, use_colnames=True)

    # Task 2: Find Closed frequent itemset using frequent itemset found in Task 1
    def closed_frequent_itemsets(frequent_itemsets):
        closed_itemsets = []
        for index, row in frequent_itemsets.iterrows():
            is_closed = True
            itemset = row['itemsets']
            support = row['support']
            for _, other_row in frequent_itemsets.iterrows():
                if index != _:
                    other_itemset = other_row['itemsets']
                    other_support = other_row['support']
                    if itemset.issubset(other_itemset) and support == other_support:
                        is_closed = False
                        break
            if is_closed:
                closed_itemsets.append({'itemsets': list(itemset), 'support': support})
        return closed_itemsets

    # Task 3: Display the closed frequent itemsets
    closed_itemsets = closed_frequent_itemsets(frequent_itemsets)
    closed_df = pd.DataFrame(closed_itemsets)

    # Save the closed frequent itemsets to CSV
    closed_csv_path = os.path.join(output_dir, f'closed_itemsets_{author}_{disease_category}.csv')
    closed_df.to_csv(closed_csv_path, index=False)

    # Task 4: Generate Association Rules
    rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1.0)

    # Save the association rules to CSV
    rules_csv_path = os.path.join(output_dir, f'association_rules_{author}_{disease_category}.csv')
    rules.to_csv(rules_csv_path, index=False)

    # Task 5: Generate and Save Graphs
    plt.figure(figsize=(10, 5))

    # Plot Support vs. Itemsets
    plt.subplot(1, 2, 1)
    plt.barh(range(len(frequent_itemsets)), frequent_itemsets['support'], align='center')
    plt.yticks(range(len(frequent_itemsets)), frequent_itemsets['itemsets'])
    plt.xlabel('Support')
    plt.title('Support vs. Itemsets')

    # Plot Confidence vs. Lift for Association Rules
    plt.subplot(1, 2, 2)
    plt.scatter(rules['confidence'], rules['lift'])
    plt.xlabel('Confidence')
    plt.ylabel('Lift')
    plt.title('Confidence vs. Lift for Association Rules')

    # Save the graph
    graph_path = os.path.join(output_dir, f'association_rules_graph_{author}_{disease_category}.png')
    plt.savefig(graph_path)
    plt.close()

print('Processing completed.')


OSError: Cannot save file into a non-existent directory: 'association_rules_outputs\closed_itemsets_agathiyar_diab'

In [22]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules
import os
import matplotlib.pyplot as plt

# Load your dataset
data = pd.read_csv('Herbals and preperations.csv', encoding='latin-1')

# Preprocess the data
data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
data = data.dropna(subset=['bot_name'])
data['bot_name'] = data['bot_name'].str.strip()
data = data.join(pd.get_dummies(data['author'].str.lower().str.strip(), prefix='Author'))
data['author'] = data['author'].dropna(axis=0).str.strip()
data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
data = data.dropna(subset=['bot_name'])
data['bot_name'] = data['bot_name'].str.strip()

# Define the output directory
output_dir = 'association_rules_outputs'

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Define categories based on 'author' and 'disease_category'
categories = data.groupby(['author', 'disease_category'])

# Process each category
for category_name, category_data in categories:
    author, disease_category = category_name

    # Extract relevant columns for itemset mining
    columns_of_interest = ['drug', 'taste', 'potency', 'ultimate_taste', 'inherent_action']
    data_for_mining = category_data[columns_of_interest]

    # Convert NaN values to a placeholder (e.g., 'unknown')
    data_for_mining = data_for_mining.fillna('unknown')

    # Convert the dataset to the format expected by mlxtend
    te = TransactionEncoder()
    te_ary = te.fit(data_for_mining.values).transform(data_for_mining.values)
    df = pd.DataFrame(te_ary, columns=te.columns_)

    # Task 1: Compute Frequent Item Set using mlxtend.frequent_patterns.fpgrowth
    frequent_itemsets = fpgrowth(df, min_support=0.1, use_colnames=True)

    # Task 2: Find Closed frequent itemset using frequent itemset found in Task 1
    def closed_frequent_itemsets(frequent_itemsets):
        closed_itemsets = []
        for index, row in frequent_itemsets.iterrows():
            is_closed = True
            itemset = row['itemsets']
            support = row['support']
            for _, other_row in frequent_itemsets.iterrows():
                if index != _:
                    other_itemset = other_row['itemsets']
                    other_support = other_row['support']
                    if itemset.issubset(other_itemset) and support == other_support:
                        is_closed = False
                        break
            if is_closed:
                closed_itemsets.append({'itemsets': list(itemset), 'support': support})
        return closed_itemsets

    # Task 3: Display the closed frequent itemsets
    closed_itemsets = closed_frequent_itemsets(frequent_itemsets)
    closed_df = pd.DataFrame(closed_itemsets)

    # Save the closed frequent itemsets to CSV
    closed_csv_path = os.path.join(output_dir, f'closed_itemsets_{author}_{disease_category}.csv')
    closed_df.to_csv(closed_csv_path, index=False)

    # Task 4: Generate Association Rules
    rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1.0)

    # Save the association rules to CSV
    rules_csv_path = os.path.join(output_dir, f'association_rules_{author}_{disease_category}.csv')
    rules.to_csv(rules_csv_path, index=False)

    # Task 5: Generate and Save Graphs
    plt.figure(figsize=(10, 5))

    # Plot Support vs. Itemsets
    plt.subplot(1, 2, 1)
    plt.barh(range(len(frequent_itemsets)), frequent_itemsets['support'], align='center')
    plt.yticks(range(len(frequent_itemsets)), frequent_itemsets['itemsets'])
    plt.xlabel('Support')
    plt.title('Support vs. Itemsets')

    # Plot Confidence vs. Lift for Association Rules
    plt.subplot(1, 2, 2)
    plt.scatter(rules['confidence'], rules['lift'])
    plt.xlabel('Confidence')
    plt.ylabel('Lift')
    plt.title('Confidence vs. Lift for Association Rules')

    # Save the graph
    graph_path = os.path.join(output_dir, f'association_rules_graph_{author}_{disease_category}.png')
    plt.savefig(graph_path)
    plt.close()

print('Processing completed.')


OSError: Cannot save file into a non-existent directory: 'association_rules_outputs\closed_itemsets_agathiyar_diab'