In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from collections import Counter
from itertools import chain
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import seaborn as sns
from wordcloud import WordCloud

In [3]:
file_path = "C:/Users/AKSHAYA/OneDrive/Documents/AmazonReviews/cleaned.csv"  # Update with actual file path

df = pd.read_csv(file_path)


In [4]:
df.head

<bound method NDFrame.head of          Unnamed: 0  rating  \
0                 0     5.0   
1                 1     5.0   
2                 2     2.0   
3                 3     1.0   
4                 4     5.0   
...             ...     ...   
2500934     2500934     5.0   
2500935     2500935     1.0   
2500936     2500936     3.0   
2500937     2500937     5.0   
2500938     2500938     1.0   

                                                     title  \
0                                            Pretty locket   
1                                                        A   
2                                                Two Stars   
3                                          Won‚Äôt buy again   
4                                     I LOVE these glasses   
...                                                    ...   
2500934  ... allowed them to be used to add military ri...   
2500935                           Didn‚Äôt come with all ten   
2500936                            

In [60]:
# Count reviews per reviewer
review_counts = df['user_id'].value_counts()

# Map reviewerID to 'Frequent' or 'Infrequent'
df['reviewer_type'] = df['user_id'].map(lambda x: 'Frequent' if review_counts[x] >= 10 else 'Infrequent')


In [61]:
def get_fashion_category(title):
    if not isinstance(title, str):
        return 'Unknown'

    title = title.lower()

    categories = {
        'Tops & Tees': ['t-shirt', 'tee', 'top', 'tank', 'blouse', 'shirt'],
        'Bottoms': ['jeans', 'pants', 'trousers', 'leggings', 'shorts', 'skirt'],
        'Dresses': ['dress', 'gown', 'frock', 'kurti'],
        'Outerwear': ['jacket', 'coat', 'hoodie', 'sweater', 'blazer', 'shrug'],
        'Ethnic Wear': ['saree', 'salwar', 'lehenga', 'kurta', 'kurti', 'dupatta'],
        'Footwear': ['shoes', 'sneakers', 'heels', 'sandals', 'flats', 'slippers', 'boots'],
        'Lingerie & Sleepwear': ['bra', 'panty', 'lingerie', 'nightwear', 'sleepwear', 'camisole', 'nightdress'],
        'Accessories': ['belt', 'cap', 'hat', 'sunglasses', 'scarf', 'watch', 'bag', 'handbag', 'wallet','locket','glasses'],
        'Activewear': ['sports bra', 'gym', 'trackpant', 'activewear', 'joggers'],
        'Kidswear': ['kids', 'child', 'baby', 'infant'],
        'Menswear': ['men', "men's", 'gent'],
        'Womenswear': ['women', "women's", 'lady', 'ladies']
    }

    for category, keywords in categories.items():
        if any(keyword in title for keyword in keywords):
            return category
    return 'Unknown'
if 'title' in df.columns:
    df['fashion_category'] = df['title'].apply(get_fashion_category)
else:
    df['fashion_category'] = 'Unknown'



In [62]:

# Generalize Rating
rating_hierarchy = {
    1: 'Low',
    2: 'Low',
    3: 'Medium',
    4: 'High',
    5: 'High'
}
df['rating_generalized'] = df['rating'].map(rating_hierarchy)

# Add Sentiment based on rating
def map_sentiment(rating):
    if rating >= 4:
        return 'Positive'
    elif rating == 3:
        return 'Neutral'
    else:
        return 'Negative'

df['sentiment'] = df['rating'].apply(map_sentiment)

# Generalize Review Length
def categorize_length(length):
    if length < 50:
        return 'Short'
    elif 50 <= length <= 100:
        return 'Medium'
    else:
        return 'Long'

df['review_length_category'] = df['cleaned_text'].apply(
    lambda x: categorize_length(len(str(x).split()))
)

# Generalize Helpful Votes
def generalize_helpful(votes):
    if votes == 0:
        return 'None'
    elif 1 <= votes <= 5:
        return 'Less Helpful'
    else:
        return 'More Helpful'

df['helpful_vote_category'] = df['helpful_vote'].apply(generalize_helpful)

# Generalize Verified Purchase
df['verified_status'] = df['verified_purchase'].map({True: 'Verified', False: 'Not Verified'})



# Final selection of generalized attributes
generalized_df = df[[
    'rating_generalized',
    'sentiment',
    'review_length_category',
    'helpful_vote_category',
    'verified_status',
    'reviewer_type',
    'fashion_category'
]]

# Display top rows
print(generalized_df.head())


  rating_generalized sentiment review_length_category helpful_vote_category  \
0               High  Positive                  Short          Less Helpful   
1               High  Positive                  Short                  None   
2                Low  Negative                  Short          Less Helpful   
3                Low  Negative                  Short          Less Helpful   
4               High  Positive                  Short                  None   

  verified_status reviewer_type fashion_category  
0        Verified    Infrequent      Accessories  
1        Verified    Infrequent          Unknown  
2        Verified    Infrequent          Unknown  
3        Verified    Infrequent          Unknown  
4        Verified    Infrequent      Accessories  


In [63]:
summary = df.groupby(
    ["reviewer_type", "fashion_category", "rating_generalized", "sentiment", 
     "review_length_category", "helpful_vote_category", "verified_status"]
).agg(
    review_count=("rating", "count"),
    avg_rating=("rating", "mean"),
    avg_helpful_vote=("helpful_vote", "mean")
).reset_index()


In [64]:
summary

Unnamed: 0,reviewer_type,fashion_category,rating_generalized,sentiment,review_length_category,helpful_vote_category,verified_status,review_count,avg_rating,avg_helpful_vote
0,Frequent,Accessories,High,Positive,Long,Less Helpful,Not Verified,61,4.606557,1.770492
1,Frequent,Accessories,High,Positive,Long,Less Helpful,Verified,10,4.700000,2.500000
2,Frequent,Accessories,High,Positive,Long,More Helpful,Not Verified,11,4.909091,24.818182
3,Frequent,Accessories,High,Positive,Long,More Helpful,Verified,6,4.333333,15.500000
4,Frequent,Accessories,High,Positive,Long,,Not Verified,77,4.584416,0.000000
...,...,...,...,...,...,...,...,...,...,...
1003,Infrequent,Womenswear,Medium,Neutral,Medium,,Verified,7,3.000000,0.000000
1004,Infrequent,Womenswear,Medium,Neutral,Short,Less Helpful,Verified,30,3.000000,1.433333
1005,Infrequent,Womenswear,Medium,Neutral,Short,More Helpful,Verified,6,3.000000,11.500000
1006,Infrequent,Womenswear,Medium,Neutral,Short,,Not Verified,11,3.000000,0.000000


In [65]:
# Convert the summary dataframe into a transaction-style format
summary_trans = summary.apply(lambda row: [
    f"Reviewer_Type={row['reviewer_type']}",
    f"Fashion_Category={row['fashion_category']}",  # Added fashion_category
    f"Rating={row['rating_generalized']}",
    f"Sentiment={row['sentiment']}",
    f"Review_Length={row['review_length_category']}",
    f"Helpfulness={row['helpful_vote_category']}",
    f"Verified={row['verified_status']}"
], axis=1)

# Convert the list of transactions to the right format for rule mining
summary_trans = summary_trans.tolist()

# Example: Print the first 5 transactions
print("Sample Transactions (first 5):", summary_trans[:5])


Sample Transactions (first 5): [['Reviewer_Type=Frequent', 'Fashion_Category=Accessories', 'Rating=High', 'Sentiment=Positive', 'Review_Length=Long', 'Helpfulness=Less Helpful', 'Verified=Not Verified'], ['Reviewer_Type=Frequent', 'Fashion_Category=Accessories', 'Rating=High', 'Sentiment=Positive', 'Review_Length=Long', 'Helpfulness=Less Helpful', 'Verified=Verified'], ['Reviewer_Type=Frequent', 'Fashion_Category=Accessories', 'Rating=High', 'Sentiment=Positive', 'Review_Length=Long', 'Helpfulness=More Helpful', 'Verified=Not Verified'], ['Reviewer_Type=Frequent', 'Fashion_Category=Accessories', 'Rating=High', 'Sentiment=Positive', 'Review_Length=Long', 'Helpfulness=More Helpful', 'Verified=Verified'], ['Reviewer_Type=Frequent', 'Fashion_Category=Accessories', 'Rating=High', 'Sentiment=Positive', 'Review_Length=Long', 'Helpfulness=None', 'Verified=Not Verified']]


In [66]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

# Create a TransactionEncoder object to convert the transaction data
te = TransactionEncoder()
te_ary = te.fit(summary_trans).transform(summary_trans)
df_trans = pd.DataFrame(te_ary, columns=te.columns_)

# Apply apriori algorithm with a lower support threshold (e.g., 0.1)
frequent_itemsets = apriori(df_trans, min_support=0.1, use_colnames=True)

# Generate association rules based on confidence with a minimum threshold of 0.6
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

# === Step 2: Filter Rules Where Antecedent Contains 'Reviewer_Type=Frequent' or 'Reviewer_Type=Infrequent' ===
reviewer_type_rules = rules[rules['antecedents'].apply(
    lambda x: any('Reviewer_Type=Frequent' in str(i) or 'Reviewer_Type=Infrequent' in str(i) for i in x))
]

# Sort the rules by lift in descending order
reviewer_type_rules = reviewer_type_rules.sort_values(by='lift', ascending=False)

# === Step 3: Display Rules ===
print("\nüìã Reviewer Type-Based Discrimination Rules:\n")
if not reviewer_type_rules.empty:
    for _, row in reviewer_type_rules.iterrows():
        antecedent = ' AND '.join(list(row['antecedents']))
        consequent = ' AND '.join(list(row['consequents']))
        print(f"üëâ IF {antecedent} THEN {consequent}")
        print(f"   - Support: {round(row['support'], 3)}")
        print(f"   - Confidence: {round(row['confidence'], 3)}")
        print(f"   - Lift: {round(row['lift'], 3)}\n")
else:
    print("No rules generated.")

# === Step 4: Summary Table ===
summary_df = reviewer_type_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].copy()
summary_df['antecedents'] = summary_df['antecedents'].apply(lambda x: ' AND '.join(x))
summary_df['consequents'] = summary_df['consequents'].apply(lambda x: ' AND '.join(x))

# Save the summary to a CSV file
summary_df.to_csv('reviewer_type_discrimination_rules_summary.csv', index=False)

# Print the summary of the rules
print("\nüìÑ Summary of Reviewer Type Discrimination Rules:\n")
print(summary_df.head(10))



üìã Reviewer Type-Based Discrimination Rules:

üëâ IF Reviewer_Type=Infrequent AND Rating=Medium THEN Sentiment=Neutral
   - Support: 0.186
   - Confidence: 1.0
   - Lift: 3.371

üëâ IF Reviewer_Type=Frequent AND Rating=Medium THEN Sentiment=Neutral
   - Support: 0.111
   - Confidence: 1.0
   - Lift: 3.371

üëâ IF Verified=Verified AND Reviewer_Type=Infrequent AND Rating=Medium THEN Sentiment=Neutral
   - Support: 0.103
   - Confidence: 1.0
   - Lift: 3.371

üëâ IF Reviewer_Type=Infrequent AND Sentiment=Neutral THEN Rating=Medium
   - Support: 0.186
   - Confidence: 1.0
   - Lift: 3.371

üëâ IF Reviewer_Type=Frequent AND Sentiment=Neutral THEN Rating=Medium
   - Support: 0.111
   - Confidence: 1.0
   - Lift: 3.371

üëâ IF Reviewer_Type=Infrequent AND Sentiment=Neutral AND Verified=Verified THEN Rating=Medium
   - Support: 0.103
   - Confidence: 1.0
   - Lift: 3.371

üëâ IF Reviewer_Type=Infrequent AND Rating=Low THEN Sentiment=Negative
   - Support: 0.195
   - Confidence: 1.0


In [67]:
# Apply apriori with a lower support threshold (e.g., 0.05)
frequent_itemsets = apriori(df_trans, min_support=0.05, use_colnames=True)

# Generate association rules based on confidence with a minimum threshold of 0.2
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

# === Step 1: Filter for 'Reviewer_Type=Frequent' in antecedents ===
frequent_reviewer_rules = rules[rules['antecedents'].apply(
    lambda x: 'Reviewer_Type=Frequent' in str(x) and len(x) == 1)]  # Strict filter for 'Reviewer_Type=Frequent' alone

# === Step 2: Filter for 'Reviewer_Type=Infrequent' in antecedents ===
infrequent_reviewer_rules = rules[rules['antecedents'].apply(
    lambda x: 'Reviewer_Type=Infrequent' in str(x) and len(x) == 1)]  # Strict filter for 'Reviewer_Type=Infrequent' alone

# === Step 3: Sort the rules by lift in descending order ===
frequent_reviewer_rules = frequent_reviewer_rules.sort_values(by='lift', ascending=False)
infrequent_reviewer_rules = infrequent_reviewer_rules.sort_values(by='lift', ascending=False)

# === Step 4: Display Rules for Frequent Reviewer Type ===
print("\nüìã Reviewer Type-Based Discrimination Rules (Strictly 'Reviewer_Type=Frequent'):\n")
if not frequent_reviewer_rules.empty:
    for _, row in frequent_reviewer_rules.iterrows():
        antecedent = ' AND '.join(list(row['antecedents']))
        consequent = ' AND '.join(list(row['consequents']))
        print(f"üëâ IF {antecedent} THEN {consequent}")
        print(f"   - Support: {round(row['support'], 3)}")
        print(f"   - Confidence: {round(row['confidence'], 3)}")
        print(f"   - Lift: {round(row['lift'], 3)}\n")
else:
    print("No rules generated for 'Reviewer_Type=Frequent'.")

# === Step 5: Display Rules for Infrequent Reviewer Type ===
print("\nüìã Reviewer Type-Based Discrimination Rules (Strictly 'Reviewer_Type=Infrequent'):\n")
if not infrequent_reviewer_rules.empty:
    for _, row in infrequent_reviewer_rules.iterrows():
        antecedent = ' AND '.join(list(row['antecedents']))
        consequent = ' AND '.join(list(row['consequents']))
        print(f"üëâ IF {antecedent} THEN {consequent}")
        print(f"   - Support: {round(row['support'], 3)}")
        print(f"   - Confidence: {round(row['confidence'], 3)}")
        print(f"   - Lift: {round(row['lift'], 3)}\n")
else:
    print("No rules generated for 'Reviewer_Type=Infrequent'.")

# === Step 6: Summary Tables ===
# Create summary tables for each filter
frequent_summary_df = frequent_reviewer_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].copy()
frequent_summary_df['antecedents'] = frequent_summary_df['antecedents'].apply(lambda x: ' AND '.join(x))
frequent_summary_df['consequents'] = frequent_summary_df['consequents'].apply(lambda x: ' AND '.join(x))

infrequent_summary_df = infrequent_reviewer_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].copy()
infrequent_summary_df['antecedents'] = infrequent_summary_df['antecedents'].apply(lambda x: ' AND '.join(x))
infrequent_summary_df['consequents'] = infrequent_summary_df['consequents'].apply(lambda x: ' AND '.join(x))

# Save the summaries to separate CSV files
frequent_summary_df.to_csv('reviewer_type_frequent_discrimination_rules_summary.csv', index=False)
infrequent_summary_df.to_csv('reviewer_type_infrequent_discrimination_rules_summary.csv', index=False)

# Print the summary of the rules
print("\nüìÑ Summary of Reviewer Type Discrimination Rules :\n")
print(frequent_summary_df.head(10))

print("\nüìÑ Summary of Reviewer Type Discrimination Rules :\n")
print(infrequent_summary_df.head(10))



üìã Reviewer Type-Based Discrimination Rules (Strictly 'Reviewer_Type=Frequent'):

üëâ IF Reviewer_Type=Frequent THEN Sentiment=Positive AND Verified=Not Verified AND Rating=High
   - Support: 0.095
   - Confidence: 0.237
   - Lift: 1.183

üëâ IF Reviewer_Type=Frequent THEN Sentiment=Positive AND Verified=Not Verified
   - Support: 0.095
   - Confidence: 0.237
   - Lift: 1.183

üëâ IF Reviewer_Type=Frequent THEN Verified=Not Verified AND Rating=High
   - Support: 0.095
   - Confidence: 0.237
   - Lift: 1.183

üëâ IF Reviewer_Type=Frequent THEN Verified=Not Verified AND Helpfulness=None
   - Support: 0.092
   - Confidence: 0.23
   - Lift: 1.169

üëâ IF Reviewer_Type=Frequent THEN Sentiment=Positive
   - Support: 0.183
   - Confidence: 0.454
   - Lift: 1.136

üëâ IF Reviewer_Type=Frequent THEN Rating=High
   - Support: 0.183
   - Confidence: 0.454
   - Lift: 1.136

üëâ IF Reviewer_Type=Frequent THEN Sentiment=Positive AND Rating=High
   - Support: 0.183
   - Confidence: 0.454
  

In [68]:
from mlxtend.frequent_patterns import apriori, association_rules
import pandas as pd
import os

# Re-run Apriori with lower thresholds to capture more rules
frequent_itemsets = apriori(df_trans, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

# Create output directory
os.makedirs("fashion_category_rules", exist_ok=True)

# Define fashion categories
fashion_categories = [
    'Accessories', 'Tops & Tees', 'Bottoms', 'Dresses', 'Outerwear', 'Ethnic Wear',
    'Footwear', 'Lingerie & Sleepwear', 'Activewear', 'Kidswear', 'Menswear', 'Womenswear'
]

print("\nüéØ Generating Discrimination Rules Based on Fashion_Category \n")

for category in fashion_categories:
    cat_tag = f"Fashion_Category={category}"
    
    # Strict filter: only that category alone in antecedent
    cat_rules = rules[rules['antecedents'].apply(
        lambda x: cat_tag in str(x) and len(x) == 1
    )].sort_values(by='lift', ascending=False)

    print(f"\nüìÇ Rules for '{cat_tag}':")
    if not cat_rules.empty:
        for _, row in cat_rules.iterrows():
            antecedent = ' AND '.join(list(row['antecedents']))
            consequent = ' AND '.join(list(row['consequents']))
            print(f"üëâ IF {antecedent} THEN {consequent}")
            print(f"   - Support: {round(row['support'], 3)}")
            print(f"   - Confidence: {round(row['confidence'], 3)}")
            print(f"   - Lift: {round(row['lift'], 3)}\n")
        
        # Save summary to CSV
        summary_df = cat_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].copy()
        summary_df['antecedents'] = summary_df['antecedents'].apply(lambda x: ' AND '.join(x))
        summary_df['consequents'] = summary_df['consequents'].apply(lambda x: ' AND '.join(x))
        summary_df.to_csv(f"fashion_category_rules/{category}_category_rules_summary.csv", index=False)
    else:
        print(f"‚ùå No rules generated for category '{category}'.")



üéØ Generating Discrimination Rules Based on Fashion_Category 


üìÇ Rules for 'Fashion_Category=Accessories':
üëâ IF Fashion_Category=Accessories THEN Reviewer_Type=Frequent AND Verified=Verified
   - Support: 0.023
   - Confidence: 0.235
   - Lift: 1.201

üëâ IF Fashion_Category=Accessories THEN Reviewer_Type=Frequent
   - Support: 0.045
   - Confidence: 0.459
   - Lift: 1.143

üëâ IF Fashion_Category=Accessories THEN Sentiment=Negative
   - Support: 0.033
   - Confidence: 0.337
   - Lift: 1.109

üëâ IF Fashion_Category=Accessories THEN Rating=Low
   - Support: 0.033
   - Confidence: 0.337
   - Lift: 1.109

üëâ IF Fashion_Category=Accessories THEN Rating=Low AND Sentiment=Negative
   - Support: 0.033
   - Confidence: 0.337
   - Lift: 1.109

üëâ IF Fashion_Category=Accessories THEN Review_Length=Long
   - Support: 0.031
   - Confidence: 0.316
   - Lift: 1.088

üëâ IF Fashion_Category=Accessories THEN Verified=Not Verified AND Reviewer_Type=Frequent
   - Support: 0.022
   - C

In [69]:
from mlxtend.frequent_patterns import apriori, association_rules
import pandas as pd
import os

# Run Apriori (adjust thresholds as needed)
frequent_itemsets = apriori(df_trans, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

# Create output directory
os.makedirs("verified_status_rules", exist_ok=True)

# Define verified status values (fix column names)
verified_statuses = ['Verified=Verified', 'Verified=Not Verified']

print("\nüîç Generating Discrimination Rules Based on Verified_Status\n")

for status_tag in verified_statuses:
    print(f"\nüìÅ Rules for '{status_tag}':")
    
    # Only include rules where that status is the only antecedent
    status_rules = rules[rules['antecedents'].apply(
        lambda x: status_tag in x and len(x) == 1
    )].sort_values(by='lift', ascending=False)
    
    if not status_rules.empty:
        for _, row in status_rules.iterrows():
            antecedent = ' AND '.join(list(row['antecedents']))
            consequent = ' AND '.join(list(row['consequents']))
            print(f"üëâ IF {antecedent} THEN {consequent}")
            print(f"   - Support: {round(row['support'], 3)}")
            print(f"   - Confidence: {round(row['confidence'], 3)}")
            print(f"   - Lift: {round(row['lift'], 3)}\n")

        # Save to CSV
        summary_df = status_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].copy()
        summary_df['antecedents'] = summary_df['antecedents'].apply(lambda x: ' AND '.join(x))
        summary_df['consequents'] = summary_df['consequents'].apply(lambda x: ' AND '.join(x))
        filename = status_tag.replace("=", "_") + "_rules_summary.csv"
        summary_df.to_csv(f"verified_status_rules/{filename}", index=False)
    else:
        print(f"‚ùå No rules generated for status '{status_tag}'.")



üîç Generating Discrimination Rules Based on Verified_Status


üìÅ Rules for 'Verified=Verified':
üëâ IF Verified=Verified THEN Helpfulness=More Helpful
   - Support: 0.154
   - Confidence: 0.3
   - Lift: 1.123

üëâ IF Verified=Verified THEN Helpfulness=More Helpful AND Reviewer_Type=Infrequent
   - Support: 0.103
   - Confidence: 0.201
   - Lift: 1.114

üëâ IF Verified=Verified THEN Reviewer_Type=Infrequent AND Sentiment=Neutral AND Rating=Medium
   - Support: 0.103
   - Confidence: 0.201
   - Lift: 1.084

üëâ IF Verified=Verified THEN Reviewer_Type=Infrequent AND Sentiment=Neutral
   - Support: 0.103
   - Confidence: 0.201
   - Lift: 1.084

üëâ IF Verified=Verified THEN Reviewer_Type=Infrequent AND Rating=Medium
   - Support: 0.103
   - Confidence: 0.201
   - Lift: 1.084

üëâ IF Verified=Verified THEN Reviewer_Type=Infrequent AND Review_Length=Medium
   - Support: 0.107
   - Confidence: 0.209
   - Lift: 1.048

üëâ IF Verified=Verified THEN Review_Length=Short
   - Support: 

In [70]:
df_trans

Unnamed: 0,Fashion_Category=Accessories,Fashion_Category=Activewear,Fashion_Category=Bottoms,Fashion_Category=Dresses,Fashion_Category=Ethnic Wear,Fashion_Category=Footwear,Fashion_Category=Kidswear,Fashion_Category=Lingerie & Sleepwear,Fashion_Category=Menswear,Fashion_Category=Outerwear,...,Review_Length=Long,Review_Length=Medium,Review_Length=Short,Reviewer_Type=Frequent,Reviewer_Type=Infrequent,Sentiment=Negative,Sentiment=Neutral,Sentiment=Positive,Verified=Not Verified,Verified=Verified
0,True,False,False,False,False,False,False,False,False,False,...,True,False,False,True,False,False,False,True,True,False
1,True,False,False,False,False,False,False,False,False,False,...,True,False,False,True,False,False,False,True,False,True
2,True,False,False,False,False,False,False,False,False,False,...,True,False,False,True,False,False,False,True,True,False
3,True,False,False,False,False,False,False,False,False,False,...,True,False,False,True,False,False,False,True,False,True
4,True,False,False,False,False,False,False,False,False,False,...,True,False,False,True,False,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,True,False,True,False,False,True
1004,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,True,False,True,False,False,True
1005,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,True,False,True,False,False,True
1006,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,True,False,True,False,True,False


In [71]:
from mlxtend.frequent_patterns import apriori, association_rules
import pandas as pd
import os

# Run Apriori (adjust thresholds as needed)
frequent_itemsets = apriori(df_trans, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

# Create output directory
os.makedirs("review_length_rules", exist_ok=True)

# Define review length categories
review_lengths = ['Review_Length=Short', 'Review_Length=Long']

print("\nüìù Generating Discrimination Rules Based on Review_Length\n")

for length_tag in review_lengths:
    print(f"\nüìÅ Rules for '{length_tag}':")
    
    # Filter rules where the only antecedent is review length
    length_rules = rules[rules['antecedents'].apply(
        lambda x: length_tag in x and len(x) == 1
    )].sort_values(by='lift', ascending=False)
    
    if not length_rules.empty:
        for _, row in length_rules.iterrows():
            antecedent = ' AND '.join(list(row['antecedents']))
            consequent = ' AND '.join(list(row['consequents']))
            print(f"üëâ IF {antecedent} THEN {consequent}")
            print(f"   - Support: {round(row['support'], 3)}")
            print(f"   - Confidence: {round(row['confidence'], 3)}")
            print(f"   - Lift: {round(row['lift'], 3)}\n")

        # Save to CSV
        summary_df = length_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].copy()
        summary_df['antecedents'] = summary_df['antecedents'].apply(lambda x: ' AND '.join(x))
        summary_df['consequents'] = summary_df['consequents'].apply(lambda x: ' AND '.join(x))
        filename = length_tag.replace("=", "_") + "_rules_summary.csv"
        summary_df.to_csv(f"review_length_rules/{filename}", index=False)
    else:
        print(f"‚ùå No rules generated for review length '{length_tag}'.")



üìù Generating Discrimination Rules Based on Review_Length


üìÅ Rules for 'Review_Length=Short':
üëâ IF Review_Length=Short THEN Reviewer_Type=Frequent AND Verified=Verified
   - Support: 0.083
   - Confidence: 0.226
   - Lift: 1.155

üëâ IF Review_Length=Short THEN Reviewer_Type=Frequent
   - Support: 0.155
   - Confidence: 0.419
   - Lift: 1.044

üëâ IF Review_Length=Short THEN Verified=Verified
   - Support: 0.196
   - Confidence: 0.532
   - Lift: 1.038

üëâ IF Review_Length=Short THEN Rating=Low AND Sentiment=Negative
   - Support: 0.116
   - Confidence: 0.315
   - Lift: 1.036

üëâ IF Review_Length=Short THEN Rating=Low
   - Support: 0.116
   - Confidence: 0.315
   - Lift: 1.036

üëâ IF Review_Length=Short THEN Sentiment=Negative
   - Support: 0.116
   - Confidence: 0.315
   - Lift: 1.036

üëâ IF Review_Length=Short THEN Helpfulness=None
   - Support: 0.141
   - Confidence: 0.382
   - Lift: 1.026

üëâ IF Review_Length=Short THEN Rating=Medium
   - Support: 0.112
   - Co

In [72]:
from mlxtend.frequent_patterns import apriori, association_rules
import pandas as pd
import os

# Run Apriori (adjust thresholds as needed)
frequent_itemsets = apriori(df_trans, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

# Create output directory
os.makedirs("sentiment_rules", exist_ok=True)

# Define sentiment tags
sentiment_tags = ['Sentiment=Positive','Sentiment=Neutral', 'Sentiment=Negative']

print("\nüß† Generating Discrimination Rules Based on Sentiment\n")

for sentiment in sentiment_tags:
    print(f"\nüìÅ Rules for '{sentiment}':")
    
    # Filter rules where the only antecedent is the sentiment tag
    sentiment_rules = rules[rules['antecedents'].apply(
        lambda x: sentiment in x and len(x) == 1
    )].sort_values(by='lift', ascending=False)
    
    if not sentiment_rules.empty:
        for _, row in sentiment_rules.iterrows():
            antecedent = ' AND '.join(list(row['antecedents']))
            consequent = ' AND '.join(list(row['consequents']))
            print(f"üëâ IF {antecedent} THEN {consequent}")
            print(f"   - Support: {round(row['support'], 3)}")
            print(f"   - Confidence: {round(row['confidence'], 3)}")
            print(f"   - Lift: {round(row['lift'], 3)}\n")

        # Save to CSV
        summary_df = sentiment_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].copy()
        summary_df['antecedents'] = summary_df['antecedents'].apply(lambda x: ' AND '.join(x))
        summary_df['consequents'] = summary_df['consequents'].apply(lambda x: ' AND '.join(x))
        filename = sentiment.replace("=", "_") + "_rules_summary.csv"
        summary_df.to_csv(f"sentiment_rules/{filename}", index=False)
    else:
        print(f"‚ùå No rules generated for sentiment '{sentiment}'.")



üß† Generating Discrimination Rules Based on Sentiment


üìÅ Rules for 'Sentiment=Positive':
üëâ IF Sentiment=Positive THEN Helpfulness=None AND Rating=High
   - Support: 0.139
   - Confidence: 0.347
   - Lift: 2.501

üëâ IF Sentiment=Positive THEN Rating=High
   - Support: 0.4
   - Confidence: 1.0
   - Lift: 2.501

üëâ IF Sentiment=Positive THEN Reviewer_Type=Frequent AND Rating=High AND Verified=Verified
   - Support: 0.087
   - Confidence: 0.218
   - Lift: 2.501

üëâ IF Sentiment=Positive THEN Verified=Not Verified AND Reviewer_Type=Frequent AND Rating=High
   - Support: 0.095
   - Confidence: 0.238
   - Lift: 2.501

üëâ IF Sentiment=Positive THEN Reviewer_Type=Infrequent AND Rating=High
   - Support: 0.217
   - Confidence: 0.543
   - Lift: 2.501

üëâ IF Sentiment=Positive THEN Review_Length=Short AND Rating=High
   - Support: 0.141
   - Confidence: 0.352
   - Lift: 2.501

üëâ IF Sentiment=Positive THEN Review_Length=Long AND Rating=High
   - Support: 0.123
   - Confidence

In [73]:
# Step 1: Create subsets for each reviewer type
frequent_df = df_trans[df_trans['Reviewer_Type=Frequent'] == True]
infrequent_df = df_trans[df_trans['Reviewer_Type=Infrequent'] == True]

def compute_t_d_weight(rules_df, target_df, contrast_df, target_label):
    t_weights = []
    d_weights = []

    # Calculate total support of target class
    total_target = len(target_df)

    for _, row in rules_df.iterrows():
        antecedent_items = list(row['antecedents'])

        # Find rows in which all items of antecedent are True
        antecedent_support_target = target_df[antecedent_items].all(axis=1).sum() / len(df_trans)
        antecedent_support_contrast = contrast_df[antecedent_items].all(axis=1).sum() / len(df_trans)

        t_weight = antecedent_support_target / (target_df['Reviewer_Type=' + target_label].sum() / len(df_trans))
        d_weight = antecedent_support_target - antecedent_support_contrast

        t_weights.append(round(t_weight, 3))
        d_weights.append(round(d_weight, 3))

    rules_df['T-Weight'] = t_weights
    rules_df['D-Weight'] = d_weights

    return rules_df

# Compute T-Weight and D-Weight for each group
frequent_reviewer_rules = compute_t_d_weight(frequent_reviewer_rules, frequent_df, infrequent_df, "Frequent")
infrequent_reviewer_rules = compute_t_d_weight(infrequent_reviewer_rules, infrequent_df, frequent_df, "Infrequent")


In [74]:
frequent_summary_df = frequent_reviewer_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'T-Weight', 'D-Weight']].copy()
frequent_summary_df['antecedents'] = frequent_summary_df['antecedents'].apply(lambda x: ' AND '.join(x))
frequent_summary_df['consequents'] = frequent_summary_df['consequents'].apply(lambda x: ' AND '.join(x))

infrequent_summary_df = infrequent_reviewer_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'T-Weight', 'D-Weight']].copy()
infrequent_summary_df['antecedents'] = infrequent_summary_df['antecedents'].apply(lambda x: ' AND '.join(x))
infrequent_summary_df['consequents'] = infrequent_summary_df['consequents'].apply(lambda x: ' AND '.join(x))


In [75]:
import pandas as pd

# === Step A: Function to calculate T-Weight and D-Weight ===
def add_t_d_weights(rules_df):
    rules_df = rules_df.copy()

    # Convert frozensets to strings for easier grouping
    rules_df['antecedents_str'] = rules_df['antecedents'].apply(lambda x: ' AND '.join(sorted(list(x))))
    rules_df['consequents_str'] = rules_df['consequents'].apply(lambda x: ' AND '.join(sorted(list(x))))

    # T-Weight: grouped by consequent
    t_weight_map = rules_df.groupby('consequents_str')['support'].transform('sum')
    rules_df['T-Weight'] = rules_df['support'] / t_weight_map

    # D-Weight: grouped by antecedent
    d_weight_map = rules_df.groupby('antecedents_str')['support'].transform('sum')
    rules_df['D-Weight'] = rules_df['support'] / d_weight_map

    return rules_df

# Apply weights to both groups
frequent_reviewer_rules = add_t_d_weights(frequent_reviewer_rules)
infrequent_reviewer_rules = add_t_d_weights(infrequent_reviewer_rules)

# === Step B: Display Rules with Weights ===
def display_rules_with_weights(rules_df, label):
    print(f"\nüìã {label}:\n")
    if not rules_df.empty:
        for _, row in rules_df.iterrows():
            antecedent = row['antecedents_str']
            consequent = row['consequents_str']
            print(f"üëâ IF {antecedent} THEN {consequent}")
            print(f"Support: {row['support']:.5f}")  # Adjust precision as necessary, or just use {row['support']}
            print(f"Confidence: {row['confidence']}")
            print(f"Lift: {row['lift']}")
            print(f"T-Weight: {row['T-Weight']}")
            print(f"D-Weight: {row['D-Weight']}\n")

    else:
        print("No rules found.")

# Display rules with weights
display_rules_with_weights(frequent_reviewer_rules, "Reviewer Type = Frequent (with T & D Weights)")
display_rules_with_weights(infrequent_reviewer_rules, "Reviewer Type = Infrequent (with T & D Weights)")

# === Step C: Exporting to CSV with weights ===
frequent_reviewer_rules.to_csv('reviewer_type_frequent_rules_with_weights.csv', index=False)
infrequent_reviewer_rules.to_csv('reviewer_type_infrequent_rules_with_weights.csv', index=False)

print("\n‚úÖ Rules with T-Weight and D-Weight saved to CSV.")



üìã Reviewer Type = Frequent (with T & D Weights):

üëâ IF Reviewer_Type=Frequent THEN Rating=High AND Sentiment=Positive AND Verified=Not Verified
Support: 0.09524
Confidence: 0.23703703703703702
Lift: 1.1828382838283826
T-Weight: 1.0
D-Weight: 0.030389363722697058

üëâ IF Reviewer_Type=Frequent THEN Sentiment=Positive AND Verified=Not Verified
Support: 0.09524
Confidence: 0.23703703703703702
Lift: 1.1828382838283826
T-Weight: 1.0
D-Weight: 0.030389363722697058

üëâ IF Reviewer_Type=Frequent THEN Rating=High AND Verified=Not Verified
Support: 0.09524
Confidence: 0.23703703703703702
Lift: 1.1828382838283826
T-Weight: 1.0
D-Weight: 0.030389363722697058

üëâ IF Reviewer_Type=Frequent THEN Helpfulness=None AND Verified=Not Verified
Support: 0.09226
Confidence: 0.22962962962962963
Lift: 1.169023569023569
T-Weight: 1.0
D-Weight: 0.029439696106362778

üëâ IF Reviewer_Type=Frequent THEN Sentiment=Positive
Support: 0.18254
Confidence: 0.45432098765432094
Lift: 1.1363661428177556
T-Weigh

In [76]:
import pandas as pd

# === Step A: Function to calculate T-Weight and D-Weight ===
def compute_t_d_weight(rules_df, target_df, contrast_df, target_label):
    t_weights = []
    d_weights = []

    # Calculate total support of target class
    total_target = len(target_df)

    for _, row in rules_df.iterrows():
        antecedent_items = list(row['antecedents'])

        # Find rows in which all items of antecedent are True
        antecedent_support_target = target_df[antecedent_items].all(axis=1).sum() / len(df_trans)
        antecedent_support_contrast = contrast_df[antecedent_items].all(axis=1).sum() / len(df_trans)

        t_weight = antecedent_support_target / (target_df['Reviewer_Type=' + target_label].sum() / len(df_trans))
        d_weight = antecedent_support_target - antecedent_support_contrast

        t_weights.append(round(t_weight, 3))
        d_weights.append(round(d_weight, 3))

    rules_df['T-Weight'] = t_weights
    rules_df['D-Weight'] = d_weights

    return rules_df

# === Step B: Add T and D weights based on antecedent and consequent ===
def add_t_d_weights(rules_df):
    rules_df = rules_df.copy()

    # Convert frozensets to strings for easier grouping
    rules_df['antecedents_str'] = rules_df['antecedents'].apply(lambda x: ' AND '.join(sorted(list(x))))
    rules_df['consequents_str'] = rules_df['consequents'].apply(lambda x: ' AND '.join(sorted(list(x))))

    # T-Weight: grouped by consequent
    t_weight_map = rules_df.groupby('consequents_str')['support'].transform('sum')
    rules_df['T-Weight'] = rules_df['support'] / t_weight_map

    # D-Weight: grouped by antecedent
    d_weight_map = rules_df.groupby('antecedents_str')['support'].transform('sum')
    rules_df['D-Weight'] = rules_df['support'] / d_weight_map

    return rules_df

# === Step C: Display Rules with Weights ===
def display_rules_with_weights(rules_df, label):
    print(f"\nüìã {label}:\n")
    if not rules_df.empty:
        for _, row in rules_df.iterrows():
            antecedent = row['antecedents_str']
            consequent = row['consequents_str']
            print(f"üëâ IF {antecedent} THEN {consequent}")
            print(f"   - Support: {round(row['support'], 3)}")
            print(f"   - Confidence: {round(row['confidence'], 3)}")
            print(f"   - Lift: {round(row['lift'], 3)}")
            print(f"   - T-Weight: {round(row['T-Weight'], 3)}")
            print(f"   - D-Weight: {round(row['D-Weight'], 3)}\n")
    else:
        print("No rules found.")

# === Step D: Exporting to CSV with weights ===
def export_to_csv(frequent_reviewer_rules, infrequent_reviewer_rules):
    frequent_reviewer_rules.to_csv('reviewer_type_frequent_rules_with_weights.csv', index=False)
    infrequent_reviewer_rules.to_csv('reviewer_type_infrequent_rules_with_weights.csv', index=False)
    print("\n‚úÖ Rules with T-Weight and D-Weight saved to CSV.")

# === Step E: Process the data ===
# Step 1: Create subsets for each reviewer type
frequent_df = df_trans[df_trans['Reviewer_Type=Frequent'] == True]
infrequent_df = df_trans[df_trans['Reviewer_Type=Infrequent'] == True]

# Apply the T and D weights functions to both groups
frequent_reviewer_rules = add_t_d_weights(frequent_reviewer_rules)
infrequent_reviewer_rules = add_t_d_weights(infrequent_reviewer_rules)

# Display the results with weights
display_rules_with_weights(frequent_reviewer_rules, "Reviewer Type = Frequent (with T & D Weights)")
display_rules_with_weights(infrequent_reviewer_rules, "Reviewer Type = Infrequent (with T & D Weights)")

# Save the output
export_to_csv(frequent_reviewer_rules, infrequent_reviewer_rules)



üìã Reviewer Type = Frequent (with T & D Weights):

üëâ IF Reviewer_Type=Frequent THEN Rating=High AND Sentiment=Positive AND Verified=Not Verified
   - Support: 0.095
   - Confidence: 0.237
   - Lift: 1.183
   - T-Weight: 1.0
   - D-Weight: 0.03

üëâ IF Reviewer_Type=Frequent THEN Sentiment=Positive AND Verified=Not Verified
   - Support: 0.095
   - Confidence: 0.237
   - Lift: 1.183
   - T-Weight: 1.0
   - D-Weight: 0.03

üëâ IF Reviewer_Type=Frequent THEN Rating=High AND Verified=Not Verified
   - Support: 0.095
   - Confidence: 0.237
   - Lift: 1.183
   - T-Weight: 1.0
   - D-Weight: 0.03

üëâ IF Reviewer_Type=Frequent THEN Helpfulness=None AND Verified=Not Verified
   - Support: 0.092
   - Confidence: 0.23
   - Lift: 1.169
   - T-Weight: 1.0
   - D-Weight: 0.029

üëâ IF Reviewer_Type=Frequent THEN Sentiment=Positive
   - Support: 0.183
   - Confidence: 0.454
   - Lift: 1.136
   - T-Weight: 1.0
   - D-Weight: 0.058

üëâ IF Reviewer_Type=Frequent THEN Rating=High
   - Suppor

In [77]:
from tabulate import tabulate

# === Step A: Function to calculate T-Weight and D-Weight ===
def add_t_d_weights(rules_df):
    rules_df = rules_df.copy()

    # Convert frozensets to strings for easier grouping
    rules_df['antecedents_str'] = rules_df['antecedents'].apply(lambda x: ' AND '.join(sorted(list(x))))
    rules_df['consequents_str'] = rules_df['consequents'].apply(lambda x: ' AND '.join(sorted(list(x))))

    # T-Weight: grouped by consequent
    t_weight_map = rules_df.groupby('consequents_str')['support'].transform('sum')
    rules_df['T-Weight'] = rules_df['support'] / t_weight_map

    # D-Weight: grouped by antecedent
    d_weight_map = rules_df.groupby('antecedents_str')['support'].transform('sum')
    rules_df['D-Weight'] = rules_df['support'] / d_weight_map

    # Adding Percentage Columns for T-Weight and D-Weight
    total_support = rules_df['support'].sum()
    rules_df['T-Weight (%)'] = (rules_df['T-Weight'] / total_support) * 100
    rules_df['D-Weight (%)'] = (rules_df['D-Weight'] / total_support) * 100

    return rules_df

# Apply weights to both groups
frequent_reviewer_rules = add_t_d_weights(frequent_reviewer_rules)
infrequent_reviewer_rules = add_t_d_weights(infrequent_reviewer_rules)

# === Step B: Tabulate Rules with Weights ===
def tabulate_rules_with_weights(rules_df, label):
    print(f"\nüìã {label}:\n")
    if not rules_df.empty:
        # Prepare table headers
        headers = ["Antecedents", "Consequents", "Support", "Confidence", "Lift", "T-Weight (%)", "D-Weight (%)"]
        
        # Prepare the rows for the table
        table_data = []
        for _, row in rules_df.iterrows():
            antecedent = row['antecedents_str']
            consequent = row['consequents_str']
            table_data.append([antecedent, consequent, round(row['support'], 3), 
                               round(row['confidence'], 3), round(row['lift'], 3),
                               round(row['T-Weight (%)'], 2), round(row['D-Weight (%)'], 2)])
        
        # Display the table
        print(tabulate(table_data, headers=headers, tablefmt="grid"))
    else:
        print("No rules found.")

# Display rules with weights in tabular format
tabulate_rules_with_weights(frequent_reviewer_rules, "Reviewer Type = Frequent (with T & D Weights)")
tabulate_rules_with_weights(infrequent_reviewer_rules, "Reviewer Type = Infrequent (with T & D Weights)")

# === Step C: Exporting to CSV with weights ===
frequent_reviewer_rules.to_csv('reviewer_type_frequent_rules_with_weights.csv', index=False)
infrequent_reviewer_rules.to_csv('reviewer_type_infrequent_rules_with_weights.csv', index=False)

print("\n‚úÖ Rules with T-Weight and D-Weight saved to CSV.")



üìã Reviewer Type = Frequent (with T & D Weights):

+------------------------+--------------------------------------------------------------+-----------+--------------+--------+----------------+----------------+
| Antecedents            | Consequents                                                  |   Support |   Confidence |   Lift |   T-Weight (%) |   D-Weight (%) |
| Reviewer_Type=Frequent | Rating=High AND Sentiment=Positive AND Verified=Not Verified |     0.095 |        0.237 |  1.183 |          31.91 |           0.97 |
+------------------------+--------------------------------------------------------------+-----------+--------------+--------+----------------+----------------+
| Reviewer_Type=Frequent | Sentiment=Positive AND Verified=Not Verified                 |     0.095 |        0.237 |  1.183 |          31.91 |           0.97 |
+------------------------+--------------------------------------------------------------+-----------+--------------+--------+----------------+----

In [78]:
print(df_trans.columns)


Index(['Fashion_Category=Accessories', 'Fashion_Category=Activewear',
       'Fashion_Category=Bottoms', 'Fashion_Category=Dresses',
       'Fashion_Category=Ethnic Wear', 'Fashion_Category=Footwear',
       'Fashion_Category=Kidswear', 'Fashion_Category=Lingerie & Sleepwear',
       'Fashion_Category=Menswear', 'Fashion_Category=Outerwear',
       'Fashion_Category=Tops & Tees', 'Fashion_Category=Unknown',
       'Fashion_Category=Womenswear', 'Helpfulness=Less Helpful',
       'Helpfulness=More Helpful', 'Helpfulness=None', 'Rating=High',
       'Rating=Low', 'Rating=Medium', 'Review_Length=Long',
       'Review_Length=Medium', 'Review_Length=Short', 'Reviewer_Type=Frequent',
       'Reviewer_Type=Infrequent', 'Sentiment=Negative', 'Sentiment=Neutral',
       'Sentiment=Positive', 'Verified=Not Verified', 'Verified=Verified'],
      dtype='object')


In [80]:
def calculate_weights(rules_df):
    rules_df = rules_df.copy()
    rules_df['antecedents_str'] = rules_df['antecedents'].apply(lambda x: ' AND '.join(sorted(x)))
    rules_df['consequents_str'] = rules_df['consequents'].apply(lambda x: ' AND '.join(sorted(x)))

    t_total = rules_df.groupby('consequents_str')['support'].transform('sum')
    d_total = rules_df.groupby('antecedents_str')['support'].transform('sum')

    rules_df['T-Weight'] = rules_df['support'] / t_total
    rules_df['D-Weight'] = rules_df['support'] / d_total

    return rules_df


In [89]:
import pandas as pd

# === Step A: Function to calculate T-Weight and D-Weight ===
def add_t_d_weights(rules_df):
    rules_df = rules_df.copy()

    # Convert frozensets to strings for grouping
    rules_df['antecedents_str'] = rules_df['antecedents'].apply(lambda x: ' AND '.join(sorted(list(x))))
    rules_df['consequents_str'] = rules_df['consequents'].apply(lambda x: ' AND '.join(sorted(list(x))))

    # T-Weight: support of the rule / total support for that consequent group
    rules_df['T-Weight (%)'] = rules_df.groupby('consequents_str')['support'].transform(
        lambda x: 100 * x / x.sum()
    )

    # D-Weight: support of the rule / total support for that antecedent group
    rules_df['D-Weight (%)'] = rules_df.groupby('antecedents_str')['support'].transform(
        lambda x: 100 * x / x.sum()
    )

    return rules_df

# Apply weights to both groups
frequent_reviewer_rules = add_t_d_weights(frequent_reviewer_rules)
infrequent_reviewer_rules = add_t_d_weights(infrequent_reviewer_rules)

# === Step B: Display Rules with Weights ===
def display_rules_with_weights(rules_df, label):
    print(f"\nüìã {label}:\n")
    if not rules_df.empty:
        for _, row in rules_df.iterrows():
            antecedent = row['antecedents_str']
            consequent = row['consequents_str']
            print(f"üëâ IF {antecedent} THEN {consequent}")
            print(f"   - Support: {round(row['support'], 3)}")
            print(f"   - Confidence: {round(row['confidence'], 3)}")
            print(f"   - Lift: {round(row['lift'], 3)}")
            print(f"   - T-Weight:  {round(row['T-Weight (%)'], 2)}%")
            print(f"   - D-Weight:  {round(row['D-Weight (%)'], 2)}%\n")
    else:
        print("No rules found.")

# Display rules with weights
display_rules_with_weights(frequent_reviewer_rules, "Reviewer Type = Frequent (with T & D Weights)")
display_rules_with_weights(infrequent_reviewer_rules, "Reviewer Type = Infrequent (with T & D Weights)")

# === Step C: Exporting to CSV with weights ===
frequent_reviewer_rules.to_csv('reviewer_type_frequent_rules_with_weights.csv', index=False)
infrequent_reviewer_rules.to_csv('reviewer_type_infrequent_rules_with_weights.csv', index=False)

print("\n‚úÖ Rules with T-Weight and D-Weight saved to CSV.")


üìã Reviewer Type = Frequent (with T & D Weights):

üëâ IF Reviewer_Type=Frequent THEN Rating=High AND Sentiment=Positive AND Verified=Not Verified
   - Support: 0.095
   - Confidence: 0.237
   - Lift: 1.183
   - T-Weight:  100.0%
   - D-Weight:  3.04%

üëâ IF Reviewer_Type=Frequent THEN Sentiment=Positive AND Verified=Not Verified
   - Support: 0.095
   - Confidence: 0.237
   - Lift: 1.183
   - T-Weight:  100.0%
   - D-Weight:  3.04%

üëâ IF Reviewer_Type=Frequent THEN Rating=High AND Verified=Not Verified
   - Support: 0.095
   - Confidence: 0.237
   - Lift: 1.183
   - T-Weight:  100.0%
   - D-Weight:  3.04%

üëâ IF Reviewer_Type=Frequent THEN Helpfulness=None AND Verified=Not Verified
   - Support: 0.092
   - Confidence: 0.23
   - Lift: 1.169
   - T-Weight:  100.0%
   - D-Weight:  2.94%

üëâ IF Reviewer_Type=Frequent THEN Sentiment=Positive
   - Support: 0.183
   - Confidence: 0.454
   - Lift: 1.136
   - T-Weight:  100.0%
   - D-Weight:  5.82%

üëâ IF Reviewer_Type=Frequent 

In [88]:
import pandas as pd

# ---------------------------
# Helper function to calculate T and D weights
# ---------------------------
def calculate_weights_percentage(rules_df, category_column):
    def compute_t_weight(row):
        return row['support'] * 100 if category_column in str(row['antecedents']) else 0

    def compute_d_weight(row, max_lift):
        return (row['lift'] / max_lift) * 100 if category_column in str(row['antecedents']) else 0

    # Compute T Weight normally
    rules_df['T Weight (%)'] = rules_df.apply(compute_t_weight, axis=1)
    
    # Normalize D Weight using the maximum lift
    max_lift = rules_df[rules_df['antecedents'].astype(str).str.contains(category_column)]['lift'].max()
    rules_df['D Weight (%)'] = rules_df.apply(lambda row: compute_d_weight(row, max_lift), axis=1)

    return rules_df


# ---------------------------
# Main loop to process each fashion category
# ---------------------------
for category in fashion_categories:
    cat_tag = f"Fashion_Category={category}"

    if cat_tag in df_trans.columns:
        # Strict filter: only that category alone in the antecedent
        cat_rules = rules[rules['antecedents'].apply(
            lambda x: cat_tag in str(x) and len(x) == 1
        )].sort_values(by='lift', ascending=False)

        print(f"\nüìÇ Rules for '{cat_tag}':")
        if not cat_rules.empty:
            # ‚úÖ Add T/D Weights as percentage
            cat_rules = calculate_weights_percentage(cat_rules, cat_tag)

            for _, row in cat_rules.iterrows():
                antecedent = ' AND '.join(list(row['antecedents']))
                consequent = ' AND '.join(list(row['consequents']))
                print(f"üëâ IF {antecedent} THEN {consequent}")
                print(f"   - Support: {round(row['support'], 3)}")
                print(f"   - Confidence: {round(row['confidence'], 3)}")
                print(f"   - Lift: {round(row['lift'], 3)}")
                print(f"   - T Weight (Typicality): {row['T Weight (%)']:.2f}%")
                print(f"   - D Weight (Discriminability): {row['D Weight (%)']:.2f}%")

            # ‚úÖ Save summary to CSV
            summary_df = cat_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift',
                                    'T Weight (%)', 'D Weight (%)']].copy()
            summary_df['antecedents'] = summary_df['antecedents'].apply(lambda x: ' AND '.join(x))
            summary_df['consequents'] = summary_df['consequents'].apply(lambda x: ' AND '.join(x))
            summary_df.to_csv(f"fashion_category_rules/{category}_category_rules_summary.csv", index=False)
        else:
            print(f"‚ùå No rules generated for category '{category}'.")
    else:
        print(f"‚ùå Column '{cat_tag}' not found in the DataFrame.")



üìÇ Rules for 'Fashion_Category=Accessories':
üëâ IF Fashion_Category=Accessories THEN Reviewer_Type=Frequent AND Verified=Verified
   - Support: 0.023
   - Confidence: 0.235
   - Lift: 1.201
   - T Weight (Typicality): 2.28%
   - D Weight (Discriminability): 100.00%
üëâ IF Fashion_Category=Accessories THEN Reviewer_Type=Frequent
   - Support: 0.045
   - Confidence: 0.459
   - Lift: 1.143
   - T Weight (Typicality): 4.46%
   - D Weight (Discriminability): 95.17%
üëâ IF Fashion_Category=Accessories THEN Sentiment=Negative
   - Support: 0.033
   - Confidence: 0.337
   - Lift: 1.109
   - T Weight (Typicality): 3.27%
   - D Weight (Discriminability): 92.37%
üëâ IF Fashion_Category=Accessories THEN Rating=Low
   - Support: 0.033
   - Confidence: 0.337
   - Lift: 1.109
   - T Weight (Typicality): 3.27%
   - D Weight (Discriminability): 92.37%
üëâ IF Fashion_Category=Accessories THEN Rating=Low AND Sentiment=Negative
   - Support: 0.033
   - Confidence: 0.337
   - Lift: 1.109
   - T We