<a href="https://colab.research.google.com/github/JeyScientist/Artificial-Intelligence/blob/main/Association.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only helper: opens a browser file picker so the raw CSV can be
# uploaded into the notebook's working directory.
from google.colab import files
# `uploaded` holds whatever files.upload() returns for the chosen files.
uploaded = files.upload()

In [None]:
import pandas as pd

# Step 1: Read the data (semicolon-separated values)
df = pd.read_csv('single.csv', sep=';')

# Step 2: Clean column names to handle spaces or special characters
df.columns = df.columns.str.strip()  # Remove leading/trailing spaces
df.columns = df.columns.str.replace(r'[^a-zA-Z0-9_]', '', regex=True)  # Remove special characters

# Step 3: Print the cleaned column names to check if 'Country' exists
print("Cleaned Column Names:")
print(df.columns)

# Step 4: Handle "wrongly coded" entries in 'Itemname'
df['Itemname'] = df['Itemname'].replace(r'wrongly coded.*', 'Unknown Item', regex=True)

# Step 5: Drop columns with completely empty values
df = df.dropna(axis=1, how='all')

# Step 6: Remove the placeholder columns pandas names 'Unnamed...' when a
# header cell is blank
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Step 7: Clean 'Price' column: Replace decimal commas with dots and convert to numeric
df['Price'] = df['Price'].replace(',', '.', regex=True)
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')  # Unparseable values become NaN

# Step 8: Clean 'Quantity' column: Convert to numeric
df['Quantity'] = pd.to_numeric(df['Quantity'], errors='coerce')

# Step 9: Convert 'Date' column to datetime format.
# NOTE(review): the file uses European conventions (';' separator, decimal
# commas), so dates are assumed to be day-first. Without dayfirst=True an
# ambiguous value such as 01.12.2010 is read as 12 January instead of
# 1 December - confirm against the raw file.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')

# Step 10: Handle missing values in 'CustomerID' and 'Country'
# Ensure 'CustomerID' and 'Country' are available in cleaned columns
if 'CustomerID' in df.columns:
    df['CustomerID'] = df['CustomerID'].fillna('Unknown')  # Fill missing CustomerID with 'Unknown'
else:
    print("'CustomerID' column is missing!")

if 'Country' in df.columns:
    # The last field of each row carries stray trailing commas
    # (e.g. 'United Kingdom,,'); strip them so equal countries compare equal.
    df['Country'] = df['Country'].str.rstrip(',').str.strip()
    df['Country'] = df['Country'].fillna('Unknown')  # Fill missing Country with 'Unknown'
else:
    print("'Country' column is missing!")

# Step 11: Show a snippet of the cleaned dataframe
print("Cleaned DataFrame:")
print(df.head())

# Step 12: Check for any remaining "Unknown Item" entries (i.e., wrongly coded items)
print("\nRows with 'Unknown Item':")
print(df[df['Itemname'].str.contains('Unknown Item', na=False)])

# Optional: Save the cleaned DataFrame to a new CSV file
df.to_csv('cleaned_data.csv', index=False)

Cleaned Column Names:
Index(['BillNo', 'Itemname', 'Quantity', 'Date', 'Price', 'CustomerID',
       'Country'],
      dtype='object')
Cleaned DataFrame:
   BillNo                             Itemname  Quantity                Date  \
0  536365   WHITE HANGING HEART T-LIGHT HOLDER       6.0 2010-01-12 08:26:00   
1  536365                  WHITE METAL LANTERN       6.0 2010-01-12 08:26:00   
2  536365       CREAM CUPID HEARTS COAT HANGER       8.0 2010-01-12 08:26:00   
3  536365  KNITTED UNION FLAG HOT WATER BOTTLE       6.0 2010-01-12 08:26:00   
4  536365       RED WOOLLY HOTTIE WHITE HEART.       6.0 2010-01-12 08:26:00   

   Price CustomerID           Country  
0   2.55    17850.0  United Kingdom,,  
1   3.39    17850.0  United Kingdom,,  
2   2.75    17850.0  United Kingdom,,  
3   3.39    17850.0  United Kingdom,,  
4   3.39    17850.0  United Kingdom,,  

Rows with 'Unknown Item':
        BillNo      Itemname  Quantity                Date  Price CustomerID  \
366291  569830  Un

In [None]:
import pandas as pd

def _has_unobserved_combination(df, col1, col2):
    """Return True when some (col1 value, col2 value) pair never co-occurs.

    Gives the same answer as checking a ``pd.crosstab`` for zero cells, but
    only counts distinct observed pairs instead of materialising the full
    contingency table, which is huge for high-cardinality columns.
    """
    pairs = df[[col1, col2]].dropna()
    observed = len(pairs.drop_duplicates())
    possible = pairs[col1].nunique() * pairs[col2].nunique()
    return observed < possible


def analyze_data_for_association_rules(df):
    """Suggest association-rule mining styles that fit the columns of ``df``.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        dict with the keys ``rows``, ``columns``, ``categorical_features``,
        ``numerical_features``, ``recommended_rule_types`` (a dict holding the
        parallel lists ``types`` and ``explanations``),
        ``boolean_like_columns`` and ``negative_rule_candidates`` (a list of
        column-name pairs).
    """
    suggestions = {}

    # Basic stats
    num_rows, num_cols = df.shape
    suggestions['rows'] = num_rows
    suggestions['columns'] = num_cols

    # Detect column types. 'number' covers every numeric width, not just
    # int64/float64, so e.g. int32 columns are no longer silently ignored.
    categorical_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    numerical_cols = df.select_dtypes(include='number').columns.tolist()

    suggestions['categorical_features'] = categorical_cols
    suggestions['numerical_features'] = numerical_cols

    # Determine dimensions and rule types
    rule_types = []
    explanations = []

    if num_cols == 1 and categorical_cols:
        rule_types.append("Single-dimensional Association Rules")
        explanations.append("Only one categorical column detected. Rules can only be mined from within that column (e.g., frequent values in 'Itemname').")

    if len(categorical_cols) > 1:
        rule_types.append("Multi-dimensional Association Rules")
        explanations.append("Multiple categorical columns found. Enables rules across dimensions (e.g., 'If Country is France and Customer is X, then likely buys Y').")

    if numerical_cols:
        rule_types.append("Quantitative Association Rules")
        explanations.append("Numerical features detected. Enables mining based on numerical ranges (e.g., 'If Quantity > 10, then Price > 20').")

    # Detect boolean-like columns: every non-missing value is 0/1/True/False.
    boolean_like_cols = []
    for col in categorical_cols + numerical_cols:
        unique_vals = df[col].dropna().unique()
        # A column with no values at all is not boolean; the empty set would
        # otherwise pass the subset test trivially.
        if len(unique_vals) > 0 and set(unique_vals).issubset({0, 1, True, False}):
            boolean_like_cols.append(col)

    if boolean_like_cols:
        rule_types.append("Boolean Association Rules")
        explanations.append(f"Boolean-like columns found: {boolean_like_cols}. You can analyze co-occurrence or absence-based rules.")

    # Check for negative association candidates: categorical column pairs
    # with at least one value combination that never occurs.
    negative_rule_candidates = [
        (col1, col2)
        for i, col1 in enumerate(categorical_cols)
        for col2 in categorical_cols[i + 1:]
        if _has_unobserved_combination(df, col1, col2)
    ]

    if negative_rule_candidates:
        rule_types.append("Potential Negative Association Rules")
        explanations.append("Certain column pairs never or rarely occur together. May suggest exclusivity or negative relationships.")

    # Build suggestions with detail
    suggestions['recommended_rule_types'] = {
        "types": rule_types,
        "explanations": explanations
    }
    suggestions['boolean_like_columns'] = boolean_like_cols
    suggestions['negative_rule_candidates'] = negative_rule_candidates

    return suggestions


# Analyse the dataframe `df` and keep the resulting suggestion dict
results = analyze_data_for_association_rules(df)

# Dataset overview, emitted as one multi-line print
print(
    "📊 DATASET SUMMARY",
    f"- Rows: {results['rows']}",
    f"- Columns: {results['columns']}",
    f"- Categorical Features ({len(results['categorical_features'])}): {results['categorical_features']}",
    f"- Numerical Features ({len(results['numerical_features'])}): {results['numerical_features']}\n",
    sep="\n",
)

# One line per recommended rule type, paired with its explanation
print("🧠 RECOMMENDED ASSOCIATION RULE TYPES")
recommended = results['recommended_rule_types']
for rule, explanation in zip(recommended['types'], recommended['explanations']):
    print(f"✅ {rule}: {explanation}")

# Optional sections are printed only when they have content
boolean_like = results['boolean_like_columns']
if boolean_like:
    print(f"\n📌 Boolean-like Columns: {boolean_like}")

negative_pairs = results['negative_rule_candidates']
if negative_pairs:
    print("\n⚠️ Potential Negative Associations Found Between:")
    for first_col, second_col in negative_pairs:
        print(f"   - {first_col} ⛔ {second_col}")

📊 DATASET SUMMARY
- Rows: 522064
- Columns: 7
- Categorical Features (4): ['BillNo', 'Itemname', 'CustomerID', 'Country']
- Numerical Features (2): ['Quantity', 'Price']

🧠 RECOMMENDED ASSOCIATION RULE TYPES
✅ Multi-dimensional Association Rules: Multiple categorical columns found. Enables rules across dimensions (e.g., 'If Country is France and Customer is X, then likely buys Y').
✅ Quantitative Association Rules: Numerical features detected. Enables mining based on numerical ranges (e.g., 'If Quantity > 10, then Price > 20').
✅ Potential Negative Association Rules: Certain column pairs never or rarely occur together. May suggest exclusivity or negative relationships.

⚠️ Potential Negative Associations Found Between:
   - BillNo ⛔ Itemname
   - BillNo ⛔ CustomerID
   - BillNo ⛔ Country
   - Itemname ⛔ CustomerID
   - Itemname ⛔ Country
   - CustomerID ⛔ Country


In [None]:
!pip install mlxtend matplotlib seaborn

In [None]:
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules
from pyECLAT import ECLAT
from itertools import product
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from colorama import Fore, Style, init

# Reset terminal colours automatically after each coloured print
init(autoreset=True)

# ------------------------ Sample Transactions
# Each inner list is one basket of purchased items.
transactions = [
    ['milk', 'bread', 'eggs'],
    ['milk', 'bread'],
    ['milk', 'cookies'],
    ['bread', 'butter'],
    ['milk', 'bread', 'butter', 'eggs'],
    ['cookies', 'soda'],
    ['milk', 'soda'],
]

# ------------------------ Encode Transactions
# Learn the item vocabulary, then one-hot encode every basket
# (one row per transaction, one column per item).
te = TransactionEncoder()
df = pd.DataFrame(
    te.fit(transactions).transform(transactions),
    columns=te.columns_,
)

# ------------------------ Grid Search Parameters
support_range = [0.2, 0.3, 0.4]
confidence_range = [0.5, 0.6, 0.7]
grid_results = []

# ------------------------ Apriori & FP-Growth Grid Search
# Each (algorithm, support, confidence) combination is timed and scored as
# rule count * mean lift * mean confidence.
for algo_name, algo_func in [('Apriori', apriori), ('FP-Growth', fpgrowth)]:
    for support, confidence in product(support_range, confidence_range):
        # perf_counter is monotonic and high-resolution, unlike the wall clock
        start = time.perf_counter()
        frequent_items = algo_func(df, min_support=support, use_colnames=True)
        if frequent_items.empty:
            # association_rules raises ValueError on an empty itemset frame;
            # treat "nothing frequent" as "no rules" instead of crashing.
            rules = pd.DataFrame(columns=['lift', 'confidence'])
        else:
            rules = association_rules(frequent_items, metric="confidence", min_threshold=confidence)
        elapsed = time.perf_counter() - start
        if not rules.empty:
            avg_lift = rules['lift'].mean()
            avg_conf = rules['confidence'].mean()
        else:
            avg_lift = avg_conf = 0
        score = len(rules) * avg_lift * avg_conf
        grid_results.append({
            'Algorithm': algo_name,
            'Min Support': support,
            'Min Confidence': confidence,
            'Rules': len(rules),
            'Avg Lift': round(avg_lift, 3),
            'Avg Confidence': round(avg_conf, 3),
            'Time (s)': round(elapsed, 4),
            'Score': round(score, 3)
        })

# ------------------------ Summary DataFrame
# Collect the grid-search records and rank them by score, best first.
grid_df = pd.DataFrame(grid_results)
ranked_grid = grid_df.sort_values('Score', ascending=False)
print(Fore.CYAN + "\n📊 Top 5 Rule-Generating Configurations:")
print(ranked_grid.head(5))

# ------------------------ Select Best Params
# The first ranked row supplies the thresholds used for the final rule mining.
best_params = ranked_grid.iloc[0]
best_support, best_conf = best_params['Min Support'], best_params['Min Confidence']

print(Fore.GREEN + "\n✅ Best Parameter Combination Selected:")
print(
    f"🔹 Algorithm: {best_params['Algorithm']}",
    f"🔹 Min Support: {best_support}",
    f"🔹 Min Confidence: {best_conf}",
    f"🔹 Rules Found: {int(best_params['Rules'])}",
    f"🔹 Avg Lift: {round(best_params['Avg Lift'], 3)}",
    f"🔹 Avg Confidence: {round(best_params['Avg Confidence'], 3)}",
    sep="\n",
)

# ------------------------ Generate Rules
# Mine rules with both algorithms at the selected thresholds and tag each
# result with the algorithm that produced it.
rules_apriori = association_rules(
    apriori(df, min_support=best_support, use_colnames=True),
    metric="confidence",
    min_threshold=best_conf,
).assign(Algorithm='Apriori')

rules_fp = association_rules(
    fpgrowth(df, min_support=best_support, use_colnames=True),
    metric="confidence",
    min_threshold=best_conf,
).assign(Algorithm='FP-Growth')

# ------------------------ ECLAT
# pyECLAT expects one transaction per row with the items spread across the
# columns (short baskets padded with None), not a single column of lists.
eclat_model = ECLAT(data=pd.DataFrame(transactions), verbose=False)
eclat_separator = ' & '
# fit() returns two dicts (itemset -> row indexes, itemset -> support) whose
# keys are the item names joined by `separator`; only the supports are needed.
# min_combination=1 keeps single-item supports for the lift computation below.
_, rule_eclat = eclat_model.fit(
    min_support=best_support,
    min_combination=1,
    max_combination=max(len(basket) for basket in transactions),
    separator=eclat_separator,
    verbose=False,
)
# Re-key by frozenset so lookups do not depend on the order pyECLAT joined in.
eclat_support = {
    frozenset(key.split(eclat_separator)): value
    for key, value in rule_eclat.items()
}

eclat_rows = []
for itemset, itemset_support in eclat_support.items():
    if len(itemset) < 2:
        continue
    # Sort first so the antecedent/consequent split is reproducible; set
    # iteration order is not.
    *antecedent_items, consequent_item = sorted(itemset)
    antecedent = frozenset(antecedent_items)
    consequent = frozenset([consequent_item])
    antecedent_support = eclat_support.get(antecedent)
    consequent_support = eclat_support.get(consequent)
    # confidence = P(itemset) / P(antecedent); lift = confidence / P(consequent)
    confidence = itemset_support / antecedent_support if antecedent_support else np.nan
    lift = confidence / consequent_support if consequent_support else np.nan
    eclat_rows.append({
        'antecedents': antecedent,
        'consequents': consequent,
        'support': itemset_support,
        'confidence': confidence,
        'lift': lift,
        'Algorithm': 'Eclat',
    })

# Explicit columns keep the frame's shape stable even when no itemset qualifies.
# Eclat rows are not filtered by best_conf (the original did not filter either).
rules_eclat = pd.DataFrame(
    eclat_rows,
    columns=['antecedents', 'consequents', 'support', 'confidence', 'lift', 'Algorithm'],
)

# ------------------------ Combine Rules
# Stack the three rule tables into one frame with a fresh 0..n-1 index.
combined_rules = pd.concat((rules_apriori, rules_fp, rules_eclat), ignore_index=True)

# ------------------------ Low-Quality Rule Warning
# Missing lift/confidence counts as 0, so such rows are treated as weak.
weak_lift = combined_rules['lift'].fillna(0) < 1.0
weak_confidence = combined_rules['confidence'].fillna(0) < 0.5
low_quality = combined_rules[weak_lift | weak_confidence]
if len(low_quality) == 0:
    print(Fore.GREEN + f"\n✅ All {len(combined_rules)} rules are strong.")
else:
    print(Fore.YELLOW + f"\n⚠️ {len(low_quality)} low-quality rules found (lift < 1.0 or confidence < 0.5)")

# ------------------------ Show Top 5 Strongest Rules
print(Fore.MAGENTA + "\n🔍 Top 5 High-Lift Rules:")
# Only rows with no missing value in any column are ranked.
complete_rules = combined_rules.dropna()
top_rules = complete_rules.sort_values(by='lift', ascending=False).head(5)
for _, row in top_rules.iterrows():
    ant = set(row['antecedents'])
    con = set(row['consequents'])
    metrics = f"(support={round(row['support'], 2)}, conf={round(row['confidence'], 2)}, lift={round(row['lift'], 2)})"
    print(f"• If {ant} → then {con} " + metrics)

# ------------------------ Export CSVs
for export_frame, export_path in (
    (combined_rules, "association_rules.csv"),
    (grid_df, "grid_search_results.csv"),
):
    export_frame.to_csv(export_path, index=False)
print(Fore.CYAN + "\n📁 Exported: 'association_rules.csv' and 'grid_search_results.csv'")

# ------------------------ Visualization
# Scatter of support vs confidence; colour encodes the algorithm and marker
# size encodes lift. Rows missing support or confidence are left out.
plottable_rules = combined_rules.dropna(subset=['support', 'confidence'])

plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=plottable_rules,
    x='support',
    y='confidence',
    hue='Algorithm',
    size='lift',
    sizes=(20, 200),
    palette='Set2',
)
ax.set_title('📊 Association Rules: Support vs Confidence')
ax.set_xlabel('Support')
ax.set_ylabel('Confidence')
ax.grid(True)
plt.tight_layout()
plt.show()

# ------------------------ Item-Based Collaborative Filtering (Cosine Similarity)
def item_based_recommendation(item, user_item_matrix, cosine_sim, top_k=5):
    """Print the ``top_k`` items most similar to ``item``.

    Args:
        item: Column label of the query item in ``user_item_matrix``.
        user_item_matrix: DataFrame whose columns are the items.
        cosine_sim: Item-by-item similarity matrix, indexed in the same order
            as the columns of ``user_item_matrix``.
        top_k: Maximum number of similar items to print.

    Raises:
        KeyError: If ``item`` is not a column of ``user_item_matrix``.
    """
    item_idx = user_item_matrix.columns.get_loc(item)
    item_similarities = cosine_sim[item_idx]

    # Rank by similarity, then remove the query item by index. Skipping the
    # first ranked entry instead is wrong when another item ties at 1.0.
    ranked_idx = np.argsort(item_similarities)[::-1]
    similar_items_idx = ranked_idx[ranked_idx != item_idx][:top_k]

    print(Fore.BLUE + f"\n🎯 Recommendations for item: {item}")
    for idx in similar_items_idx:
        print(f"→ {user_item_matrix.columns[idx]} (cosine similarity={round(item_similarities[idx], 2)})")

# ------------------------ Prepare Data for Collaborative Filtering
# TransactionEncoder.transform returns a dense boolean NumPy array by default,
# which has no .toarray(); cast to int so column sums read as purchase counts.
user_item_matrix = pd.DataFrame(te.transform(transactions).astype(int), columns=te.columns_)
# Transpose so rows are items: the result is an item-by-item similarity matrix.
cosine_sim = cosine_similarity(user_item_matrix.T)

# ------------------------ Popularity-Based Recommendation
def popularity_based_recommendation(user_item_matrix, top_k=5):
    """Print the ``top_k`` items with the largest column totals.

    Args:
        user_item_matrix: DataFrame with one column per item; each column is
            summed to obtain that item's count.
        top_k: Number of leading items to print.
    """
    ranked_counts = user_item_matrix.sum(axis=0).sort_values(ascending=False)

    print(Fore.GREEN + "\n🎯 Popularity-Based Recommendations:")
    for name, total in ranked_counts.iloc[:top_k].items():
        print(f"→ {name} (purchased {total} times)")

# ------------------------ Hybrid Recommendation (Item-Based CF + Popularity)
def hybrid_recommendation(item, user_item_matrix, cosine_sim, top_k=5, alpha=0.7):
    """Print item-based and popularity-based recommendations for ``item``.

    Args:
        item: Column label of the query item in ``user_item_matrix``.
        user_item_matrix: DataFrame whose columns are the items.
        cosine_sim: Item-by-item similarity matrix, indexed in the same order
            as the columns of ``user_item_matrix``.
        top_k: Maximum number of entries printed per section.
        alpha: Currently unused; the two lists are printed side by side, not
            blended. Kept so existing callers keep working.

    Raises:
        KeyError: If ``item`` is not a column of ``user_item_matrix``.
    """
    # Item-based CF: rank by similarity and remove the query item by index,
    # which stays correct when another item ties at similarity 1.0.
    item_idx = user_item_matrix.columns.get_loc(item)
    item_similarities = cosine_sim[item_idx]
    ranked_idx = np.argsort(item_similarities)[::-1]
    similar_items_idx = ranked_idx[ranked_idx != item_idx][:top_k]

    # Popularity: column totals, largest first.
    item_popularity = user_item_matrix.sum(axis=0)
    popular_items = item_popularity.sort_values(ascending=False).head(top_k)

    print(Fore.MAGENTA + f"\n🎯 Hybrid Recommendations for item: {item}")
    print(Fore.YELLOW + f"\nItem-Based CF Recommendations:")
    for idx in similar_items_idx:
        # Show each item's own similarity (previously the first item's score
        # was printed on every line).
        print(f"→ {user_item_matrix.columns[idx]} (cosine similarity={round(item_similarities[idx], 2)})")

    print(Fore.GREEN + f"\nPopularity-Based Recommendations:")
    for popular_item, count in popular_items.items():
        print(f"→ {popular_item} (purchased {count} times)")

# ------------------------ Example Usage
# Run all three recommenders for the sample item 'milk'; each one prints its
# recommendations and returns nothing.
item_based_recommendation('milk', user_item_matrix, cosine_sim)
popularity_based_recommendation(user_item_matrix)
hybrid_recommendation('milk', user_item_matrix, cosine_sim)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules
from pyECLAT import ECLAT
from itertools import product
from collections import defaultdict
from colorama import Fore, init

# Reset terminal colours automatically after each coloured print
init(autoreset=True)

# ------------------------ Sample Transactions
# Each inner list is one basket of purchased items.
transactions = [
    ['milk', 'bread', 'eggs'],
    ['milk', 'bread'],
    ['milk', 'cookies'],
    ['bread', 'butter'],
    ['milk', 'bread', 'butter', 'eggs'],
    ['cookies', 'soda'],
    ['milk', 'soda'],
]

# ------------------------ Encode Transactions
# Learn the item vocabulary, then one-hot encode every basket
# (one row per transaction, one column per item).
te = TransactionEncoder()
df = pd.DataFrame(
    te.fit(transactions).transform(transactions),
    columns=te.columns_,
)

# ------------------------ Apriori, FP-Growth, ECLAT: Generate Association Rules
support_range = [0.2, 0.3, 0.4]
confidence_range = [0.5, 0.6, 0.7]
rules_eclat = []

# Apriori and FP-Growth Rule Generation
# Any rule that passes a stricter (support, confidence) pair also passes the
# loosest one, so mining once at the minima yields every rule exactly once.
# Looping over the whole grid and extending the lists stored the same rule
# many times over.
support, confidence = min(support_range), min(confidence_range)

# Apriori
frequent_items_apriori = apriori(df, min_support=support, use_colnames=True)
rules_apriori = association_rules(
    frequent_items_apriori, metric="confidence", min_threshold=confidence
).to_dict('records')

# FP-Growth
frequent_items_fp = fpgrowth(df, min_support=support, use_colnames=True)
rules_fp = association_rules(
    frequent_items_fp, metric="confidence", min_threshold=confidence
).to_dict('records')

# Eclat Rule Generation
# pyECLAT expects one transaction per row with the items spread across the
# columns (short baskets padded with None), not a single column of lists.
eclat_model = ECLAT(data=pd.DataFrame(transactions), verbose=False)
eclat_separator = ' & '
# fit() returns two dicts (itemset -> row indexes, itemset -> support) whose
# keys are the item names joined by `separator`; only the supports are needed.
# min_combination=1 keeps single-item supports for the lift computation below.
_, rule_eclat = eclat_model.fit(
    min_support=0.3,
    min_combination=1,
    max_combination=max(len(basket) for basket in transactions),
    separator=eclat_separator,
    verbose=False,
)
# Re-key by frozenset so lookups do not depend on the order pyECLAT joined in.
eclat_support = {
    frozenset(key.split(eclat_separator)): value
    for key, value in rule_eclat.items()
}

eclat_rows = []
for itemset, itemset_support in eclat_support.items():
    if len(itemset) < 2:
        continue
    # Sort first so the antecedent/consequent split is reproducible; set
    # iteration order is not.
    *antecedent_items, consequent_item = sorted(itemset)
    antecedent = frozenset(antecedent_items)
    consequent = frozenset([consequent_item])
    antecedent_support = eclat_support.get(antecedent)
    consequent_support = eclat_support.get(consequent)
    # confidence = P(itemset) / P(antecedent); lift = confidence / P(consequent)
    confidence = itemset_support / antecedent_support if antecedent_support else np.nan
    lift = confidence / consequent_support if consequent_support else np.nan
    eclat_rows.append({
        'itemsets': itemset,
        'support': itemset_support,
        'confidence': confidence,
        'lift': lift,
        'antecedents': antecedent,
        'consequents': consequent,
    })

# Explicit columns keep the frame's shape stable even when no itemset qualifies.
rules_eclat = pd.DataFrame(
    eclat_rows,
    columns=['itemsets', 'support', 'confidence', 'lift', 'antecedents', 'consequents'],
)

# ------------------------ Generate Association Rules DataFrame
# Apriori / FP-Growth rules arrive as lists of record dicts; Eclat is already
# a frame and only needs narrowing to the shared rule columns.
shared_rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
rules_apriori_df = pd.DataFrame(data=rules_apriori)
rules_fp_df = pd.DataFrame(data=rules_fp)
rules_eclat_df = rules_eclat[shared_rule_columns]

# Combine all rules into one DataFrame with a fresh 0..n-1 index
combined_rules = pd.concat(
    (rules_apriori_df, rules_fp_df, rules_eclat_df),
    ignore_index=True,
)

# ------------------------ Prepare Data for Collaborative Filtering
# TransactionEncoder.transform returns a dense boolean NumPy array by default,
# which has no .toarray(); cast to int so column sums read as purchase counts.
user_item_matrix = pd.DataFrame(te.transform(transactions).astype(int), columns=te.columns_)
# Transpose so rows are items: the result is an item-by-item similarity matrix.
cosine_sim = cosine_similarity(user_item_matrix.T)

# ------------------------ Item-Based Collaborative Filtering
def item_based_recommendation(item, user_item_matrix, cosine_sim, top_k=5):
    """Print the ``top_k`` items most similar to ``item``.

    Args:
        item: Column label of the query item in ``user_item_matrix``.
        user_item_matrix: DataFrame whose columns are the items.
        cosine_sim: Item-by-item similarity matrix, indexed in the same order
            as the columns of ``user_item_matrix``.
        top_k: Maximum number of similar items to print.

    Raises:
        KeyError: If ``item`` is not a column of ``user_item_matrix``.
    """
    item_idx = user_item_matrix.columns.get_loc(item)
    item_similarities = cosine_sim[item_idx]

    # Rank by similarity, then remove the query item by index. Skipping the
    # first ranked entry instead is wrong when another item ties at 1.0.
    ranked_idx = np.argsort(item_similarities)[::-1]
    similar_items_idx = ranked_idx[ranked_idx != item_idx][:top_k]

    print(Fore.BLUE + f"\n🎯 Item-Based Recommendations for: {item}")
    for idx in similar_items_idx:
        print(f"→ {user_item_matrix.columns[idx]} (cosine similarity={round(item_similarities[idx], 2)})")

# ------------------------ Popularity-Based Recommendation
def popularity_based_recommendation(user_item_matrix, top_k=5):
    """Print the ``top_k`` items with the largest column totals.

    Args:
        user_item_matrix: DataFrame with one column per item; each column is
            summed to obtain that item's count.
        top_k: Number of leading items to print.
    """
    ranked_counts = user_item_matrix.sum(axis=0).sort_values(ascending=False)

    print(Fore.GREEN + "\n🎯 Popularity-Based Recommendations:")
    for name, total in ranked_counts.iloc[:top_k].items():
        print(f"→ {name} (purchased {total} times)")

# ------------------------ Association Rule Recommendation
def association_based_recommendation(item, combined_rules, top_k=5):
    """Print items recommended for ``item`` by its strongest association rules.

    Args:
        item: Item that must appear in a rule's antecedent.
        combined_rules: DataFrame with ``antecedents`` and ``consequents``
            (collections of items) plus ``lift`` and ``confidence`` columns;
            missing metrics may be None/NaN.
        top_k: Number of distinct, highest-lift rules to draw from.
    """
    print(Fore.GREEN + f"\n🎯 Association-Based Recommendations for item: {item}")

    # Find rules where item is in the antecedent (left-hand side). astype(bool)
    # keeps the mask boolean even when the rule table is empty.
    mentions_item = combined_rules['antecedents'].apply(lambda antecedent: item in antecedent).astype(bool)
    item_rules = combined_rules[mentions_item]

    # Coerce metrics to floats so rules without lift/confidence (None) sort
    # last and can still be rounded when printed.
    item_rules = item_rules.assign(
        lift=pd.to_numeric(item_rules['lift'], errors='coerce'),
        confidence=pd.to_numeric(item_rules['confidence'], errors='coerce'),
    )

    # Strongest first; then keep one copy of each rule so duplicates from
    # several algorithms do not use up the top_k budget.
    item_rules = item_rules.sort_values(by='lift', ascending=False, kind='stable')
    item_rules = item_rules.drop_duplicates(subset=['antecedents', 'consequents'])

    # Record, per recommended item, the metrics of the strongest rule that
    # suggests it (previously every line showed the last rule's metrics).
    best_metrics = {}
    for _, rule in item_rules.head(top_k).iterrows():
        for rec_item in rule['consequents']:
            if rec_item != item and rec_item not in best_metrics:
                best_metrics[rec_item] = (rule['lift'], rule['confidence'])

    # Output recommendations, strongest rule first
    for rec_item, (lift, confidence) in best_metrics.items():
        print(f"→ {rec_item} (Lift={round(lift, 2)}, Confidence={round(confidence, 2)})")

# ------------------------ Hybrid Recommendation (Combination of CF + Association Rules)
def hybrid_recommendation(item, user_item_matrix, cosine_sim, combined_rules, top_k=5):
    """Print association-rule and item-based CF recommendations for ``item``.

    Args:
        item: Column label of the query item in ``user_item_matrix``.
        user_item_matrix: DataFrame whose columns are the items.
        cosine_sim: Item-by-item similarity matrix, indexed in the same order
            as the columns of ``user_item_matrix``.
        combined_rules: Rule table passed on to
            ``association_based_recommendation``.
        top_k: Maximum number of entries per section.

    Raises:
        KeyError: If ``item`` is not a column of ``user_item_matrix``.
    """
    print(Fore.MAGENTA + f"\n🎯 Hybrid Recommendations for: {item}")

    # Item-based CF: rank by similarity and remove the query item by index,
    # which stays correct when another item ties at similarity 1.0.
    item_idx = user_item_matrix.columns.get_loc(item)
    item_similarities = cosine_sim[item_idx]
    ranked_idx = np.argsort(item_similarities)[::-1]
    similar_items_idx = ranked_idx[ranked_idx != item_idx][:top_k]

    # Get association rule-based recommendations (printed by the helper)
    association_based_recommendation(item, combined_rules, top_k)

    print(Fore.YELLOW + f"\nItem-Based CF Recommendations:")
    for idx in similar_items_idx:
        # Show each item's own similarity (previously the first item's score
        # was printed on every line).
        print(f"→ {user_item_matrix.columns[idx]} (cosine similarity={round(item_similarities[idx], 2)})")

# ------------------------ Example Usage
# Run every recommender for the sample item 'milk'; each one prints its
# recommendations and returns nothing.
item_based_recommendation('milk', user_item_matrix, cosine_sim)
popularity_based_recommendation(user_item_matrix)
association_based_recommendation('milk', combined_rules, top_k=5)
hybrid_recommendation('milk', user_item_matrix, cosine_sim, combined_rules, top_k=5)