In [39]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Load the dataset
file_path = r"C:\Users\sai\OneDrive\Desktop\Online retail.csv.xlsx"
df = pd.read_excel(file_path)

# Print the current columns and their count
print("Current Columns:")
print(df.columns)

# Check the number of columns
print(f"Number of columns in dataset: {len(df.columns)}")

# If the columns are not as expected, check the column names manually
print("Column Names and Data Types:")
print(df.dtypes)

# Rename columns if necessary, based on actual data
# Here, print actual columns to decide on renaming
print("Column Names:")
print(df.columns)

# If expected columns are not present, use actual column names found
# Assuming the dataset actually has columns named 'InvoiceNo', 'StockCode', 'Description', and 'Quantity'
# Adjust the names if they are different
expected_columns = ['InvoiceNo', 'StockCode', 'Description', 'Quantity']

# Rename columns if they do not match expected names
# For demonstration, let's print the first few rows to understand the data structure
print("First few rows of the dataset:")
print(df.head())

# Check if columns exist and rename if necessary
# For simplicity, let's assume you manually identify the column names

# Example adjustment
# df.columns = ['InvoiceNo', 'StockCode', 'Description', 'Quantity']

# Proceed with preprocessing only if columns exist
if all(col in df.columns for col in expected_columns):
    # Drop rows with missing values in 'InvoiceNo', 'StockCode', or 'Description'
    df.dropna(subset=['InvoiceNo', 'StockCode', 'Description'], inplace=True)
    
    # Convert 'InvoiceNo' and 'StockCode' to string type
    df['InvoiceNo'] = df['InvoiceNo'].astype(str)
    df['StockCode'] = df['StockCode'].astype(str)
    
    # Remove duplicate rows
    df.drop_duplicates(inplace=True)
    
    # Remove cancelled orders (InvoiceNo starting with 'C')
    df = df[~df['InvoiceNo'].str.startswith('C')]
    
    # Create basket matrix
    basket = df.groupby(['InvoiceNo', 'Description'])['Quantity'].sum().unstack().reset_index().fillna(0).set_index('InvoiceNo')
    
    # Convert quantities to 1 (purchased) or 0 (not purchased)
    basket = basket.applymap(lambda x: 1 if x > 0 else 0)
    
    # Print the first few rows of the basket matrix
    print("First few rows of the basket matrix:")
    print(basket.head())
    
    # Apply the Apriori algorithm
    frequent_itemsets = apriori(basket, min_support=0.01, use_colnames=True)
    print("Frequent Itemsets:")
    print(frequent_itemsets.head())
    
    # Generate association rules
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    print("Association Rules:")
    print(rules.head())
    
    # Analysis and Interpretation
    print("Summary of Association Rules:")
    print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].sort_values(by='lift', ascending=False))
    
    # Example Insights
    print("\nInsights:")
    for index, row in rules.iterrows():
        antecedent = list(row['antecedents'])
        consequent = list(row['consequents'])
        print(f"If a customer buys {', '.join(antecedent)}, they are {row['confidence']*100:.2f}% likely to also buy {', '.join(consequent)}. Lift: {row['lift']:.2f}")
else:
    print("Required columns are not present in the DataFrame.")


Current Columns:
Index(['shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil'], dtype='object')
Number of columns in dataset: 1
Column Names and Data Types:
shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil    object
dtype: object
Column Names:
Index(['shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil'], dtype='object')
First few rows of the dataset:
  shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yog

In [38]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Load the dataset
file_path = r"C:\Users\sai\OneDrive\Documents\Online retail.xlsx"
online_retail_data = pd.read_excel(file_path)

# Check column names
print(online_retail_data.columns)

# Data Preprocessing
online_retail_data.dropna(inplace=True)
online_retail_data.drop_duplicates(inplace=True)

# Check if 'InvoiceNo' and 'StockCode' exist
if 'InvoiceNo' in online_retail_data.columns and 'StockCode' in online_retail_data.columns:
    # Create a binary DataFrame
    binary_df = online_retail_data.groupby('InvoiceNo')['StockCode'].apply(lambda x: pd.Series(1, index=x)).unstack().fillna(0).astype(bool).astype(int)
else:
    print("Columns 'InvoiceNo' and 'StockCode' do not exist")

# Apply Apriori algorithm
if 'binary_df' in locals():
    frequent_itemsets = apriori(binary_df, min_support=0.01, use_colnames=True)
else:
    print("Cannot apply Apriori algorithm because binary DataFrame does not exist")

# Generate association rules
if 'frequent_itemsets' in locals():
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
else:
    print("Cannot generate association rules because frequent itemsets do not exist")

# Analysis and Interpretation
if 'rules' in locals():
    print("Association Rules:")
    print(rules.head())
    print("Summary of Association Rules:")
    print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].sort_values(by='lift', ascending=False))

    print("\nInsights:")
    for index, row in rules.iterrows():
        antecedent = list(row['antecedents'])
        consequent = list(row['consequents'])
        print(f"If a customer buys {', '.join(antecedent)}, they are {row['confidence']*100:.2f}% likely to also buy {', '.join(consequent)}. Lift: {row['lift']:.2f}")
else:
    print("Cannot perform analysis and interpretation because rules do not exist")

Index(['shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil'], dtype='object')
Columns 'InvoiceNo' and 'StockCode' do not exist
Cannot apply Apriori algorithm because binary DataFrame does not exist
Cannot generate association rules because frequent itemsets do not exist
Cannot perform analysis and interpretation because rules do not exist
