In [None]:
import pandas as pd
from google.colab import drive
from mlxtend.preprocessing import TransactionEncoder
from itertools import combinations
from mlxtend.frequent_patterns import association_rules
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import zipfile
import requests
import os
import numpy as np

In [None]:
drive.mount('/content/drive')
!unzip /content/drive/MyDrive/CP421/online_retail.zip -d Data

In [None]:
file_path = "/content/Data/Online Retail.xlsx"
df = pd.read_excel(file_path)

In [None]:
df = df.dropna(subset=['CustomerID', 'StockCode'])
df = df[df['Quantity'] > 0]
non_product_codes = ['BANK CHARGES', 'C2', 'DOT', 'M', 'PADS', 'POST']
df = df[~df['StockCode'].isin(non_product_codes)]

In [6]:
print(df.head())

  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  
0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  
1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  
3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  


  and should_run_async(code)


In [7]:
# Ensures StockCode is treated as a string
df['StockCode'] = df['StockCode'].astype(str)

transactional_data = df.groupby('InvoiceNo')['StockCode'].apply(set).reset_index()
transactions = transactional_data['StockCode'].apply(list).tolist()

te = TransactionEncoder()
binary_matrix = te.fit(transactions).transform(transactions)

# Convert to DataFrame
binary_df = pd.DataFrame(binary_matrix, columns=te.columns_, index=transactional_data['InvoiceNo'])
print(binary_df.head())

  and should_run_async(code)


           10002  10080  10120  10123C  10124A  10124G  10125  10133  10135  \
InvoiceNo                                                                     
536365     False  False  False   False   False   False  False  False  False   
536366     False  False  False   False   False   False  False  False  False   
536367     False  False  False   False   False   False  False  False  False   
536368     False  False  False   False   False   False  False  False  False   
536369     False  False  False   False   False   False  False  False  False   

           11001  ...  90214O  90214P  90214R  90214S  90214T  90214U  90214V  \
InvoiceNo         ...                                                           
536365     False  ...   False   False   False   False   False   False   False   
536366     False  ...   False   False   False   False   False   False   False   
536367     False  ...   False   False   False   False   False   False   False   
536368     False  ...   False   False   F

In [8]:
min_sup = 0.02
num_transactions = len(binary_df)

# Function to calculate support
def get_support(itemset, transactions):
    count = sum(transactions[list(itemset)].all(axis=1))
    return count / num_transactions

# this generates frequent 1-itemsets
frequent_itemsets = {}
single_items = binary_df.columns
L1 = {frozenset([item]): get_support([item], binary_df) for item in single_items}
L1 = {k: v for k, v in L1.items() if v >= min_sup}
frequent_itemsets[1] = L1

# this generates all higher-order frequent itemsets
k = 2
while len(frequent_itemsets[k - 1]) > 0:
    prev_Lk = list(frequent_itemsets[k - 1].keys())
    candidate_itemsets = {frozenset(x) for x in combinations(set().union(*prev_Lk), k)}

    Lk = {itemset: get_support(itemset, binary_df) for itemset in candidate_itemsets if get_support(itemset, binary_df) >= min_sup}
    if len(Lk) == 0:
        break
    frequent_itemsets[k] = Lk
    k += 1

k_itemset_counts = {k: len(v) for k, v in frequent_itemsets.items()}

result_df = pd.DataFrame(list(k_itemset_counts.items()), columns=['k', 'Number of k-itemsets'])

print(result_df.head())


  and should_run_async(code)


   k  Number of k-itemsets
0  1                   208
1  2                    41
2  3                     1


In [9]:

# converts  frequent itemsets into format for  mlxtend
frequent_itemsets_df = pd.DataFrame([
    {'itemsets': itemset, 'support': support} for k, itemsets in frequent_itemsets.items() for itemset, support in itemsets.items()
])

# creates association rules
rules = association_rules(frequent_itemsets_df, metric="confidence", min_threshold=0.5)

rules_sorted = rules.sort_values(by="confidence", ascending=False)

total_rules = len(rules_sorted)
print(f"Total Association Rules: {total_rules}")

print(rules_sorted[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(5))

Total Association Rules: 32
       antecedents consequents   support  confidence       lift
28  (22698, 22699)     (22697)  0.021190    0.894495  23.825164
27  (22698, 22697)     (22699)  0.021190    0.847826  19.928786
9          (22698)     (22697)  0.024993    0.827338  22.036408
22         (22698)     (22699)  0.023689    0.784173  18.432564
0          (22697)     (22699)  0.029394    0.782923  18.403197


  and should_run_async(code)


In [None]:
url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
zip_path = "ml-100k.zip"
extract_path = "ml-100k"

# Download dataset
if not os.path.exists(zip_path):
    response = requests.get(url)
    with open(zip_path, "wb") as file:
        file.write(response.content)

# Extract dataset
if not os.path.exists(extract_path):
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall()

# Load ratings data
ratings_path = os.path.join(extract_path, "u.data")
column_names = ["user_id", "movie_id", "rating", "timestamp"]
ratings_df = pd.read_csv(ratings_path, sep="\t", names=column_names)

In [None]:
# Filters users with at least 20 ratings
user_counts = ratings_df["user_id"].value_counts()
active_users = user_counts[user_counts >= 20].index
filtered_ratings = ratings_df[ratings_df["user_id"].isin(active_users)]

# Filters movies rated by at least 20 users
movie_counts = filtered_ratings["movie_id"].value_counts()
popular_movies = movie_counts[movie_counts >= 20].index
filtered_ratings = filtered_ratings[filtered_ratings["movie_id"].isin(popular_movies)]

In [None]:
filtered_ratings = filtered_ratings.sort_values(by="timestamp")

# Performs sequential 80-20 split
train_df, test_df = train_test_split(filtered_ratings, test_size=0.2, shuffle=False)

# Ensure all users and movies in the test set exist in the training set
train_users = set(train_df["user_id"])
train_movies = set(train_df["movie_id"])

test_df = test_df[test_df["user_id"].isin(train_users) & test_df["movie_id"].isin(train_movies)]

In [None]:
# Create user-item matrix
train_matrix = train_df.pivot(index="user_id", columns="movie_id", values="rating").fillna(0).to_numpy()

# SVD decomposition
U, S, Vt = np.linalg.svd(train_matrix, full_matrices=False)

# Converts to a diagonal matrix
S = np.diag(S)

In [None]:
def reconstruct_matrix(k):
    S_k = S[:k, :k]  # only the top-k singular values
    U_k = U[:, :k]    # first k columns of U
    Vt_k = Vt[:k, :]  # first k rows of Vt

    return np.dot(np.dot(U_k, S_k), Vt_k)

def evaluate_predictions(true_df, predicted_matrix):
    true_values = []
    predicted_values = []

    for _, row in true_df.iterrows():
        user_id, movie_id, true_rating = row["user_id"], row["movie_id"], row["rating"]

        # Get predicted rating
        if user_id in predicted_matrix.index and movie_id in predicted_matrix.columns:
            predicted_rating = predicted_matrix.loc[user_id, movie_id]
            true_values.append(true_rating)
            predicted_values.append(predicted_rating)

    rmse = np.sqrt(mean_squared_error(true_values, predicted_values))
    mae = mean_absolute_error(true_values, predicted_values)

    return rmse, mae


In [24]:
k_values = [5, 10, 20, 40]
results = {}

for k in k_values:
    # Reconstruct matrix
    reconstructed_matrix = reconstruct_matrix(k)

    # Convert reconstructed matrix to DataFrame
    predicted_ratings = pd.DataFrame(reconstructed_matrix,
                                     index=train_df["user_id"].unique(),
                                     columns=train_df["movie_id"].unique())

    rmse, mae = evaluate_predictions(test_df, predicted_ratings)
    results[k] = {"RMSE": rmse, "MAE": mae}

# Converts to a DataFrame
results_df = pd.DataFrame(results).T

# Display results
print(results_df)

  and should_run_async(code)


        RMSE       MAE
5   3.349949  3.067795
10  3.355537  3.063695
20  3.375836  3.076674
40  3.396249  3.089789


Lower values of k such as k=5 perform best, as increasing k leads to slight
overfitting with diminishing improvements. Matrix factorization captures latent user-item
relationships and handles sparse data better, making it good for recommendations.
However, it struggles with the cold start problem, scalability, and it also assumes linear
interactions, limiting its ability to capture complex preferences.