In [8]:
import random

import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, KNNBasic, Reader
from surprise.model_selection import train_test_split



In [9]:
print("Q1.1: First 10 faceplate transactions")
faceplate_df = pd.read_csv("Faceplate.csv")
display(faceplate_df.head(10))



Q1.1: First 10 faceplate transactions


Unnamed: 0,Transaction,Red,White,Blue,Orange,Green,Yellow
0,1,1,1,0,0,1,0
1,2,0,1,0,1,0,0
2,3,0,1,1,0,0,0
3,4,1,1,0,1,0,0
4,5,1,0,1,0,0,0
5,6,0,1,1,0,0,0
6,7,1,0,1,0,0,0
7,8,1,1,1,0,1,0
8,9,1,1,1,0,0,0
9,10,0,0,0,0,0,1


In [10]:
print("Q1.2: Support of itemset {red, white}")
red_white_transaction_count = ((faceplate_df["Red"] == 1) & (faceplate_df["White"] == 1)).sum()
faceplate_total_transactions = len(faceplate_df)
red_white_support = red_white_transaction_count / faceplate_total_transactions

print(f"Transactions containing Red and White: {red_white_transaction_count}")
print(f"Total transactions: {faceplate_total_transactions}")
print(f"Support of {{red, white}}: {red_white_support:.4f}")



Q1.2: Support of itemset {red, white}
Transactions containing Red and White: 4
Total transactions: 10
Support of {red, white}: 0.4000


In [11]:
print("Q2.1: Frequent itemsets with minimum support 0.2")
faceplate_binary_df = faceplate_df.drop(columns=["Transaction"])
faceplate_frequent_itemsets_df = apriori(
    faceplate_binary_df.astype(bool),
    min_support=0.2,
    use_colnames=True,
).sort_values(by=["support", "itemsets"], ascending=[False, True]).reset_index(drop=True)

display(faceplate_frequent_itemsets_df)



Q2.1: Frequent itemsets with minimum support 0.2


Unnamed: 0,support,itemsets
0,0.7,(White)
1,0.6,(Red)
2,0.6,(Blue)
3,0.4,"(Red, White)"
4,0.4,"(Red, Blue)"
5,0.4,"(White, Blue)"
6,0.2,(Orange)
7,0.2,(Green)
8,0.2,"(Red, Green)"
9,0.2,"(White, Orange)"


In [12]:
print("Q2.2: Association rules with minimum confidence 0.5")
faceplate_rules_df = association_rules(
    faceplate_frequent_itemsets_df,
    metric="confidence",
    min_threshold=0.5,
).sort_values(by="lift", ascending=False).reset_index(drop=True)

display(
    faceplate_rules_df[
        [
            "antecedents",
            "consequents",
            "support",
            "confidence",
            "lift",
            "leverage",
            "conviction",
            "antecedent support",
            "consequent support",
        ]
    ]
)



Q2.2: Association rules with minimum confidence 0.5


Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage,conviction,antecedent support,consequent support
0,"(Red, White)",(Green),0.2,0.5,2.5,0.12,1.6,0.4,0.2
1,(Green),"(Red, White)",0.2,1.0,2.5,0.12,inf,0.2,0.4
2,(Green),(Red),0.2,1.0,1.666667,0.08,inf,0.2,0.6
3,"(White, Green)",(Red),0.2,1.0,1.666667,0.08,inf,0.2,0.6
4,(Orange),(White),0.2,1.0,1.428571,0.06,inf,0.2,0.7
5,(Green),(White),0.2,1.0,1.428571,0.06,inf,0.2,0.7
6,"(Red, Green)",(White),0.2,1.0,1.428571,0.06,inf,0.2,0.7
7,(Red),(Blue),0.4,0.666667,1.111111,0.04,1.2,0.6,0.6
8,(Blue),(Red),0.4,0.666667,1.111111,0.04,1.2,0.6,0.6
9,(Red),(White),0.4,0.666667,0.952381,-0.02,0.9,0.6,0.7


In [13]:
print("Q2.3: Top 6 rules by lift")
faceplate_top_6_rules_df = faceplate_rules_df.head(6).copy()
faceplate_top_6_rules_df = faceplate_top_6_rules_df[
    ["antecedents", "consequents", "support", "confidence", "lift", "leverage"]
]
faceplate_top_6_rules_df["antecedents"] = faceplate_top_6_rules_df["antecedents"].apply(
    lambda itemset: ", ".join(sorted(itemset))
)
faceplate_top_6_rules_df["consequents"] = faceplate_top_6_rules_df["consequents"].apply(
    lambda itemset: ", ".join(sorted(itemset))
)

display(faceplate_top_6_rules_df)



Q2.3: Top 6 rules by lift


Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage
0,"Red, White",Green,0.2,0.5,2.5,0.12
1,Green,"Red, White",0.2,1.0,2.5,0.12
2,Green,Red,0.2,1.0,1.666667,0.08
3,"Green, White",Red,0.2,1.0,1.666667,0.08
4,Orange,White,0.2,1.0,1.428571,0.06
5,Green,White,0.2,1.0,1.428571,0.06


In [14]:
print("Q2.4: Sentence for highest-lift rule")
highest_lift_faceplate_rule = faceplate_rules_df.iloc[0]
highest_lift_antecedent_text = ", ".join(sorted(highest_lift_faceplate_rule["antecedents"]))
highest_lift_consequent_text = ", ".join(sorted(highest_lift_faceplate_rule["consequents"]))
highest_lift_confidence_pct = highest_lift_faceplate_rule["confidence"] * 100
highest_lift_value = highest_lift_faceplate_rule["lift"]

print(
    f"If [{highest_lift_antecedent_text}] are purchased, then with confidence "
    f"{highest_lift_confidence_pct:.1f}% [{highest_lift_consequent_text}] will also be purchased. "
    f"This rule has a lift ratio of {highest_lift_value:.2f}."
)



Q2.4: Sentence for highest-lift rule
If [Red, White] are purchased, then with confidence 50.0% [Green] will also be purchased. This rule has a lift ratio of 2.50.


In [15]:
print("Q3.1: Book data binary incidence matrix")
book_raw_df = pd.read_csv("CharlesBookClub.csv")
book_excluded_columns = [
    "Seq#",
    "ID#",
    "Gender",
    "M",
    "R",
    "F",
    "FirstPurch",
    "Related Purchase",
    "Mcode",
    "Rcode",
    "Fcode",
    "Yes_Florence",
    "No_Florence",
]
book_item_df = book_raw_df.drop(columns=book_excluded_columns)
book_binary_df = (book_item_df > 0).astype(int)

display(book_binary_df.head(10))



Q3.1: Book data binary incidence matrix


Unnamed: 0,ChildBks,YouthBks,CookBks,DoItYBks,RefBks,ArtBks,GeogBks,ItalCook,ItalAtlas,ItalArt,Florence
0,0,1,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,1,0,1,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,0


In [16]:
print("Q3.2: Apriori with minimum support count 200")
book_min_support_count = 200
book_total_transactions = len(book_binary_df)
book_min_support_ratio = book_min_support_count / book_total_transactions

book_frequent_itemsets_df = apriori(
    book_binary_df.astype(bool),
    min_support=book_min_support_ratio,
    use_colnames=True,
).sort_values(by="support", ascending=False).reset_index(drop=True)

print(f"Minimum support ratio used: {book_min_support_ratio:.4f}")
print(f"Number of frequent itemsets: {len(book_frequent_itemsets_df)}")
display(book_frequent_itemsets_df)



Q3.2: Apriori with minimum support count 200
Minimum support ratio used: 0.0500
Number of frequent itemsets: 61


Unnamed: 0,support,itemsets
0,0.41550,(CookBks)
1,0.39400,(ChildBks)
2,0.26675,(GeogBks)
3,0.25475,(DoItYBks)
4,0.24200,"(ChildBks, CookBks)"
...,...,...
56,0.05375,"(ChildBks, DoItYBks, ArtBks)"
57,0.05300,"(ArtBks, DoItYBks, CookBks)"
58,0.05150,"(CookBks, ArtBks, YouthBks)"
59,0.05100,"(ChildBks, GeogBks, ArtBks)"


In [17]:
print("Q3.3: Top 25 book rules by lift")
book_rules_df = association_rules(
    book_frequent_itemsets_df,
    metric="confidence",
    min_threshold=0.5,
)

book_top_25_rules_df = (
    book_rules_df.sort_values(by="lift", ascending=False)
    .loc[:, ["antecedents", "consequents", "support", "confidence", "lift", "leverage"]]
    .head(25)
    .reset_index(drop=True)
)
book_top_25_rules_df["antecedents"] = book_top_25_rules_df["antecedents"].apply(
    lambda itemset: ", ".join(sorted(itemset))
)
book_top_25_rules_df["consequents"] = book_top_25_rules_df["consequents"].apply(
    lambda itemset: ", ".join(sorted(itemset))
)

display(book_top_25_rules_df)



Q3.3: Top 25 book rules by lift


Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage
0,"RefBks, YouthBks","ChildBks, CookBks",0.05525,0.68,2.809917,0.035588
1,"DoItYBks, RefBks","ChildBks, CookBks",0.06125,0.662162,2.736207,0.038865
2,"DoItYBks, YouthBks","ChildBks, CookBks",0.067,0.64891,2.681448,0.042014
3,"GeogBks, RefBks","ChildBks, CookBks",0.05025,0.614679,2.539995,0.030467
4,"GeogBks, YouthBks","ChildBks, CookBks",0.06325,0.605263,2.501087,0.037961
5,"DoItYBks, GeogBks","ChildBks, CookBks",0.0605,0.59901,2.475248,0.036058
6,"ChildBks, CookBks, GeogBks",YouthBks,0.06325,0.577626,2.424452,0.037162
7,"ChildBks, CookBks, RefBks",DoItYBks,0.06125,0.591787,2.323013,0.034883
8,"DoItYBks, GeogBks",YouthBks,0.0545,0.539604,2.264864,0.030437
9,"ChildBks, CookBks, RefBks",YouthBks,0.05525,0.533816,2.240573,0.030591


In [18]:
print("Q4.1: Rule with highest support from book rules")
book_highest_support_rule_df = (
    book_rules_df.sort_values(by="support", ascending=False)
    .head(1)
    .loc[:, ["antecedents", "consequents", "support", "confidence", "lift"]]
    .copy()
)
book_highest_support_rule_df["antecedents"] = book_highest_support_rule_df["antecedents"].apply(
    lambda itemset: ", ".join(sorted(itemset))
)
book_highest_support_rule_df["consequents"] = book_highest_support_rule_df["consequents"].apply(
    lambda itemset: ", ".join(sorted(itemset))
)

display(book_highest_support_rule_df)



Q4.1: Rule with highest support from book rules


Unnamed: 0,antecedents,consequents,support,confidence,lift
0,ChildBks,CookBks,0.242,0.614213,1.478251


In [19]:
print("Q4.2: Highest support rule vs highest lift rule")
book_highest_support_rule = book_rules_df.sort_values(by="support", ascending=False).iloc[0]
book_highest_lift_rule = book_rules_df.sort_values(by="lift", ascending=False).iloc[0]

book_rule_comparison_df = pd.DataFrame(
    [
        {
            "rule_type": "Highest Support Rule",
            "antecedents": ", ".join(sorted(book_highest_support_rule["antecedents"])),
            "consequents": ", ".join(sorted(book_highest_support_rule["consequents"])),
            "support": book_highest_support_rule["support"],
            "confidence": book_highest_support_rule["confidence"],
            "lift": book_highest_support_rule["lift"],
        },
        {
            "rule_type": "Highest Lift Rule",
            "antecedents": ", ".join(sorted(book_highest_lift_rule["antecedents"])),
            "consequents": ", ".join(sorted(book_highest_lift_rule["consequents"])),
            "support": book_highest_lift_rule["support"],
            "confidence": book_highest_lift_rule["confidence"],
            "lift": book_highest_lift_rule["lift"],
        },
    ]
)

display(book_rule_comparison_df)
print(
    f"Highest-lift rule has lift={book_highest_lift_rule['lift']:.3f} and support={book_highest_lift_rule['support']:.3f}."
)
print(
    f"Highest-support rule has support={book_highest_support_rule['support']:.3f} and lift={book_highest_support_rule['lift']:.3f}."
)
print(
    "High-lift rules are more efficient but often cover fewer transactions; high-support rules affect more transactions but are less selective."
)



Q4.2: Highest support rule vs highest lift rule


Unnamed: 0,rule_type,antecedents,consequents,support,confidence,lift
0,Highest Support Rule,ChildBks,CookBks,0.242,0.614213,1.478251
1,Highest Lift Rule,"RefBks, YouthBks","ChildBks, CookBks",0.05525,0.68,2.809917


Highest-lift rule has lift=2.810 and support=0.055.
Highest-support rule has support=0.242 and lift=1.478.
High-lift rules are more efficient but often cover fewer transactions; high-support rules affect more transactions but are less selective.


In [20]:
print("Q4.3: Top 10 by lift and lowest confidence among them")
book_top_10_lift_rules_df = book_rules_df.sort_values(by="lift", ascending=False).head(10).copy()
book_lowest_confidence_in_top_10_df = (
    book_top_10_lift_rules_df.sort_values(by="confidence", ascending=True)
    .head(1)
    .loc[:, ["antecedents", "consequents", "support", "confidence", "lift", "leverage"]]
    .copy()
)

book_top_10_lift_display_df = book_top_10_lift_rules_df[
    ["antecedents", "consequents", "support", "confidence", "lift", "leverage"]
].copy()
book_top_10_lift_display_df["antecedents"] = book_top_10_lift_display_df["antecedents"].apply(
    lambda itemset: ", ".join(sorted(itemset))
)
book_top_10_lift_display_df["consequents"] = book_top_10_lift_display_df["consequents"].apply(
    lambda itemset: ", ".join(sorted(itemset))
)
book_lowest_confidence_in_top_10_df["antecedents"] = book_lowest_confidence_in_top_10_df["antecedents"].apply(
    lambda itemset: ", ".join(sorted(itemset))
)
book_lowest_confidence_in_top_10_df["consequents"] = book_lowest_confidence_in_top_10_df["consequents"].apply(
    lambda itemset: ", ".join(sorted(itemset))
)

display(book_top_10_lift_display_df.reset_index(drop=True))
display(book_lowest_confidence_in_top_10_df.reset_index(drop=True))



Q4.3: Top 10 by lift and lowest confidence among them


Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage
0,"RefBks, YouthBks","ChildBks, CookBks",0.05525,0.68,2.809917,0.035588
1,"DoItYBks, RefBks","ChildBks, CookBks",0.06125,0.662162,2.736207,0.038865
2,"DoItYBks, YouthBks","ChildBks, CookBks",0.067,0.64891,2.681448,0.042014
3,"GeogBks, RefBks","ChildBks, CookBks",0.05025,0.614679,2.539995,0.030467
4,"GeogBks, YouthBks","ChildBks, CookBks",0.06325,0.605263,2.501087,0.037961
5,"DoItYBks, GeogBks","ChildBks, CookBks",0.0605,0.59901,2.475248,0.036058
6,"ChildBks, CookBks, GeogBks",YouthBks,0.06325,0.577626,2.424452,0.037162
7,"ChildBks, CookBks, RefBks",DoItYBks,0.06125,0.591787,2.323013,0.034883
8,"DoItYBks, GeogBks",YouthBks,0.0545,0.539604,2.264864,0.030437
9,"ChildBks, CookBks, RefBks",YouthBks,0.05525,0.533816,2.240573,0.030591


Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage
0,"ChildBks, CookBks, RefBks",YouthBks,0.05525,0.533816,2.240573,0.030591


In [21]:
print("Q5.1: Synthetic binary dataset (50 transactions, 9 items)")
random.seed(0)
synthetic_transaction_count = 50
synthetic_item_count = 9
synthetic_item_columns = [f"Item_{item_index}" for item_index in range(1, synthetic_item_count + 1)]

synthetic_transaction_rows = [
    [random.randint(0, 1) for _ in range(synthetic_item_count)]
    for _ in range(synthetic_transaction_count)
]
synthetic_binary_df = pd.DataFrame(synthetic_transaction_rows, columns=synthetic_item_columns)

display(synthetic_binary_df.head(10))
print(f"Shape: {synthetic_binary_df.shape}")



Q5.1: Synthetic binary dataset (50 transactions, 9 items)


Unnamed: 0,Item_1,Item_2,Item_3,Item_4,Item_5,Item_6,Item_7,Item_8,Item_9
0,1,1,0,1,1,1,1,1,1
1,0,0,1,0,0,1,0,1,0
2,0,1,1,0,1,1,1,0,1
3,1,1,0,0,0,1,0,1,1
4,0,1,0,0,0,0,0,1,0
5,0,1,1,0,1,1,0,1,0
6,1,1,0,1,1,0,1,0,0
7,0,0,1,1,0,0,0,0,0
8,0,1,1,0,0,1,1,1,1
9,1,0,1,0,1,1,0,0,0


Shape: (50, 9)


In [22]:
print("Q5.2: Apriori on synthetic data (support count=2, confidence=0.7)")
synthetic_min_support_count = 2
synthetic_min_support_ratio = synthetic_min_support_count / len(synthetic_binary_df)

synthetic_frequent_itemsets_df = apriori(
    synthetic_binary_df.astype(bool),
    min_support=synthetic_min_support_ratio,
    use_colnames=True,
).sort_values(by="support", ascending=False).reset_index(drop=True)

synthetic_rules_df = association_rules(
    synthetic_frequent_itemsets_df,
    metric="confidence",
    min_threshold=0.7,
).sort_values(by="lift", ascending=False).reset_index(drop=True)

print(f"Minimum support ratio used: {synthetic_min_support_ratio:.4f}")
print(f"Frequent itemsets found: {len(synthetic_frequent_itemsets_df)}")
print(f"Association rules found: {len(synthetic_rules_df)}")
display(synthetic_frequent_itemsets_df)
display(synthetic_rules_df)



Q5.2: Apriori on synthetic data (support count=2, confidence=0.7)
Minimum support ratio used: 0.0400
Frequent itemsets found: 256
Association rules found: 179


Unnamed: 0,support,itemsets
0,0.64,(Item_6)
1,0.64,(Item_2)
2,0.56,(Item_1)
3,0.48,(Item_4)
4,0.44,(Item_7)
...,...,...
251,0.04,"(Item_4, Item_7, Item_6, Item_2)"
252,0.04,"(Item_4, Item_5, Item_8, Item_2)"
253,0.04,"(Item_4, Item_5, Item_6, Item_2)"
254,0.04,"(Item_4, Item_3, Item_6)"


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,"(Item_9, Item_7, Item_1, Item_2)","(Item_4, Item_8)",0.04,0.14,0.04,1.000000,7.142857,1.0,0.0344,inf,0.895833,0.285714,1.000000,0.642857
1,"(Item_9, Item_1, Item_7)","(Item_4, Item_8)",0.10,0.14,0.08,0.800000,5.714286,1.0,0.0660,4.30,0.916667,0.500000,0.767442,0.685714
2,"(Item_4, Item_7, Item_8, Item_2)","(Item_9, Item_1)",0.04,0.20,0.04,1.000000,5.000000,1.0,0.0320,inf,0.833333,0.200000,1.000000,0.600000
3,"(Item_9, Item_1, Item_6, Item_7)","(Item_4, Item_5)",0.04,0.22,0.04,1.000000,4.545455,1.0,0.0312,inf,0.812500,0.181818,1.000000,0.590909
4,"(Item_9, Item_5, Item_1, Item_7)","(Item_4, Item_6)",0.04,0.24,0.04,1.000000,4.166667,1.0,0.0304,inf,0.791667,0.166667,1.000000,0.583333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,"(Item_5, Item_7)",(Item_2),0.22,0.64,0.16,0.727273,1.136364,1.0,0.0192,1.32,0.153846,0.228571,0.242424,0.488636
175,"(Item_5, Item_7)",(Item_6),0.22,0.64,0.16,0.727273,1.136364,1.0,0.0192,1.32,0.153846,0.228571,0.242424,0.488636
176,"(Item_9, Item_1, Item_2)",(Item_6),0.14,0.64,0.10,0.714286,1.116071,1.0,0.0104,1.26,0.120930,0.147059,0.206349,0.435268
177,"(Item_5, Item_1, Item_2)",(Item_6),0.14,0.64,0.10,0.714286,1.116071,1.0,0.0104,1.26,0.120930,0.147059,0.206349,0.435268


In [23]:
print("Q5.3: Top 6 rules by uplift ratio (lift)")
synthetic_top_6_uplift_rules_df = (
    synthetic_rules_df.sort_values(by="lift", ascending=False)
    .loc[:, ["antecedents", "consequents", "support", "confidence", "lift"]]
    .head(6)
    .reset_index(drop=True)
)
synthetic_top_6_uplift_rules_df["antecedents"] = synthetic_top_6_uplift_rules_df["antecedents"].apply(
    lambda itemset: ", ".join(sorted(itemset))
)
synthetic_top_6_uplift_rules_df["consequents"] = synthetic_top_6_uplift_rules_df["consequents"].apply(
    lambda itemset: ", ".join(sorted(itemset))
)

display(synthetic_top_6_uplift_rules_df)
print(
    f"Highest uplift ratio observed: {synthetic_top_6_uplift_rules_df['lift'].max():.3f}."
)
print(
    "Yes, high uplift can still appear in random data due to chance, especially when support is low."
)



Q5.3: Top 6 rules by uplift ratio (lift)


Unnamed: 0,antecedents,consequents,support,confidence,lift
0,"Item_1, Item_2, Item_7, Item_9","Item_4, Item_8",0.04,1.0,7.142857
1,"Item_1, Item_7, Item_9","Item_4, Item_8",0.08,0.8,5.714286
2,"Item_2, Item_4, Item_7, Item_8","Item_1, Item_9",0.04,1.0,5.0
3,"Item_1, Item_6, Item_7, Item_9","Item_4, Item_5",0.04,1.0,4.545455
4,"Item_1, Item_5, Item_7, Item_9","Item_4, Item_6",0.04,1.0,4.166667
5,"Item_5, Item_7, Item_8","Item_1, Item_2, Item_6",0.04,1.0,4.166667


Highest uplift ratio observed: 7.143.
Yes, high uplift can still appear in random data due to chance, especially when support is low.


In [24]:
print("Q6.1: Synthetic ratings dataset (seed=0, 5000 ratings)")
np.random.seed(0)
ratings_count = 5000
ratings_df = pd.DataFrame(
    {
        "itemID": np.random.randint(0, 100, size=ratings_count),
        "userID": np.random.randint(0, 1000, size=ratings_count),
        "rating": np.random.randint(1, 6, size=ratings_count),
    }
)

display(ratings_df.head(10))
print(f"Shape: {ratings_df.shape}")



Q6.1: Synthetic ratings dataset (seed=0, 5000 ratings)


Unnamed: 0,itemID,userID,rating
0,44,187,3
1,47,507,3
2,64,493,2
3,67,183,1
4,67,893,3
5,9,673,4
6,83,267,3
7,21,639,1
8,36,987,2
9,87,802,1


Shape: (5000, 3)


In [25]:
print("Q6.2: Surprise dataset conversion and train/test dimensions")
surprise_ratings_df = ratings_df[["userID", "itemID", "rating"]].copy()
surprise_reader = Reader(rating_scale=(1, 5))
surprise_dataset = Dataset.load_from_df(surprise_ratings_df, surprise_reader)
trainset, testset = train_test_split(surprise_dataset, test_size=0.2, random_state=0)

print(f"Train ratings: {trainset.n_ratings}")
print(f"Train users: {trainset.n_users}")
print(f"Train items: {trainset.n_items}")
print(f"Test ratings: {len(testset)}")
print(f"Test users: {len({user_id for user_id, _, _ in testset})}")
print(f"Test items: {len({item_id for _, item_id, _ in testset})}")



Q6.2: Surprise dataset conversion and train/test dimensions
Train ratings: 4000
Train users: 982
Train items: 100
Test ratings: 1000
Test users: 640
Test items: 100


In [26]:
print("Q6.3: User cosine similarity and item-based CF model")
user_item_rating_matrix_df = ratings_df.pivot_table(
    index="userID",
    columns="itemID",
    values="rating",
    aggfunc="mean",
    fill_value=0,
)
user_similarity_matrix = cosine_similarity(user_item_rating_matrix_df)
user_similarity_df = pd.DataFrame(
    user_similarity_matrix,
    index=user_item_rating_matrix_df.index,
    columns=user_item_rating_matrix_df.index,
)

display(user_similarity_df.iloc[:5, :5])

item_based_similarity_options = {"name": "cosine", "user_based": False}
item_based_model = KNNBasic(sim_options=item_based_similarity_options, verbose=False)
item_based_model.fit(trainset)
print("Item-based collaborative filtering model trained.")



Q6.3: User cosine similarity and item-based CF model


userID,0,1,2,3,4
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.0,0.0,0.444116,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.444116,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0


Item-based collaborative filtering model trained.


In [27]:
print("Q6.4: Predict missing user-item ratings and list top recommendations")
recommendation_count = 5
raw_user_ids = [trainset.to_raw_uid(inner_user_id) for inner_user_id in trainset.all_users()]
raw_item_ids = [trainset.to_raw_iid(inner_item_id) for inner_item_id in trainset.all_items()]

observed_training_pairs = {
    (trainset.to_raw_uid(inner_user_id), trainset.to_raw_iid(inner_item_id))
    for inner_user_id, inner_item_id, _ in trainset.all_ratings()
}

missing_pair_predictions = []
for raw_user_id in raw_user_ids:
    for raw_item_id in raw_item_ids:
        if (raw_user_id, raw_item_id) not in observed_training_pairs:
            predicted_rating = item_based_model.predict(raw_user_id, raw_item_id).est
            missing_pair_predictions.append((raw_user_id, raw_item_id, predicted_rating))

missing_predictions_df = pd.DataFrame(
    missing_pair_predictions,
    columns=["userID", "itemID", "estimated_rating"],
)

user_recommendations_df = (
    missing_predictions_df.sort_values(["userID", "estimated_rating"], ascending=[True, False])
    .groupby("userID", as_index=False)
    .head(recommendation_count)
    .groupby("userID")["itemID"]
    .apply(list)
    .reset_index(name=f"recommended_top_{recommendation_count}_items")
)

print(f"Predictions generated: {len(missing_predictions_df)}")
display(user_recommendations_df)



Q6.4: Predict missing user-item ratings and list top recommendations
Predictions generated: 94264


Unnamed: 0,userID,recommended_top_5_items
0,0,"[50, 59, 72, 26, 12]"
1,2,"[27, 14, 63, 44, 64]"
2,3,"[55, 35, 81, 12, 60]"
3,4,"[14, 25, 45, 7, 34]"
4,5,"[52, 84, 10, 48, 49]"
...,...,...
977,995,"[31, 77, 99, 93, 36]"
978,996,"[76, 93, 44, 0, 65]"
979,997,"[27, 99, 86, 54, 55]"
980,998,"[76, 2, 38, 29, 52]"
