## Improved CF technique leveraging community detection in social network analysis

Ideas:
- Use Last.FM dataset (music recommendation with user to user fiendship data)

In [1]:
%%capture
%pip install pandas numpy surprise scikit-learn networkx matplotlib

In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib as plt

## Reading and prerpocssing the data

In [3]:
# Step 1: Load and preprocess MovieLens 100K ratings
ratings_path100k = './Data/ml-100k/u.data'
users_path100k = './Data/ml-100k/u.user'
items_path100k = './Data/ml-100k/u.item'

# Read raw ratings file
ratings = pd.read_csv(ratings_path100k, sep='\t', names=["user_id", "item_id", "rating", "timestamp"]).drop(columns=["timestamp"])
users = pd.read_csv(users_path100k, sep='|', names=["user_id", "age", "gender", "occupation", "zip_code"]).drop(columns=["zip_code"])
items = pd.read_csv(items_path100k, sep='|', names=["item_id", "title", "release_date", "na", "URL", "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]).drop(columns=["title", "release_date", "na", "URL"])

display(ratings.head())
display(users.head())
display(items.head())

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


Unnamed: 0,user_id,age,gender,occupation
0,1,24,M,technician
1,2,53,F,other
2,3,23,M,writer
3,4,24,M,technician
4,5,33,F,other


Unnamed: 0,item_id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [4]:
# Build user-item rating matrix
rating_matrix = ratings.pivot(index='user_id', columns='item_id', values='rating')
rating_matrix_filled = rating_matrix.fillna(0)
# Display a small part of the rating matrix
print("\nUser-Item Rating Matrix (first 10 users, first 10 items):")
display(rating_matrix_filled.iloc[:10, :10])


User-Item Rating Matrix (first 10 users, first 10 items):


item_id,1,2,3,4,5,6,7,8,9,10
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0
7,0.0,0.0,0.0,5.0,0.0,0.0,5.0,5.0,5.0,4.0
8,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0
10,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,0.0


In [5]:
def build_user_genre_matrix(ratings, items, threshold=3.5):
    # Merge ratings with item genre info
    merged = ratings.merge(items, on='item_id')

    # Get list of genre columns (assuming they are all except 'item_id')
    genre_columns = [col for col in items.columns if col != 'item_id']

    # Multiply ratings with genre indicators
    for genre in genre_columns:
        merged[genre] = merged[genre] * merged['rating']

    # Group by user and compute average rating per genre
    genre_sums = merged.groupby('user_id')[genre_columns].sum()
    genre_counts = merged.groupby('user_id')[genre_columns].apply(lambda x: (x != 0).sum())
    genre_avgs = genre_sums / genre_counts

    # Binary thresholding
    user_genre_binary = (genre_avgs > threshold).astype(int)

    user_genre_binary.index.name = 'user_id'

    return user_genre_binary

In [6]:
binary_user_genre_matrix = build_user_genre_matrix(ratings, items, threshold=3.5)
binary_user_genre_matrix.head()

Unnamed: 0_level_0,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,1,0,0,0,0,0,0,1,1,0,1,0,0,1,1,1,1,1,1
2,0,1,1,1,0,1,1,0,1,0,1,0,0,0,1,1,1,1,0
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,1,1,1,1,0,0,1,1,1,1,1,1,1,0
5,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0


## Community detection algorithms

In [7]:

def local_contribution(G, C, node):
    """Calculate local contribution q for adding node to community C."""
    C_union = C | {node}
    subgraph = G.subgraph(C_union)
    Lin = subgraph.number_of_edges()
    Lout = sum(1 for u in C_union for v in G.neighbors(u) if v not in C_union)
    return Lin / (Lin + Lout) if (Lin + Lout) > 0 else 0

def community_mining(G):
    """Stage 1: Detect initial overlapping communities based on central nodes."""
    labeled = set()
    communities = []

    while len(labeled) < G.number_of_nodes():
        # Pick highest-degree unlabeled node as seed
        candidates = [n for n in G.nodes if n not in labeled]
        seed = max(candidates, key=G.degree)
        C = {seed}
        labeled.add(seed)
        Q = 0

        while True:
            neighbors = {v for u in C for v in G.neighbors(u) if v not in C}
            if not neighbors:
                break

            contributions = {j: local_contribution(G, C, j) for j in neighbors}
            j_star, q_max = max(contributions.items(), key=lambda item: item[1])

            if q_max >= Q:
                C.add(j_star)
                labeled.add(j_star)
                Q = q_max
            else:
                break

        communities.append(C)
    return communities

def merge_overlapping_communities(communities, threshold=0.7):
    """Stage 2: Merge communities with high overlap."""
    merged = True
    while merged:
        merged = False
        new_communities = []
        used = [False] * len(communities)

        for i, Ci in enumerate(communities):
            if used[i]:
                continue
            merged_comm = set(Ci)
            used[i] = True

            for j in range(i + 1, len(communities)):
                if used[j]:
                    continue
                Cj = communities[j]
                S = len(merged_comm & Cj) / len(merged_comm | Cj)
                if S >= threshold:
                    merged_comm |= Cj
                    used[j] = True
                    merged = True

            new_communities.append(merged_comm)

        communities = new_communities
    return communities

def central_node_overlapping_communities(G, overlap_threshold=0.7):
    """
    Central-node based overlapping community detection.
    Args:
        G: networkx Graph
        overlap_threshold: overlap ratio to merge communities
    Returns:
        List of sets, each a community
    """
    initial_comms = community_mining(G)
    final_comms = merge_overlapping_communities(initial_comms, threshold=overlap_threshold)
    return final_comms


# Test on Karate Club
G = nx.karate_club_graph()
print(f"Karate club graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
comms = central_node_overlapping_communities(G)
print("Detected communities:")
for i, comm in enumerate(comms, 1):
    print(f"Community {i}: {sorted(comm)}")


Karate club graph has 34 nodes and 78 edges.
Detected communities:
Community 1: [2, 8, 9, 14, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
Community 2: [0, 1, 2, 3, 7, 8, 9, 11, 12, 13, 17, 19, 21, 30]
Community 3: [4, 5, 6, 10, 16]


In [8]:
import networkx as nx
from networkx.algorithms.clique import find_cliques

def k_faction_community_detection(G, k=4, T=0.6, CONN=0.5):
    # Step 1: Find maximal cliques (factions)
    cliques = [set(c) for c in find_cliques(G) if len(c) >= k]

    # Step 2: Merge overlapping cliques into initial communities
    communities = []
    for clique in cliques:
        merged = False
        for i, existing in enumerate(communities):
            overlap = len(clique & existing) / min(len(clique), len(existing))
            if overlap >= T:
                communities[i] = existing | clique
                merged = True
                break
        if not merged:
            communities.append(clique)

    # Step 3: Merge communities based on connectivity
    def interconnectivity(comm1, comm2):
        inter_edges = 0
        total_edges = 0
        for u in comm1:
            for v in comm2:
                if G.has_edge(u, v):
                    inter_edges += 1
        total_edges = len(comm1) * len(comm2)
        return inter_edges / total_edges if total_edges else 0

    merged_flag = True
    while merged_flag:
        merged_flag = False
        new_communities = []
        skip = set()
        for i in range(len(communities)):
            if i in skip:
                continue
            for j in range(i + 1, len(communities)):
                if j in skip:
                    continue
                conn = interconnectivity(communities[i], communities[j])
                if conn >= CONN:
                    merged = communities[i] | communities[j]
                    new_communities.append(merged)
                    skip.update({i, j})
                    merged_flag = True
                    break
            if i not in skip:
                new_communities.append(communities[i])
        communities = new_communities

    # Step 4: Assign remaining nodes
    assigned = set().union(*communities)
    unassigned = set(G.nodes()) - assigned
    for node in unassigned:
        best_comm = None
        max_conn = -1
        for comm in communities:
            conn = sum(1 for neighbor in G.neighbors(node) if neighbor in comm)
            if conn > max_conn:
                max_conn = conn
                best_comm = comm
        if best_comm is not None:
            best_comm.add(node)
        else:
            communities.append({node})  # Isolated node gets its own community

    return communities


## Constructing the user-user network

Construct it base on co-rated items.

In [9]:
from collections import defaultdict
import networkx as nx

def build_user_item_bipartite(ratings):
    B = nx.Graph()
    for _, row in ratings.iterrows():
        user = f'u{row["user_id"]}'
        item = f'i{row["item_id"]}'
        B.add_node(user, bipartite=0)
        B.add_node(item, bipartite=1)
        B.add_edge(user, item)
    return B

In [10]:
from networkx.algorithms import bipartite

def project_user_graph(B):
    users = {n for n, d in B.nodes(data=True) if d["bipartite"] == 0}
    G_user = bipartite.weighted_projected_graph(B, users)
    return G_user

In [11]:
def filter_user_graph(G_user, min_common=5):
    G_filtered = nx.Graph()

    # Add strong edges (based on co-rated items)
    for u, v, data in G_user.edges(data=True):
        if data['weight'] >= min_common:
            G_filtered.add_edge(u, v, weight=data['weight'])

    # Re-add all user nodes (even if isolated)
    #G_filtered.add_nodes_from(G_user.nodes(data=True))  # keep attributes if any

    return G_filtered

In [12]:
B = build_user_item_bipartite(ratings)
G_user = project_user_graph(B)
G_filtered = filter_user_graph(G_user, min_common=20)
print(f"Projected user-user graph has {G_filtered.number_of_nodes()} nodes and {G_filtered.number_of_edges()} edges.")

Projected user-user graph has 908 nodes and 123565 edges.


Construct it based on user tag attributes.

In [13]:
import pandas as pd
import networkx as nx

def zip_similarity(zip1, zip2):
    zip1, zip2 = str(zip1), str(zip2)
    return sum(c1 == c2 for c1, c2 in zip(zip1[:3], zip2[:3])) / 3

def demographic_similarity(u1, u2, weights):
    age_sim = max(0, 1 - abs(u1.age - u2.age) / 50)
    gender_sim = 1 if u1.gender == u2.gender else 0
    occupation_sim = 1 if u1.occupation == u2.occupation else 0

    return (weights['age'] * age_sim +
            weights['gender'] * gender_sim +
            weights['occupation'] * occupation_sim)

def build_weighted_user_graph(users, weights, threshold=0.5):
    G = nx.Graph()
    for _, row in users.iterrows():
        G.add_node(row.user_id, age=row.age, gender=row.gender,
                   occupation=row.occupation, zip_code=row.zip_code)

    for i, u1 in users.iterrows():
        for j, u2 in users.iterrows():
            if i >= j:
                continue
            sim = demographic_similarity(u1, u2, weights)
            if sim >= threshold:
                G.add_edge(u1.user_id, u2.user_id, weight=sim)
    return G


In [14]:
users = pd.read_csv("Data/ml-100k/u.user", sep='|', names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])

weights = {
    'age': 0.4,
    'gender': 0.3,
    'occupation': 0.3
}

G_weighted = build_weighted_user_graph(users, weights, threshold=0.7)

print(f"Weighted user graph: {G_weighted.number_of_nodes()} nodes, {G_weighted.number_of_edges()} edges")

Weighted user graph: 943 nodes, 30982 edges


In [15]:
communities = k_faction_community_detection(G_weighted, k=20, T=0.5, CONN=0.5)
print(f"Detected {len(communities)} communities")

sum_c = 0
for i, comm in enumerate(communities, 1):
    sum_c += len(comm)# Show first 5
    print("Community of size", len(comm))
    print(f"Community {i}: {sorted(comm)}")

sum_c

Detected 28 communities
Community of size 44
Community 1: [1, 4, 31, 39, 69, 73, 75, 105, 106, 132, 179, 234, 268, 293, 301, 308, 318, 349, 369, 414, 431, 435, 456, 470, 472, 480, 481, 517, 564, 567, 573, 627, 641, 644, 651, 717, 752, 832, 860, 867, 889, 900, 926, 936]
Community of size 31
Community 2: [1, 4, 44, 77, 143, 197, 211, 244, 294, 311, 325, 339, 441, 443, 456, 458, 488, 545, 670, 690, 715, 717, 718, 738, 739, 753, 790, 812, 832, 850, 889]
Community of size 59
Community 3: [2, 5, 11, 12, 18, 20, 38, 46, 120, 126, 128, 129, 149, 155, 165, 169, 236, 256, 273, 292, 316, 342, 352, 362, 373, 376, 389, 417, 418, 424, 437, 460, 505, 544, 602, 613, 617, 629, 681, 698, 713, 719, 721, 732, 733, 734, 751, 760, 797, 805, 809, 810, 835, 856, 859, 885, 907, 914, 930]
Community of size 26
Community 4: [3, 21, 22, 28, 50, 196, 199, 201, 293, 370, 385, 392, 445, 464, 498, 548, 695, 741, 745, 772, 801, 829, 833, 842, 853, 896]
Community of size 22
Community 5: [3, 33, 37, 66, 135, 255, 267, 30

1222

## Step 3: Community‑Based Collaborative Filtering

In [16]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def corrected_cosine_similarity(matrix):
    # Subtract mean rating per user (row-wise mean)
    matrix_centered = matrix - np.nanmean(matrix, axis=1, keepdims=True)
    # Replace NaNs with 0 for similarity calculation
    matrix_centered = np.nan_to_num(matrix_centered)
    return cosine_similarity(matrix_centered)


In [17]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def compute_combined_similarity_to_user(user_item_matrix, user_category_matrix, target_user, k=10, lambda_=0.5):
    # Target user vectors
    target_rating_vector = user_item_matrix.loc[target_user].values.reshape(1, -1)
    target_category_vector = user_category_matrix.loc[target_user].values.reshape(1, -1)

    # Similarities
    sim_r = cosine_similarity(target_rating_vector, user_item_matrix.values)[0]
    sim_cat = cosine_similarity(target_category_vector, user_category_matrix.values)[0]

    # Set self-similarity to zero
    if target_user in user_item_matrix.index:
        idx = user_item_matrix.index.get_loc(target_user)
        sim_r[idx] = 0
        sim_cat[idx] = 0

    # Combine similarities
    combined = (1 - lambda_) * sim_r + lambda_ * sim_cat

    # Get top-k
    top_k_users = np.argsort(combined)[-k:][::-1]

    return top_k_users, combined[top_k_users]


In [18]:
x, z = compute_combined_similarity_to_user(rating_matrix_filled, binary_user_genre_matrix, 300)

In [19]:
x

array([340, 510, 431, 815, 399, 142, 303, 125, 469, 438])

In [22]:
rating_matrix_filled.shape

(943, 1682)

In [20]:
def predict_rating(user_id, item_id, communities, rating_matrix, item_category_matrix, k=10, lambda_=0.5):
    
    candidate_neighbours = set()
    for community in communities:
        if user_id in community:
            candidate_neighbours.update(community)
    
    """if not candidate_neighbours:
        # Fallback: average of non-zero ratings
        user_ratings = rating_matrix.loc[user_id]
        rated = user_ratings != 0
        return user_ratings[rated].mean() if rated.any() else 0"""
    
    candidate_neighbours = list(candidate_neighbours)

    #similar_users = rating_matrix.loc[candidate_neighbours, item_id].dropna().index
    ratings_series = rating_matrix.loc[candidate_neighbours, item_id]
    similar_users = ratings_series[ratings_series != 0].index

    if similar_users.empty:
        print(f"Item {item_id} has no ratings from similar users.")
        return None  # Cannot predict if no similar users have rated the movie
    
    selected_users = list(similar_users) + [user_id]
    print(selected_users)

    comm_user_item_matrix = rating_matrix.loc[selected_users]
    comm_item_cate_matrix = item_category_matrix.loc[selected_users]

    target_user = user_id

    top_k_rel, similarities = compute_combined_similarity_to_user(
        comm_user_item_matrix, comm_item_cate_matrix, target_user, k=k, lambda_=lambda_
    )

    comm_ids = list(comm_user_item_matrix.index)
    top_k_users = [comm_ids[i] for i in top_k_rel]

    # Get ratings for the target item
    ratings = rating_matrix.loc[top_k_users, item_id]

    weighted_sum = np.dot(ratings, similarities)
    sum_weights = np.sum(similarities)

    return weighted_sum / sum_weights

In [21]:
predict_rating(1, 20, communities, rating_matrix_filled, binary_user_genre_matrix, k=20, lambda_=0.5)

[1, 936, 458, 349, 234, 244, 1]


ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 3364 while Y.shape[1] == 1682

## Testing the approach

In [None]:
ratings.head()

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [None]:
from sklearn.model_selection import train_test_split

# Split the data (80% train, 20% test)
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=42)

# Reset index for better handling
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Check the size of each dataset
print(f"Train set size: {train_df.shape}")
print(f"Test set size: {test_df.shape}")

Train set size: (80000, 3)
Test set size: (20000, 3)


In [None]:
test_df

Unnamed: 0,user_id,item_id,rating
0,877,381,4
1,815,602,3
2,94,431,4
3,416,875,2
4,500,182,2
...,...,...,...
19995,72,591,5
19996,523,393,5
19997,606,287,4
19998,650,612,4


In [None]:
true_ratings = []
predicted_ratings = []

for idx, row in test_df.iterrows():
    user = row["user_id"]
    movie = row["item_id"]
    true_rating = row["rating"]

    pred = predict_rating(user, movie, communities, rating_matrix_filled, binary_user_genre_matrix, k=20, lambda_=0.5)

    if pred is not None:
        true_ratings.append(true_rating)
        predicted_ratings.append(pred)

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 3364 while Y.shape[1] == 1682

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, precision_score, recall_score

mae = mean_absolute_error(true_ratings, predicted_ratings)
rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))

print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")

threshold = 3.5
true_binary = [1 if r >= threshold else 0 for r in true_ratings]
pred_binary = [1 if p >= threshold else 0 for p in predicted_ratings]

precision = precision_score(true_binary, pred_binary)
recall = recall_score(true_binary, pred_binary)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
