<a href="https://colab.research.google.com/github/GinuraAdikari/InsightHive/blob/Recommendation_Engine/metaPath2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install networkx gensim



In [None]:
import networkx as nx
import pandas as pd

# Read the engineered interaction features (one row per visitor/item pair in
# the preview below). The path is relative to the working directory and points
# into a Google Drive folder, so Drive must already be mounted there.
features_csv_path = 'drive/MyDrive/Colab Notebooks/features.csv'
features = pd.read_csv(features_csv_path)

# Show the first rows as a quick check of the columns that were loaded.
preview = features.head()
print(preview)


   visitorid  itemid  time_between_interaction  item_count  property_count  \
0          2  342816                         0           1               0   
1          6   65273                         0           1               0   
2         88   58827                         0           1               0   
3        120  153339                         0           1               0   
4        137  383819                         0           1               0   

   category_count  avg_interaction_time  time_on_platform  \
0               0          1.438971e+12                 0   
1               0          1.440915e+12                 0   
2               0          1.442087e+12                 0   
3               0          1.436024e+12                 0   
4               0          1.433149e+12                 0   

   time_between_actions  event_count  conversion rate  interaction_count_norm  \
0            40873501.5            1              0.0                     0.1   
1 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
split_options = {'test_size': 0.2, 'random_state': 42}
train_data, test_data = train_test_split(features, **split_options)

In [None]:
# Build an undirected user–item interaction graph from the feature table.
#
# NOTE(review): the graph is built from the full `features` frame, not from
# `train_data`, so held-out interactions also shape the embeddings. Confirm
# this is intended before evaluating against `test_data`.
G = nx.Graph()

# Add user nodes
users = features['visitorid'].unique()
G.add_nodes_from(users, node_type='user')

# Add item nodes
items = features['itemid'].unique()
G.add_nodes_from(items, node_type='item')

# Users and items share one node namespace. A value that is both a visitorid
# and an itemid collapses into a single node whose node_type was just
# overwritten with 'item', which breaks type-constrained walks through it.
shared_ids = set(users) & set(items)
if shared_ids:
    print(f"Warning: {len(shared_ids)} IDs occur as both visitorid and itemid; "
          "those users and items are merged into single 'item' nodes.")

# One edge per (user, item) pair carrying every feature as a named attribute.
# nx.Graph keeps a single edge per node pair, so calling add_edge repeatedly
# for the same pair only overwrites 'edge_type'/'weight' and keeps the last
# values. 'weight' holds the normalised interaction count (the direct
# interaction strength); the other features keep their column names.
edge_attribute_columns = {
    'weight': 'interaction_count_norm',
    'time_between_interaction': 'time_between_interaction',
    'avg_interaction_time': 'avg_interaction_time',
    'item_count': 'item_count',
    'property_count': 'property_count',
    'category_count': 'category_count',
    'item_popularity_score_norm': 'item_popularity_score_norm',
}
edge_columns = ['visitorid', 'itemid', *edge_attribute_columns.values()]

# itertuples() keeps each column's own dtype. iterrows() returns every row as
# a single-dtype Series, so with float feature columns present the integer
# IDs come back as floats; those floats become adjacency keys and are later
# written to the walk corpus as e.g. '342816.0', a different token from the
# '342816' produced when the same node starts a walk.
G.add_edges_from(
    (
        user_id,
        item_id,
        {'edge_type': 'interacts', **dict(zip(edge_attribute_columns, values))},
    )
    for user_id, item_id, *values in features[edge_columns].itertuples(index=False, name=None)
)

# (Optional) Add user node attributes for user behavior:
# for _, row in features.iterrows():
#     user_id = row['visitorid']
#     G.nodes[user_id]['time_on_platform'] = row['time_on_platform']
#     G.nodes[user_id]['time_between_actions'] = row['time_between_actions']
#     G.nodes[user_id]['event_count'] = row['event_count']
#     G.nodes[user_id]['conversion_rate'] = row['conversion rate']



In [None]:
# Report the size of the graph as a quick sanity check.
node_total = len(G)      # len() of a graph is its node count
edge_total = G.size()    # size() without a weight argument is the edge count
print("Number of nodes:", node_total)
print("Number of edges:", edge_total)

Number of nodes: 134638
Number of edges: 97977


In [None]:
import random

# Define a function for meta-path-based random walks
def generate_random_walks(graph, start_node, meta_path, walk_length):
    """Generate one meta-path-constrained random walk.

    Args:
        graph: Graph-like object exposing ``neighbors(node)`` and a
            ``nodes`` mapping whose values hold a ``'node_type'`` entry.
        start_node: Node the walk starts from (always the first element).
        meta_path (list): Sequence of node types to follow, e.g.
            ``['user', 'item', 'user']``. When the first and last types are
            equal the path is treated as closed: its last element is the
            first element of the next repetition, so the walk alternates
            user → item → user → item → ...
        walk_length (int): Maximum number of nodes in the walk.

    Returns:
        list: The visited nodes. Shorter than ``walk_length`` when a node has
        no neighbour of the required type.
    """
    # For a closed meta-path, drop the repeated closing type before cycling.
    # Cycling over the full list would demand two consecutive nodes of the
    # same type (user → user), which never exists in a user–item graph, so
    # every walk would stop after one pass through the meta-path.
    if len(meta_path) > 1 and meta_path[0] == meta_path[-1]:
        type_cycle = list(meta_path[:-1])
    else:
        type_cycle = list(meta_path)

    walk = [start_node]
    current_node = start_node

    while len(walk) < walk_length:
        # Type required at the next position of the walk.
        wanted_type = type_cycle[len(walk) % len(type_cycle)]

        # Keep only neighbours of the required type; nodes without a
        # 'node_type' attribute are never eligible.
        next_nodes = [
            neighbor for neighbor in graph.neighbors(current_node)
            if graph.nodes[neighbor].get('node_type') == wanted_type
        ]

        if not next_nodes:
            break

        # Choose a random next node
        current_node = random.choice(next_nodes)
        walk.append(current_node)

    return walk

# Define a meta-path (e.g., user → item → user)
meta_path = ['user', 'item', 'user']

# Walk-generation settings, named so they are easy to find and tune.
WALKS_PER_NODE = 10   # walks started from every user node
WALK_LENGTH = 10      # maximum number of nodes per walk
WALK_SEED = 42

# Seed Python's RNG so the walk corpus is the same on every run.
random.seed(WALK_SEED)

# Generate random walks for each user node
walks = [
    generate_random_walks(G, user, meta_path, walk_length=WALK_LENGTH)
    for user in users
    for _ in range(WALKS_PER_NODE)
]

# Save walks for training: one walk per line, node IDs separated by spaces.
with open('random_walks.txt', 'w') as f:
    f.writelines(' '.join(map(str, walk)) + '\n' for walk in walks)


In [None]:
from gensim.models import Word2Vec

# Train Metapath2Vec: a skip-gram (sg=1) Word2Vec model fitted on the saved
# walk corpus, where every node ID is treated as a word.
word2vec_options = dict(vector_size=128, window=5, min_count=1, sg=1, workers=4)
model = Word2Vec(corpus_file='random_walks.txt', **word2vec_options)

# Save embeddings in the plain-text word2vec format.
node_vectors = model.wv
node_vectors.save_word2vec_format('metapath2vec_embeddings.txt')

# Example: look up the embedding of one user node. Keys are the node IDs as
# strings, exactly as they were written to the walk file.
example_user_key = '2'  # Replace with the user ID of interest
user_embedding = node_vectors[example_user_key]
print(user_embedding)


[ 5.2145794e-03 -2.3036450e-03  3.4334564e-03  5.9358333e-04
 -6.5487125e-03  5.0616534e-03  7.3793111e-03  6.0154055e-03
  5.4838955e-03  6.6809310e-03 -6.2669902e-03 -1.2769230e-03
  5.7831610e-04 -1.0743703e-03  6.8364716e-03  9.1963438e-03
  8.6559996e-04  4.1008173e-03  4.5293239e-03  4.8663872e-03
  8.3552357e-03  6.3381302e-03  1.5802862e-03 -3.7694902e-03
  3.1492926e-04  7.9178158e-03  3.1649999e-03 -1.0929053e-03
  4.9517569e-03 -5.7838438e-03  6.5548075e-03 -9.7040343e-04
 -7.0426352e-03 -1.2505563e-03 -7.9454705e-03 -3.0279467e-03
  5.5028107e-03  1.2154589e-05 -9.0021006e-04 -7.5091985e-03
 -3.4678469e-03  5.8737057e-03 -2.8859708e-03 -1.0207070e-02
  2.4564913e-04  6.1017321e-03  7.1217446e-03 -4.8111039e-03
  4.3664915e-03 -9.6677458e-03 -2.8832490e-03  6.0047363e-03
  1.9921935e-03 -4.5526335e-03 -4.0819845e-03  7.0850411e-03
 -4.5934706e-03  9.1185932e-05 -2.0699122e-03  6.3867513e-03
 -9.6924258e-03 -4.4489000e-03 -9.3249884e-03 -9.0629701e-03
  3.1978930e-03 -1.39590

In [None]:
import numpy as np

def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        0.0 when either vector has zero norm (the ratio is undefined there
        and would otherwise evaluate to NaN with a runtime warning).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

# Look up the embeddings of two example users; keys are node IDs as strings
# (replace them with the user IDs you want to compare).
user1_embedding, user2_embedding = (model.wv[user_key] for user_key in ('2', '88'))

# Compare the two embeddings with cosine similarity and report the score.
similarity = cosine_similarity(user1_embedding, user2_embedding)
print(f"Cosine similarity between user1 and user2: {similarity}")

Cosine similarity between user1 and user2: 0.0655808076262474


In [None]:
import numpy as np

# Assuming you have already trained the Metapath2Vec model and saved it as 'model'
# and you have a DataFrame named 'features' containing user-item interactions.

def _token_to_user_id(token):
    """Parse an embedding vocabulary token (e.g. '88' or '88.0') into an int ID, or None."""
    try:
        return int(float(token))
    except (TypeError, ValueError, OverflowError):
        return None


def generate_recommendations(target_users, data, num_neighbors=10, embeddings=None):
    """
    Generates recommendations for a set of target users.

    For every target user the most similar *users* (by cosine similarity of
    their embeddings) are selected, and the items those neighbours interacted
    with, minus the items the target already interacted with, are returned.

    Args:
        target_users (list): List of target user IDs.
        data (pd.DataFrame): The DataFrame containing user-item interactions
            (train_data); must have 'visitorid' and 'itemid' columns.
        num_neighbors (int): Number of nearest neighbors to consider.
        embeddings: Optional keyed-vector object supporting ``index_to_key``
            and ``embeddings[key]``. Defaults to the global ``model.wv``.

    Returns:
        dict: A dictionary mapping target user IDs to lists of recommended item IDs.
        Each list is ordered by neighbour similarity and contains every item
        at most once. A target user without an embedding gets an empty list.
    """
    wv = model.wv if embeddings is None else embeddings

    # Build the user -> items lookup once instead of re-filtering the frame
    # for every neighbour and every candidate item.
    items_by_user = {}
    for visitor_id, item_id in zip(data['visitorid'].tolist(), data['itemid'].tolist()):
        items_by_user.setdefault(visitor_id, []).append(item_id)

    # The embedding vocabulary holds every graph node, items included. Only
    # tokens that parse to a visitor ID present in `data` can act as
    # neighbours; an item token would otherwise be looked up as if it were a
    # visitor ID.
    candidate_ids = []
    candidate_vectors = []
    for token in wv.index_to_key:
        parsed_id = _token_to_user_id(token)
        if parsed_id is not None and parsed_id in items_by_user:
            candidate_ids.append(parsed_id)
            candidate_vectors.append(wv[token])

    if candidate_ids:
        candidate_matrix = np.asarray(candidate_vectors)
        candidate_norms = np.linalg.norm(candidate_matrix, axis=1)

    recommendations = {}
    for target_user_id in target_users:
        # 1. Calculate Similarities
        target_key = str(target_user_id)  # vocabulary keys are strings
        try:
            target_embedding = wv[target_key]
        except KeyError:
            # No embedding for this user: nothing can be recommended.
            recommendations[target_user_id] = []
            continue

        # 2. Select Nearest Neighbors
        neighbor_ids = []
        if candidate_ids:
            denominators = candidate_norms * np.linalg.norm(target_embedding)
            # Zero-norm vectors get similarity 0 instead of NaN.
            similarities = np.divide(
                candidate_matrix @ target_embedding,
                denominators,
                out=np.zeros(len(candidate_ids), dtype=float),
                where=denominators > 0,
            )
            # Never pick the target itself, and pick each user only once even
            # if it appears under several tokens (e.g. '88' and '88.0').
            excluded_ids = {_token_to_user_id(target_key)}
            for row in np.argsort(-similarities, kind='stable'):
                if len(neighbor_ids) >= num_neighbors:
                    break
                neighbor_id = candidate_ids[row]
                if neighbor_id in excluded_ids:
                    continue
                excluded_ids.add(neighbor_id)
                neighbor_ids.append(neighbor_id)

        # 3. Generate Recommendations
        # Start from the target's own items so they are never recommended;
        # the same set then suppresses repeats across neighbours.
        seen_items = set(items_by_user.get(target_user_id, []))
        recommended_items = []
        for neighbor_id in neighbor_ids:
            for item in items_by_user[neighbor_id]:
                if item not in seen_items:
                    seen_items.add(item)
                    recommended_items.append(item)

        recommendations[target_user_id] = recommended_items

    return recommendations


def cosine_similarity(a, b):
    """Calculates the cosine similarity between two vectors.

    NOTE(review): this redefines the helper of the same name from the earlier
    similarity cell; one of the two definitions could be removed.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        0.0 when either vector has zero norm (the ratio is undefined there
        and would otherwise evaluate to NaN with a runtime warning).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

def get_items_for_user(user_id, data):
    """Return the item IDs a user interacted with, in row order.

    Args:
        user_id: Visitor ID compared against the 'visitorid' column.
        data (pd.DataFrame): Interaction table with 'visitorid' and 'itemid'
            columns.

    Returns:
        list: The 'itemid' values of all rows belonging to ``user_id``;
        empty when the user has no rows.
    """
    belongs_to_user = data['visitorid'] == user_id
    return data.loc[belongs_to_user, 'itemid'].tolist()

# Example usage: recommend items for a few users, using only the training
# split as the interaction history.
target_users = [2, 6, 88]  # Replace with your target user IDs
recommendations = generate_recommendations(target_users, train_data)

# Print one line per target user.
for user_id in recommendations:
    recommended_items = recommendations[user_id]
    print(f"Recommendations for user {user_id}: {recommended_items}")

Recommendations for user 2: [98412, 54124, 414939, 339763, 60139, 17478, 428891, 88886, 263741, 16949, 420228, 185712, 190487]
Recommendations for user 6: [57841, 339665, 314642, 187511]
Recommendations for user 88: [49466, 289051, 392004, 444951, 228844, 77233, 76570, 367664]
