In [1]:
import numpy as np  # NumPy for numerical operations
import pandas as pd  # Pandas for data manipulation and analysis
from scipy.io import loadmat

#### Loading rating data

In [2]:
# Specify the MATLAB file path for ratings data of Epinion data set
mat_file_path = 'Data/rating.mat'

# Load the MATLAB file
mat_data = loadmat(mat_file_path)

# Assuming the MATLAB file contains a variable named 'ratings' for ratings data
ratings_data = mat_data['rating']

# Create a DataFrame from the ratings data
ratings_df = pd.DataFrame(ratings_data, columns=['user_id', 'item_id', 'cat_id', 'rating'])


#### Loading trust relationship data

In [3]:
# Specify the relationships file path of Epinion data set
trust_path = 'Data/epinions.txt'

# Define the column names for the trust dataset
columns = ["trustor_id", "trustee_id", "trust_label"]

# Read the trust dataset from the specified file
trust_df = pd.read_csv(trust_path, sep='\t', header=None, names=columns, skiprows=1, skipfooter=1, engine='python')

# Replace trust labels {-1, 1} with {0, 1}
trust_df['trust_label'] = trust_df['trust_label'].replace({-1: 0})

# Drop rows where trustee ids are greater than 22166 in the original DataFrame
trust_df.drop(trust_df[trust_df['trustee_id'] > max(ratings_df['user_id'])].index, inplace=True)

#### Specifying unique ids in the rating data and trust data

In [4]:
# Count unique user IDs in the ratings dataset
unique_user_count = ratings_df['user_id'].nunique()
print("Unique number of user IDs:", unique_user_count)

# Count unique item IDs in the ratings dataset
unique_item_count = ratings_df['item_id'].nunique()
print("Unique number of item IDs:", unique_item_count)

# Count unique trustor IDs in the trust dataset
unique_trustor_count = trust_df['trustor_id'].nunique()
print("Unique number of trustor IDs:", unique_trustor_count)

# Count unique trustee IDs in the trust dataset
unique_trustee_count = trust_df['trustee_id'].nunique()
print("Unique number of trustee IDs:", unique_trustee_count)


Unique number of user IDs: 22164
Unique number of item IDs: 296277
Unique number of trustor IDs: 1819
Unique number of trustee IDs: 8873


#### Calculating link conection similarity and item similarity as attributes (features) between nodes (trustors and trustees, for trust relations)
***using Jaccard and Adamic-Adar index***

In [5]:
# Define a function to calculate Jaccard similarity between two sets
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0.0

# set the rated items for each user
user_item_sets = ratings_df.groupby('user_id')['item_id'].apply(set).to_dict()

# Create an empty list to store Jaccard similarities
item_similarities = []

for index, row in trust_df.iterrows():
    trustor_id = row['trustor_id']
    trustee_id = row['trustee_id']

    # Get sets of rated items for trustor and trustee
    trustor_items = user_item_sets.get(trustor_id, set())
    trustee_items = user_item_sets.get(trustee_id, set())

    # Calculate Jaccard similarity based on commen rated items
    similarity = jaccard_similarity(trustor_items, trustee_items)
    item_similarities.append(similarity)

# Add the calculated Adamic-Adar scores to the trust dataframe
trust_df['item_similarity'] = item_similarities

# Print the trust dataframe with the new column
print(trust_df[['trustor_id', 'trustee_id', 'item_similarity']])


        trustor_id  trustee_id  item_similarity
1                2           3              0.0
2                4           5              0.0
3                4         155              0.0
4                4         558              0.0
5                4        1509              0.0
...            ...         ...              ...
105055        2010       16147              0.0
105056        2010       16212              0.0
105057        2010       16331              0.0
105058        2010       16608              0.0
105059        2010       16639              0.0

[71407 rows x 3 columns]


In [6]:
import networkx as nx
from math import log

# Create a graph from the trust dataframe
G = nx.from_pandas_edgelist(trust_df, 'trustor_id', 'trustee_id')

# Calculate Adamic-Adar index for each trust relationship
adamic_adar_scores = []
for index, row in trust_df.iterrows():
    trustor_id = row['trustor_id']
    trustee_id = row['trustee_id']

    # Get common neighbors
    common_neighbors = list(nx.common_neighbors(G, trustor_id, trustee_id))

    # Calculate Adamic-Adar index
    adamic_adar_index = sum(1 / (log(G.degree(neighbor)) if G.degree(neighbor) > 1 else 1) for neighbor in common_neighbors)

    # Append the calculated index to the list
    adamic_adar_scores.append(adamic_adar_index)

# Add the calculated Adamic-Adar scores to the trust dataframe
trust_df['link_similarity'] = adamic_adar_scores

# Print the trust dataframe with the new column
print(trust_df[['trustor_id', 'trustee_id', 'link_similarity']])


        trustor_id  trustee_id  link_similarity
1                2           3         0.000000
2                4           5         0.169892
3                4         155         0.659386
4                4         558         0.194269
5                4        1509         0.238683
...            ...         ...              ...
105055        2010       16147         0.000000
105056        2010       16212         0.180337
105057        2010       16331         3.324407
105058        2010       16608         0.865039
105059        2010       16639         1.500470

[71407 rows x 3 columns]


#### Information Aggregation

In [7]:

# Generate a random weight matrix with values between 0 and 1
random_weight_matrix = np.random.rand(trust_df.shape[0], 1)

# Combine matrices using horizontal stacking
aggregated_matrix = np.hstack([trust_df['item_similarity'].values.reshape(-1, 1), 
                               trust_df['link_similarity'].values.reshape(-1, 1), 
                               random_weight_matrix])

trust_df['aggregated_similarity'] = aggregated_matrix[:, 2]

# Print the trust dataframe with the new column
print(trust_df[['trustor_id', 'trustee_id', 'aggregated_similarity']])


        trustor_id  trustee_id  aggregated_similarity
1                2           3               0.569635
2                4           5               0.510490
3                4         155               0.226366
4                4         558               0.941081
5                4        1509               0.789434
...            ...         ...                    ...
105055        2010       16147               0.751806
105056        2010       16212               0.372993
105057        2010       16331               0.492481
105058        2010       16608               0.888731
105059        2010       16639               0.065910

[71407 rows x 3 columns]


In [8]:

# Normalizing features

features_to_normalize = trust_df[['item_similarity', 'link_similarity', 'aggregated_similarity']]

# Calculate mean and std
mean = features_to_normalize.mean()
std = features_to_normalize.std()

# Normalize the features
normalized_features = (features_to_normalize - mean) / std

# Update the original DataFrame with normalized values
trust_df[['item_similarity', 'link_similarity', 'aggregated_similarity']] = normalized_features

trust_df

Unnamed: 0,trustor_id,trustee_id,trust_label,item_similarity,link_similarity,aggregated_similarity
1,2,3,1,-0.074927,-0.599191,0.240671
2,4,5,0,-0.074927,-0.571375,0.035448
3,4,155,0,-0.074927,-0.491233,-0.950400
4,4,558,1,-0.074927,-0.567384,1.529510
5,4,1509,0,-0.074927,-0.560113,1.003327
...,...,...,...,...,...,...
105055,2010,16147,1,-0.074927,-0.599191,0.872767
105056,2010,16212,1,-0.074927,-0.569665,-0.441636
105057,2010,16331,1,-0.074927,-0.054902,-0.027037
105058,2010,16608,0,-0.074927,-0.457562,1.347866


#### Develop Mirror to predict trust relationships

In [9]:
# Importing PyTorch and PyG modules, and the necessary libraries for this end
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import ChebConv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [10]:
''' # defines the MIRROR model using the RGCNConv (Relational Graph Convolutional Network Convolution) layer 
      for a relational graph classification task with Autoencoder.'''

# Define the autoencoder model
class Autoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Linear(input_dim, hidden_dim)
        self.decoder = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        encoded = F.relu(self.encoder(x))
        decoded = torch.sigmoid(self.decoder(encoded))
        return encoded, decoded

# Define the TrustRGCNAutoencoder model with the autoencoder
class TrustRGCNAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim_ae, hidden_dim_rgcn, output_dim, num_relations, num_bases):
        super(TrustRGCNAutoencoder, self).__init__()
        self.autoencoder = Autoencoder(input_dim, hidden_dim_ae)
        self.rgcn1 = ChebConv(hidden_dim_ae, hidden_dim_rgcn, K=2, normalization='sym')
        self.rgcn2 = ChebConv(hidden_dim_rgcn, hidden_dim_rgcn, K=2, normalization='sym')
        self.linear = nn.Linear(hidden_dim_rgcn, output_dim)

    def forward(self, data):
        x, edge_index, edge_type = data.x, data.edge_index, data.edge_type
        encoded, _ = self.autoencoder(x)  # Use only the encoded features
        x = F.relu(self.rgcn1(encoded, edge_index))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.rgcn2(x, edge_index))
        x = self.linear(x)
        return x


In [11]:
# prepares data for a PyTorch Geometric model

# Assuming you have unique node IDs for trustors and trustees
trustors = trust_df['trustor_id'].unique()
trustees = trust_df['trustee_id'].unique()

# Create a mapping of node IDs to indices
node_to_index = {node: index for index, node in enumerate(set(trustors) | set(trustees))}

# Map node IDs in the dataframe to indices
trust_df['trustor_index'] = trust_df['trustor_id'].map(node_to_index)
trust_df['trustee_index'] = trust_df['trustee_id'].map(node_to_index)

# Manually create edge_index and edge_type
trustor_indices = torch.tensor(trust_df['trustor_index'].values, dtype=torch.long)
trustee_indices = torch.tensor(trust_df['trustee_index'].values, dtype=torch.long)
edge_index = torch.stack([trustor_indices, trustee_indices], dim=0)
edge_type = torch.tensor(trust_df['trust_label'].values, dtype=torch.long)

# Features: Include similarity and any other relevant features
# For simplicity, assuming only 'link_similarity' is a feature
features = torch.tensor(trust_df[['item_similarity', 'link_similarity', 'aggregated_similarity']].values, dtype=torch.float)

# Labels: Assuming 'trust_label' is the target variable
labels = torch.tensor(trust_df['trust_label'].values, dtype=torch.long)

# Create a torch_geometric Data object
data = Data(x=features, edge_index=edge_index, y=labels, edge_type=edge_type)


In [12]:
# Split the dataset into training and testing sets
train_data, test_data = train_test_split(trust_df, test_size=0.2, random_state=42)

# Create separate Data objects for training and testing
train_features = torch.tensor(train_data[['item_similarity', 'link_similarity', 'aggregated_similarity']].values, dtype=torch.float).reshape(-1, 3)
train_labels = torch.tensor(train_data['trust_label'].values, dtype=torch.long)
train_edge_index = torch.stack(
    [torch.tensor(train_data['trustor_index'].values, dtype=torch.long),
     torch.tensor(train_data['trustee_index'].values, dtype=torch.long)],
    dim=0
)

train_edge_type = torch.tensor(train_data['trust_label'].values, dtype=torch.long)
train_data = Data(x=train_features, edge_index=train_edge_index, y=train_labels, edge_type=train_edge_type)

test_features = torch.tensor(test_data[['item_similarity', 'link_similarity', 'aggregated_similarity']].values, dtype=torch.float).reshape(-1, 3)
test_labels = torch.tensor(test_data['trust_label'].values, dtype=torch.long)
test_edge_index = torch.stack(
    [torch.tensor(test_data['trustor_index'].values, dtype=torch.long),
     torch.tensor(test_data['trustee_index'].values, dtype=torch.long)],
    dim=0
)

test_edge_type = torch.tensor(test_data['trust_label'].values, dtype=torch.long)
test_data = Data(x=test_features, edge_index=test_edge_index, y=test_labels, edge_type=test_edge_type)


In [13]:
# Initialize the TrustRGCNAutoencoder model
input_dim_ae = features.shape[1]
num_relations = len(trust_df['trust_label'].unique())
output_dim = 1  # Binary classification (trust/distrust)
hidden_dim_ae = 64  # Adjust based on your data
hidden_dim_rgcn = 128  # Adjust based on your data
model = TrustRGCNAutoencoder(input_dim_ae, hidden_dim_ae, hidden_dim_rgcn, output_dim, num_relations, num_bases=2)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


In [14]:
# Train the model
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(train_data)
    
    # Modify target to match the output shape
    target = train_data.y.float().view(-1, 1)
    
    loss = criterion(out, target)
    loss.backward()
    optimizer.step()

    # Print training loss for monitoring
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')


Epoch 0, Loss: 0.6968045830726624
Epoch 10, Loss: 0.47611555457115173
Epoch 20, Loss: 0.46949368715286255
Epoch 30, Loss: 0.4663015604019165
Epoch 40, Loss: 0.46648508310317993
Epoch 50, Loss: 0.4654458463191986
Epoch 60, Loss: 0.4646461606025696
Epoch 70, Loss: 0.4638225734233856
Epoch 80, Loss: 0.4640229642391205
Epoch 90, Loss: 0.4634787440299988
Epoch 100, Loss: 0.46297934651374817
Epoch 110, Loss: 0.463005006313324
Epoch 120, Loss: 0.4628426432609558
Epoch 130, Loss: 0.46257176995277405
Epoch 140, Loss: 0.4624241590499878
Epoch 150, Loss: 0.46243277192115784
Epoch 160, Loss: 0.46262502670288086
Epoch 170, Loss: 0.46221575140953064
Epoch 180, Loss: 0.46217212080955505
Epoch 190, Loss: 0.4621158242225647


In [15]:
# Evaluation on test data
model.eval()
with torch.no_grad():
    pred = model(test_data)

# Apply sigmoid to get probability scores
probabilities = torch.sigmoid(pred)

# Compute metrics
predicted_labels = ((probabilities) > 0.5) # 0.5 is the threshold
accuracy = accuracy_score(test_labels.numpy(), predicted_labels.numpy())
print(f'Accuracy: {accuracy}')

Accuracy: 0.8227839238201933
