In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


### Loading Datasets

In [2]:
movie_rate = pd.read_csv("train_data_movie_rate.csv")
movie_trust = pd.read_csv("train_data_movie_trust.csv")

### Displaying Datasets

In [3]:
print("\nMovie Ratings Info:")
print(movie_rate.info())


Movie Ratings Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34298 entries, 0 to 34297
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       34298 non-null  int64  
 1   user_id  34298 non-null  int64  
 2   item_id  34298 non-null  int64  
 3   label    34298 non-null  float64
dtypes: float64(1), int64(3)
memory usage: 1.0 MB
None


In [4]:
print("\nMovie Trust Info:")
print(movie_trust.info())


Movie Trust Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1853 entries, 0 to 1852
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   id               1853 non-null   int64
 1   user_id_trustor  1853 non-null   int64
 2   user_id_trustee  1853 non-null   int64
 3   trust_value      1853 non-null   int64
dtypes: int64(4)
memory usage: 58.0 KB
None


### Missing Values and Duplication

In [5]:
print("\nMissing values in Ratings Dataset:")
print(movie_rate.isnull().sum())


Missing values in Ratings Dataset:
id         0
user_id    0
item_id    0
label      0
dtype: int64


In [6]:
print("\nMissing values in Trust Dataset:")
print(movie_trust.isnull().sum())



Missing values in Trust Dataset:
id                 0
user_id_trustor    0
user_id_trustee    0
trust_value        0
dtype: int64


In [7]:
print("\nDuplicate rows in Ratings Dataset:", movie_rate.duplicated().sum())
print("Duplicate rows in Trust Dataset:", movie_trust.duplicated().sum())


Duplicate rows in Ratings Dataset: 0
Duplicate rows in Trust Dataset: 0


### Normalizing

In [8]:
# scaler = MinMaxScaler()
# movie_rate['label'] = scaler.fit_transform(movie_rate[['label']])
# print(movie_rate[['label']].head())

In [9]:
all_user_ids = pd.Index(
    pd.concat([
        movie_rate['user_id'],
        movie_trust['user_id_trustor'],
        movie_trust['user_id_trustee']
    ]).unique()
)
user_id_map = {id_: idx for idx, id_ in enumerate(all_user_ids)}


In [10]:
all_item_ids = pd.Index(movie_rate['item_id'].unique())
item_id_map = {id_: idx for idx, id_ in enumerate(all_item_ids)}


In [11]:
movie_rate['u'] = movie_rate['user_id'].map(user_id_map)
movie_rate['i'] = movie_rate['item_id'].map(item_id_map)

movie_trust['u'] = movie_trust['user_id_trustor'].map(user_id_map)
movie_trust['v'] = movie_trust['user_id_trustee'].map(user_id_map)


In [12]:
assert movie_rate[['u', 'i']].isnull().sum().sum() == 0
assert movie_trust[['u', 'v']].isnull().sum().sum() == 0


In [13]:
class TrustMF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=32):
        super().__init__()
        self.user_factors = nn.Embedding(num_users, embedding_dim)
        self.item_factors = nn.Embedding(num_items, embedding_dim)
        self.trust_factors = nn.Embedding(num_users, embedding_dim)

    def forward(self, u, i):
        user_vec = self.user_factors(u)
        item_vec = self.item_factors(i)
        return (user_vec * item_vec).sum(1)

    def trust_score(self, u, v):
        trustor_vec = self.user_factors(u)
        trustee_vec = self.trust_factors(v)
        return (trustor_vec * trustee_vec).sum(1)

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_users = len(user_id_map)
num_items = len(item_id_map)
embedding_dim = 32

model = TrustMF(num_users, num_items, embedding_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)
rating_loss_fn = nn.MSELoss()
trust_loss_fn = nn.BCEWithLogitsLoss()


In [15]:
rating_users = torch.LongTensor(movie_rate['u'].values).to(device)
rating_items = torch.LongTensor(movie_rate['i'].values).to(device)
rating_labels = torch.FloatTensor(movie_rate['label'].values).to(device)

In [16]:
trust_users = torch.LongTensor(movie_trust['u'].values).to(device)
trust_others = torch.LongTensor(movie_trust['v'].values).to(device)
trust_values = torch.FloatTensor(movie_trust['trust_value'].values).to(device)


In [17]:
for epoch in range(10):
    model.train()
    optimizer.zero_grad()

    # Rating loss
    preds = model(rating_users, rating_items)
    rating_loss = rating_loss_fn(preds, rating_labels)

    # Trust loss
    trust_preds = model.trust_score(trust_users, trust_others)
    trust_loss = trust_loss_fn(trust_preds, trust_values)

    # Total loss
    total_loss = rating_loss + 0.01 * trust_loss
    total_loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}: Rating Loss={rating_loss.item():.4f}, Trust Loss={trust_loss.item():.4f}")


Epoch 1: Rating Loss=41.0748, Trust Loss=2.4973
Epoch 2: Rating Loss=39.6107, Trust Loss=2.3983
Epoch 3: Rating Loss=38.2033, Trust Loss=2.3025
Epoch 4: Rating Loss=36.8515, Trust Loss=2.2101
Epoch 5: Rating Loss=35.5541, Trust Loss=2.1210
Epoch 6: Rating Loss=34.3098, Trust Loss=2.0352
Epoch 7: Rating Loss=33.1172, Trust Loss=1.9528
Epoch 8: Rating Loss=31.9748, Trust Loss=1.8735
Epoch 9: Rating Loss=30.8809, Trust Loss=1.7974
Epoch 10: Rating Loss=29.8340, Trust Loss=1.7242


In [18]:
model.eval()
with torch.no_grad():
    # Predictions
    pred_ratings = model(rating_users, rating_items)
    
    # Calculate Mean Absolute Error (MAE)
    mae = torch.mean(torch.abs(pred_ratings - rating_labels))
    
    # Calculate Root Mean Squared Error (RMSE)
    rmse = torch.sqrt(torch.mean((pred_ratings - rating_labels) ** 2))

    # Calculate threshold-based accuracy
    threshold = 0.5  # Define a threshold for acceptable difference between actual and predicted
    accuracy = torch.mean((torch.abs(pred_ratings - rating_labels) <= threshold).float())

print(f"Mean Absolute Error (MAE): {mae.item():.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse.item():.4f}")
print(f"Accuracy (within ±{threshold}): {accuracy.item() * 100:.2f}%")

Mean Absolute Error (MAE): 4.2576
Root Mean Squared Error (RMSE): 5.3696
Accuracy (within ±0.5): 8.46%


## svd



In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error

# Load the datasets
movie_rate = pd.read_csv('train_data_movie_rate.csv')
movie_trust = pd.read_csv('train_data_movie_trust.csv')

# Prepare the user-item interaction matrix
pivot_table = movie_rate.pivot(index='user_id', columns='item_id', values='label')
pivot_table = pivot_table.fillna(0)  # Fill missing values with 0

# Convert pivot table to a matrix form for matrix factorization
R = pivot_table.values
user_item_matrix = np.array(R)

# Perform SVD on the ratings matrix (Matrix Factorization)
svd = TruncatedSVD(n_components=50, random_state=42)  # You can tune the number of components
svd_matrix = svd.fit_transform(user_item_matrix)
print(f"SVD explained variance ratio: {svd.explained_variance_ratio_[:5]}")  # Print first 5 components

# Reconstruct the ratings matrix using SVD results
reconstructed_matrix = svd.inverse_transform(svd_matrix)

# Evaluate the performance of SVD reconstruction
mse = mean_squared_error(user_item_matrix, reconstructed_matrix)
rmse = np.sqrt(mse)
print(f"Reconstruction RMSE: {rmse:.4f}")


ValueError: Index contains duplicate entries, cannot reshape

In [4]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Create a DataFrame for trust relationships and user-item interaction
movie_trust['trust_value'] = movie_trust['trust_value'].fillna(0)  # Fill missing trust values
movie_rate = movie_rate.rename(columns={'user_id': 'user_id_trustor', 'item_id': 'item_id_rate', 'label': 'rating_value'})

# Merge trust data with ratings data
df = pd.merge(movie_rate, movie_trust, how='left', left_on='user_id_trustor', right_on='user_id_trustor')

# Prepare the dataset for training the machine learning model
X = df[['user_id_trustor', 'item_id_rate', 'trust_value']]  # Features: user, item, trust
y = df['rating_value']  # Target: ratings

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to DMatrix (XGBoost's optimized data structure)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set hyperparameters for XGBoost
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.1,
    'max_depth': 6,
    'n_estimators': 100
}

# Train the model
model = xgb.train(params, dtrain, num_boost_round=100)

# Make predictions
preds = model.predict(dtest)

# Calculate RMSE on the test set
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"Test RMSE (XGBoost): {rmse:.4f}")

# Calculate MAE
mae = mean_absolute_error(y_test, preds)
print(f"Test MAE (XGBoost): {mae:.4f}")



Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Test RMSE (XGBoost): 0.7931
Test MAE (XGBoost): 0.6215


In [None]:
# Already predicted: preds = model.predict(dtest)

# Calculate accuracy (within ±0.5)
tolerance = 0.5
correct_predictions = np.abs(preds - y_test.values) <= tolerance
accuracy = np.mean(correct_predictions) * 100  # in percentage

print(f"Accuracy (within ±0.5): {accuracy:.2f}%")


Accuracy (within ±0.5): 47.42%


: 