In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats import uniform, randint


In [None]:
# Number of most-trusted neighbours kept per user. Every neighbour contributes
# two feature columns, so each feature vector has 2 * K entries.
K = 11

# Input CSVs: movie ratings, user-to-user trust edges, and the rows to predict.
RATING_FILE = 'train_data_movie_rate.csv'
TRUST_FILE = 'train_data_movie_trust.csv'
TEST_FILE = 'test_data.csv'

# Destination CSV for the generated predictions.
OUT_FILE = 'predictions.csv'


In [3]:
# Load the three input tables in one pass (same read order as the constants):
#   ratings -> columns user_id, item_id, label
#   trust   -> columns user_id_trustor, user_id_trustee, trust_value
#   test_df -> columns user_id, item_id (rows to predict)
ratings, trust, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (RATING_FILE, TRUST_FILE, TEST_FILE)
)


In [4]:
# Order each trustor's outgoing edges from the strongest to the weakest trust.
trust_sorted = trust.sort_values(
    by=['user_id_trustor', 'trust_value'],
    ascending=[True, False],
)

# trustor id -> list of (at most K) trustee ids, most trusted first.
topk = (
    trust_sorted
    .groupby('user_id_trustor')['user_id_trustee']
    .apply(lambda trustees: trustees.iloc[:K].tolist())
    .to_dict()
)


In [5]:
# (user_id, item_id) -> rating label. If a pair occurs more than once,
# the last row wins.
rating_map = dict(
    zip(zip(ratings['user_id'], ratings['item_id']), ratings['label'])
)

# (trustor, trustee) -> trust value. Again the last duplicate wins.
trust_map = dict(
    zip(
        zip(trust['user_id_trustor'], trust['user_id_trustee']),
        trust['trust_value'],
    )
)


In [6]:
def make_features(user_id, item_id, *, k=None, neighbors=None,
                  ratings_lookup=None, trust_lookup=None, missing=-1):
    """
    Build the trust-neighbour feature vector for one (user_id, item_id) pair.

    The result always has exactly 2*k entries, laid out as
      [r(v1,i), t(u->v1), r(v2,i), t(u->v2), …, r(vk,i), t(u->vk)]
    where v1..vk are the user's neighbours in the order stored in `neighbors`.
    Missing ratings, missing trust edges and unused neighbour slots are
    filled with `missing`.

    Parameters
    ----------
    user_id, item_id : keys identifying the pair to featurise.
    k : number of neighbour slots; defaults to the notebook-level K.
    neighbors : mapping user -> ordered list of neighbours; defaults to `topk`.
    ratings_lookup : mapping (user, item) -> rating; defaults to `rating_map`.
    trust_lookup : mapping (trustor, trustee) -> trust; defaults to `trust_map`.
    missing : sentinel written where no value is available (default -1).

    Returns
    -------
    list of length 2*k.
    """
    # Resolve defaults at call time so that re-running the cells that define
    # the notebook-level objects is picked up without redefining this function.
    k = K if k is None else k
    neighbors = topk if neighbors is None else neighbors
    ratings_lookup = rating_map if ratings_lookup is None else ratings_lookup
    trust_lookup = trust_map if trust_lookup is None else trust_lookup

    # Truncate to k: a stale neighbour list built with a larger K must not
    # produce a longer (ragged) feature row.
    neigh = neighbors.get(user_id, [])[:k]

    feats = []
    for v in neigh:
        feats.append(ratings_lookup.get((v, item_id), missing))
        feats.append(trust_lookup.get((user_id, v), missing))

    # Pad the unused neighbour slots (two columns per slot).
    feats.extend([missing] * (2 * (k - len(neigh))))
    return feats


In [7]:
# One feature row per rated (user, item) pair, stored as float32.
X = np.array(
    list(map(make_features, ratings['user_id'], ratings['item_id'])),
    dtype=np.float32,
)

# Regression target: the rating given to each pair.
y = ratings['label'].values


In [8]:
# Hold out 20% of the rated pairs for validation; the seed fixes the shuffle.
(X_train, X_val,
 y_train, y_val) = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
    shuffle=True,
)


In [9]:
# Wrap both splits in XGBoost's native DMatrix container.
# NOTE(review): neither `dtrain` nor `dvalid` is referenced by the other cells
# visible here (the model is fitted through the scikit-learn wrapper), so this
# cell may be removable — confirm before deleting.
dtrain, dvalid = (
    xgb.DMatrix(features, label=target)
    for features, target in ((X_train, y_train), (X_val, y_val))
)


In [10]:
# Search space for the randomized hyper-parameter search.
# scipy's `uniform(loc, scale)` covers [loc, loc + scale];
# scipy's `randint(low, high)` draws integers from low..high-1 (high exclusive).
param_dist = dict(
    learning_rate=uniform(loc=0.01, scale=0.2),     # continuous on [0.01, 0.21]
    max_depth=randint(low=3, high=15),              # integers 3..14
    subsample=uniform(loc=0.5, scale=0.5),          # continuous on [0.5, 1.0]
    colsample_bytree=uniform(loc=0.5, scale=0.5),   # continuous on [0.5, 1.0]
    n_estimators=randint(low=50, high=500),         # integers 50..499
    gamma=uniform(loc=0, scale=1),                  # continuous on [0, 1]
    min_child_weight=randint(low=1, high=10),       # integers 1..9
)

# Base estimator handed to the hyper-parameter search: a squared-error
# regressor that reports RMSE as its evaluation metric.
xgb_model = xgb.XGBRegressor(
    eval_metric='rmse',
    objective='reg:squarederror',
)


In [11]:
# Randomized search: 100 sampled configurations x 5 folds = 500 fits,
# ranked by negative MSE and run on all available cores.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split only; the validation split stays unseen.
random_search.fit(X_train, y_train)

# Keep the winning estimator and its parameter set for the following cells.
best_model = random_search.best_estimator_
best_params = random_search.best_params_

print(f"Best hyperparameters found: {best_params}")


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best hyperparameters found: {'colsample_bytree': 0.9871241042672051, 'gamma': 0.9959312420303291, 'learning_rate': 0.02117423093547001, 'max_depth': 5, 'min_child_weight': 7, 'n_estimators': 270, 'subsample': 0.6754575062760393}


In [12]:
# Score the tuned model on the held-out validation split.
y_pred_val = best_model.predict(X_val)

# Goodness-of-fit metrics; RMSE is derived from MSE.
r2 = r2_score(y_val, y_pred_val)
mse = mean_squared_error(y_val, y_pred_val)
rmse = np.sqrt(mse)

print(f"R² on validation set: {r2:.4f}")
print(f"MSE on validation set: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")



R² on validation set: 0.0064
MSE on validation set: 0.8618
RMSE: 0.9283


In [13]:
# Featurise every (user, item) pair of the test set, one row per pair.
test_feats = np.vstack([
    make_features(user, item)
    for user, item in zip(test_df['user_id'], test_df['item_id'])
])

# Store the predicted rating next to its test row.
test_df['label'] = best_model.predict(test_feats)

# Number the rows 1..n in file order to form the submission id.
test_df.reset_index(drop=True, inplace=True)
test_df['id'] = test_df.index + 1

# Write only the id/label pair, without the DataFrame index.
out = test_df.loc[:, ['id', 'label']]
out.to_csv(OUT_FILE, index=False)
