In [3]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import uniform, randint
import numpy as np

In [4]:
ratings_df = pd.read_csv(r"train_data_movie_rate.csv")
trust_df = pd.read_csv(r"train_data_movie_trust.csv")

test_df = pd.read_csv(r"test_data.csv")

In [5]:
ratings_df

Unnamed: 0,id,user_id,item_id,label
0,1,1,1,2.0
1,2,1,2,4.0
2,3,1,3,3.5
3,4,1,4,3.0
4,5,1,5,4.0
...,...,...,...,...
34293,34294,1508,84,3.5
34294,34295,1508,17,4.0
34295,34296,1508,669,1.0
34296,34297,1508,686,2.5


In [6]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34298 entries, 0 to 34297
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       34298 non-null  int64  
 1   user_id  34298 non-null  int64  
 2   item_id  34298 non-null  int64  
 3   label    34298 non-null  float64
dtypes: float64(1), int64(3)
memory usage: 1.0 MB


In [7]:
duplicate_counts = test_df.groupby(['user_id', 'item_id']).size().reset_index(name='count')

duplicates = duplicate_counts[duplicate_counts['count'] > 1]

duplicate_pairs = duplicates[['user_id', 'item_id']]
duplicate_rows = test_df.merge(duplicate_pairs, on=['user_id', 'item_id'], how='inner')

print(duplicate_rows)

       id  user_id  item_id
0       9       26      215
1      10       26      215
2      12       29      294
3      13       29      218
4      14       29      293
..    ...      ...      ...
945  1695      320        2
946  1696      320        7
947  1699      320      236
948  1701      320       17
949  1702      320      211

[950 rows x 3 columns]


In [8]:
duplicate_counts = ratings_df.groupby(['user_id', 'item_id']).size().reset_index(name='count')

duplicates = duplicate_counts[duplicate_counts['count'] > 1]

duplicate_pairs = duplicates[['user_id', 'item_id']]
duplicate_rows = ratings_df.merge(duplicate_pairs, on=['user_id', 'item_id'], how='inner')

print(duplicate_rows)

     id  user_id  item_id  label
0  6356      308      207    3.5
1  6357      308      235    4.0
2  6359      308       12    4.0
3  6382      308      207    3.0
4  6413      308      235    1.5
5  6434      308       12    4.0


In [9]:
ratings_df = ratings_df.groupby(['user_id', 'item_id'], as_index=False)['label'].mean()

In [10]:
trustor_dict = trust_df.groupby("user_id_trustor")["user_id_trustee"].apply(set).to_dict()

trustee_dict = trust_df.groupby("user_id_trustee")["user_id_trustor"].apply(set).to_dict()

item_raters = ratings_df.groupby("item_id")["user_id"].apply(set).to_dict()

In [11]:

user_avg = ratings_df.groupby("user_id")["label"].mean().rename("user_avg_rating")
item_avg = ratings_df.groupby("item_id")["label"].mean().rename("item_avg_rating")
user_count = ratings_df.groupby("user_id")["label"].count().rename("user_count_rating")
item_count = ratings_df.groupby("item_id")["label"].count().rename("item_count_rating")
user_std = ratings_df.groupby("user_id")["label"].std().rename("user_std_rating").fillna(0)
item_std = ratings_df.groupby("item_id")["label"].std().rename("item_std_rating").fillna(0)
user_max = ratings_df.groupby("user_id")["label"].max().rename("user_max_rating")
item_max = ratings_df.groupby("item_id")["label"].max().rename("item_min_rating")
user_min = ratings_df.groupby("user_id")["label"].min().rename("user_min_rating")
item_min = ratings_df.groupby("item_id")["label"].min().rename("item_max_rating")


for name, df in [("ratings_df", ratings_df), ("test_df", test_df)]:
    df = df.set_index(["user_id", "item_id"])
    df = df.join(user_avg, on="user_id")
    df = df.join(item_avg, on="item_id")
    df = df.join(user_count, on="user_id")
    df = df.join(item_count, on="item_id")
    df = df.join(user_std, on="user_id")
    df = df.join(item_std, on="item_id")
    df = df.join(user_max, on="user_id")
    df = df.join(user_min, on="user_id")
    df = df.join(item_max, on="item_id")
    df = df.join(item_min, on="item_id")
    df = df.reset_index()
    
    if name == "ratings_df":
        ratings_df = df
    else:
        test_df = df

for df in [ratings_df, test_df]:
    df.fillna({
        "user_std_rating": 0,
        "item_std_rating": 0,
    }, inplace=True)



In [12]:
def calc_user_percentile(user_id, label, df=ratings_df):
    user_ratings = df[df["user_id"] == user_id]["label"]
    return (user_ratings < label).mean() if len(user_ratings) > 1 else 0.5

ratings_df["user_rating_percentile"] = ratings_df.apply(
    lambda row: calc_user_percentile(row["user_id"], row["label"]), axis=1)

test_df["user_rating_percentile"] = test_df.apply(
    lambda row: calc_user_percentile(row["user_id"], user_avg.get(row["user_id"], 2.5)), axis=1)

ratings_df["user_to_item_bias_ratio"] = ratings_df["user_avg_rating"] / (ratings_df["item_avg_rating"] + 1e-5)
test_df["user_to_item_bias_ratio"] = test_df["user_avg_rating"] / (test_df["item_avg_rating"] + 1e-5)

ratings_df["user_interaction_ratio"] = ratings_df["user_count_rating"] / ratings_df["item_count_rating"]
test_df["user_interaction_ratio"] = test_df["user_count_rating"] / test_df["item_count_rating"]

item_avg_count = ratings_df["item_count_rating"].mean()
ratings_df["is_item_famous"] = (ratings_df["item_count_rating"] > item_avg_count).astype(int)
test_df["is_item_famous"] = (test_df["item_count_rating"] > item_avg_count).astype(int)

item_avg_rating = ratings_df["item_avg_rating"].mean()
ratings_df["is_item_popular"] = (ratings_df["item_avg_rating"] > item_avg_rating).astype(int)
test_df["is_item_popular"] = (test_df["item_avg_rating"] > item_avg_rating).astype(int)

ratings_df["rating_diff_from_user_avg"] = ratings_df["label"] - ratings_df["user_avg_rating"]
test_df["rating_diff_from_user_avg"] = 0  

ratings_df["rating_diff_from_item_avg"] = ratings_df["label"] - ratings_df["item_avg_rating"]
test_df["rating_diff_from_item_avg"] = 0

# item_avg_count = ratings_df["item_count_rating"].mean()
# ratings_df["item_viewers_more_than_avg"] = (ratings_df["item_count_rating"] - item_avg_count)
# test_df["item_viewers_more_than_avg"] = (test_df["item_count_rating"] - item_avg_count)

# item_avg_rating = ratings_df["item_avg_rating"].mean()
# ratings_df["item_rated_more_than_Avg"] = (ratings_df["item_avg_rating"] - item_avg_rating)
# test_df["item_rated_more_than_Avg"] = (test_df["item_avg_rating"] - item_avg_rating)

user_avg_count = ratings_df["user_count_rating"].mean()
ratings_df["is_user_movie_person"] = (ratings_df["user_count_rating"] > user_avg_count).astype(int)
test_df["is_user_movie_person"] = (test_df["user_count_rating"] > user_avg_count).astype(int)

user_avg_rating = ratings_df["user_avg_rating"].mean()
ratings_df["is_user_high_rater"] = (ratings_df["user_avg_rating"] > user_avg_rating).astype(int)
test_df["is_user_high_rater"] = (test_df["user_avg_rating"] > user_avg_rating).astype(int)

# user_avg_count = ratings_df["user_count_rating"].mean()
# ratings_df["user_views_more_than_avg"] = (ratings_df["user_count_rating"] - user_avg_count)
# test_df["user_views_more_than_avg"] = (test_df["user_count_rating"] - user_avg_count)

# user_avg_rating = ratings_df["user_avg_rating"].mean()
# ratings_df["user_rate_more_than_Avg"] = (ratings_df["user_avg_rating"] - user_avg_rating)
# test_df["user_rate_more_than_Avg"] = (test_df["user_avg_rating"] - user_avg_rating)

ratings_df["is_user_strict"] = (ratings_df["user_avg_rating"] < 2.5).astype(int)
test_df["is_user_strict"] = (test_df["user_avg_rating"] < 2.5).astype(int)

ratings_df["user_bias"] = ratings_df["user_avg_rating"] - ratings_df["item_avg_rating"]
test_df["user_bias"] = test_df["user_avg_rating"] - test_df["item_avg_rating"]


In [13]:
user_strictness = ratings_df.groupby("user_id")["label"].mean().rename("strictness")

user_strictness_binary = (user_strictness >= 2.5).astype(int).rename("strictness_bin")


def avg_strict_bin(users):
    values = [user_strictness_binary.get(u) for u in users if u in user_strictness_binary]
    return sum(values) / len(values) if values else -1


def trustor_strictness(user_id):
    return avg_strict_bin(trustor_dict.get(user_id, set()))

def trustee_strictness(user_id):
    return avg_strict_bin(trustee_dict.get(user_id, set()))

def trustor_strictness_for_movie(user_id, item_id):
    trustors = trustor_dict.get(user_id, set())
    seen = item_raters.get(item_id, set())
    return avg_strict_bin(trustors & seen)

def trustee_strictness_for_movie(user_id, item_id):
    trustees = trustee_dict.get(user_id, set())
    seen = item_raters.get(item_id, set())
    return avg_strict_bin(trustees & seen)


#for df in [ratings_df, test_df]:
    # df["trustor_strict"] = df["user_id"].apply(trustor_strictness)
    # df["trustee_strict"] = df["user_id"].apply(trustee_strictness)
    # df["trustor_strict_for_movie"] = df.apply(
    #     lambda row: trustor_strictness_for_movie(row["user_id"], row["item_id"]), axis=1)
    # df["trustee_strict_for_movie"] = df.apply(
    #     lambda row: trustee_strictness_for_movie(row["user_id"], row["item_id"]), axis=1)


In [14]:
# def calc_trustee_coverage(user_id, item_id):
#     trustees = trustee_dict.get(user_id, set())
#     raters = item_raters.get(item_id, set())
#     if not trustees:
#         return 0.0
#     return len(trustees & raters) / len(trustees)

# def calc_trustor_coverage(user_id, item_id):
#     trustors = trustor_dict.get(user_id, set())
#     raters = item_raters.get(item_id, set())
#     if not trustors:
#         return 0.0
#     return len(trustors & raters) / len(trustors)


# # ratings_df
# ratings_df["trustee_coverage"] = ratings_df.apply(
#     lambda row: calc_trustee_coverage(row["user_id"], row["item_id"]), axis=1
# )
# ratings_df["trustor_coverage"] = ratings_df.apply(
#     lambda row: calc_trustor_coverage(row["user_id"], row["item_id"]), axis=1
# )

# # test_df
# test_df["trustee_coverage"] = test_df.apply(
#     lambda row: calc_trustee_coverage(row["user_id"], row["item_id"]), axis=1
# )
# test_df["trustor_coverage"] = test_df.apply(
#     lambda row: calc_trustor_coverage(row["user_id"], row["item_id"]), axis=1
# )


In [15]:
ratings_df

Unnamed: 0,user_id,item_id,label,user_avg_rating,item_avg_rating,user_count_rating,item_count_rating,user_std_rating,item_std_rating,user_max_rating,...,user_to_item_bias_ratio,user_interaction_ratio,is_item_famous,is_item_popular,rating_diff_from_user_avg,rating_diff_from_item_avg,is_user_movie_person,is_user_high_rater,is_user_strict,user_bias
0,1,1,2.0,3.416667,2.978339,12,831,0.668558,0.855957,4.0,...,1.147168,0.014440,1,0,-1.416667,-0.978339,0,1,0,0.438327
1,1,2,4.0,3.416667,3.190286,12,875,0.668558,0.830323,4.0,...,1.070956,0.013714,1,1,0.583333,0.809714,0,1,0,0.226381
2,1,3,3.5,3.416667,3.045519,12,703,0.668558,0.821318,4.0,...,1.121863,0.017070,1,1,0.083333,0.454481,0,1,0,0.371147
3,1,4,3.0,3.416667,3.192969,12,640,0.668558,0.907212,4.0,...,1.070056,0.018750,1,1,-0.416667,-0.192969,0,1,0,0.223698
4,1,5,4.0,3.416667,3.230030,12,676,0.668558,0.802293,4.0,...,1.057779,0.017751,1,1,0.583333,0.769970,0,1,0,0.186637
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34290,1508,669,1.0,2.844828,3.250000,29,4,1.018608,1.500000,4.0,...,0.875329,7.250000,0,1,-1.844828,-2.250000,0,0,0,-0.405172
34291,1508,686,2.5,2.844828,2.400000,29,5,1.018608,0.894427,4.0,...,1.185340,5.800000,0,0,-0.344828,0.100000,0,0,0,0.444828
34292,1508,693,3.5,2.844828,2.687500,29,8,1.018608,1.066955,4.0,...,1.058537,3.625000,0,0,0.655172,0.812500,0,0,0,0.157328
34293,1508,751,1.0,2.844828,1.000000,29,3,1.018608,0.500000,4.0,...,2.844799,9.666667,0,0,-1.844828,0.000000,0,0,0,1.844828


In [16]:
def trustor_count_for_movie(user_id, item_id):
    trustors = trustor_dict.get(user_id, set())
    if not trustors:
        return 0
    seen = ratings_df[(ratings_df["user_id"].isin(trustors)) & (ratings_df["item_id"] == item_id)]
    return seen["user_id"].nunique()

def trustor_std_rating_for_movie(user_id, item_id):
    trustors = trustor_dict.get(user_id, set())
    if not trustors:
        return -1 
    ratings = ratings_df[(ratings_df["user_id"].isin(trustors)) & (ratings_df["item_id"] == item_id)]["label"]
    if ratings.empty:
        return -1 
    if len(ratings) == 1:
        return 0
    return ratings.std()

def trustor_avg_rating_for_movie(user_id, item_id):
    trustors = trustor_dict.get(user_id, set())
    if not trustors:
        return -1
    ratings = ratings_df[(ratings_df["user_id"].isin(trustors)) & (ratings_df["item_id"] == item_id)]["label"]
    if ratings.empty:
        return -1
    return ratings.mean()

def trustor_max_rating_for_movie(user_id, item_id):
    trustors = trustor_dict.get(user_id, set())
    if not trustors:
        return -1  
    ratings = ratings_df[ (ratings_df["user_id"].isin(trustors)) & (ratings_df["item_id"] == item_id)]["label"]
    if ratings.empty:
        return -1
    return ratings.max() 

def trustor_min_rating_for_movie(user_id, item_id):
    trustors = trustor_dict.get(user_id, set())
    if not trustors:
        return -1 
    ratings = ratings_df[ (ratings_df["user_id"].isin(trustors)) & (ratings_df["item_id"] == item_id)]["label"]
    if ratings.empty:
        return -1
    return ratings.min() 

#ratings_df
ratings_df["trustor_count_for_movie"] = ratings_df.apply(
    lambda row: trustor_count_for_movie(row["user_id"], row["item_id"]), axis=1)

ratings_df["trustor_avg_rating_for_movie"] = ratings_df.apply(
    lambda row: trustor_avg_rating_for_movie(row["user_id"], row["item_id"]), axis=1)

ratings_df["trustor_std_rating_for_movie"] = ratings_df.apply(
    lambda row: trustor_std_rating_for_movie(row["user_id"], row["item_id"]), axis=1)

# ratings_df["trustor_max_rating_for_movie"] = ratings_df.apply(
#     lambda row: trustor_max_rating_for_movie(row["user_id"], row["item_id"]), axis=1)

# ratings_df["trustor_min_rating_for_movie"] = ratings_df.apply(
#     lambda row: trustor_min_rating_for_movie(row["user_id"], row["item_id"]), axis=1)


#test_df
test_df["trustor_count_for_movie"] = test_df.apply(
    lambda row: trustor_count_for_movie(row["user_id"], row["item_id"]), axis=1)

test_df["trustor_avg_rating_for_movie"] = test_df.apply(
    lambda row: trustor_avg_rating_for_movie(row["user_id"], row["item_id"]), axis=1)

test_df["trustor_std_rating_for_movie"] = test_df.apply(
    lambda row: trustor_std_rating_for_movie(row["user_id"], row["item_id"]), axis=1)

# test_df["trustor_max_rating_for_movie"] = test_df.apply(
#     lambda row: trustor_max_rating_for_movie(row["user_id"], row["item_id"]), axis=1)

# test_df["trustor_min_rating_for_movie"] = test_df.apply(
#     lambda row: trustor_min_rating_for_movie(row["user_id"], row["item_id"]), axis=1)

In [17]:
def trustee_count_for_movie(user_id, item_id):
    trustees = trustee_dict.get(user_id, set())
    if not trustees:
        return 0
    seen = ratings_df[(ratings_df["user_id"].isin(trustees)) & (ratings_df["item_id"] == item_id)]
    return seen["user_id"].nunique()

def trustee_std_rating_for_movie(user_id, item_id):
    trustees = trustee_dict.get(user_id, set())
    if not trustees:
        return -1
    ratings = ratings_df[(ratings_df["user_id"].isin(trustees)) & (ratings_df["item_id"] == item_id)]["label"]
    if ratings.empty:
        return -1
    if len(ratings) == 1:
        return 0
    return ratings.std()

def trustee_avg_rating_for_movie(user_id, item_id):
    trustees = trustee_dict.get(user_id, set())
    if not trustees:
        return -1
    ratings = ratings_df[(ratings_df["user_id"].isin(trustees)) & (ratings_df["item_id"] == item_id)]["label"]
    if ratings.empty:
        return -1
    return ratings.mean() 

def trustee_max_rating_for_movie(user_id, item_id):
    trustees = trustee_dict.get(user_id, set())
    if not trustees:
        return -1
    ratings = ratings_df[(ratings_df["user_id"].isin(trustees)) & (ratings_df["item_id"] == item_id)]["label"]
    if ratings.empty:
        return -1
    return ratings.max() 

def trustee_min_rating_for_movie(user_id, item_id):
    trustees = trustee_dict.get(user_id, set())
    if not trustees:
        return -1
    ratings = ratings_df[(ratings_df["user_id"].isin(trustees)) & (ratings_df["item_id"] == item_id)]["label"]
    if ratings.empty:
        return -1
    return ratings.min() 


# ratings_df
ratings_df["trustee_count_for_movie"] = ratings_df.apply(
    lambda row: trustee_count_for_movie(row["user_id"], row["item_id"]), axis=1)

ratings_df["trustee_avg_rating_for_movie"] = ratings_df.apply(
    lambda row: trustee_avg_rating_for_movie(row["user_id"], row["item_id"]), axis=1)

ratings_df["trustee_std_rating_for_movie"] = ratings_df.apply(
    lambda row: trustee_std_rating_for_movie(row["user_id"], row["item_id"]), axis=1)

# ratings_df["trustee_max_rating_for_movie"] = ratings_df.apply(
#     lambda row: trustee_max_rating_for_movie(row["user_id"], row["item_id"]), axis=1)

# ratings_df["trustee_min_rating_for_movie"] = ratings_df.apply(
#     lambda row: trustee_min_rating_for_movie(row["user_id"], row["item_id"]), axis=1)

# test_df
test_df["trustee_count_for_movie"] = test_df.apply(
    lambda row: trustee_count_for_movie(row["user_id"], row["item_id"]), axis=1)

test_df["trustee_avg_rating_for_movie"] = test_df.apply(
    lambda row: trustee_avg_rating_for_movie(row["user_id"], row["item_id"]), axis=1)

test_df["trustee_std_rating_for_movie"] = test_df.apply(
    lambda row: trustee_std_rating_for_movie(row["user_id"], row["item_id"]), axis=1)

# test_df["trustee_max_rating_for_movie"] = test_df.apply(
#     lambda row: trustee_max_rating_for_movie(row["user_id"], row["item_id"]), axis=1)

# test_df["trustee_min_rating_for_movie"] = test_df.apply(
#     lambda row: trustee_min_rating_for_movie(row["user_id"], row["item_id"]), axis=1)

In [None]:
def trustor_count(user_id, item_id):
    trustors = trustor_dict.get(user_id, set())
    if not trustors:
        return 0
    seen = ratings_df[ratings_df["user_id"].isin(trustors)]
    return seen["user_id"].nunique()

def trustor_std_rating(user_id, item_id):
    trustors = trustor_dict.get(user_id, set())
    if not trustors:
        return -1  
    ratings = ratings_df[ratings_df["user_id"].isin(trustors)]["label"]
    if ratings.empty:
        return -1
    if len(ratings) == 1:
        return 0
    return ratings.std()

def trustor_avg_rating(user_id, item_id):
    trustors = trustor_dict.get(user_id, set())
    if not trustors:
        return -1  
    ratings = ratings_df[ratings_df["user_id"].isin(trustors)]["label"]
    if ratings.empty:
        return -1
    return ratings.mean()

def trustor_max_rating(user_id, item_id):
    trustors = trustor_dict.get(user_id, set())
    if not trustors:
        return -1 
    ratings = ratings_df[ratings_df["user_id"].isin(trustors)]["label"]
    if ratings.empty:
        return -1
    return ratings.max() 

def trustor_min_rating(user_id, item_id):
    trustors = trustor_dict.get(user_id, set())
    if not trustors:
        return -1  
    ratings = ratings_df[ratings_df["user_id"].isin(trustors)]["label"]
    if ratings.empty:
        return -1
    return ratings.min() 

#ratings_df
ratings_df["trustor_count"] = ratings_df.apply(
    lambda row: trustor_count(row["user_id"], row["item_id"]), axis=1)

ratings_df["trustor_avg_rating"] = ratings_df.apply(
    lambda row: trustor_avg_rating(row["user_id"], row["item_id"]), axis=1)

# ratings_df["trustor_std_rating"] = ratings_df.apply(
#     lambda row: trustor_std_rating(row["user_id"], row["item_id"]), axis=1)

# ratings_df["trustor_max_rating"] = ratings_df.apply(
#     lambda row: trustor_max_rating(row["user_id"], row["item_id"]), axis=1)

# ratings_df["trustor_min_rating"] = ratings_df.apply(
#     lambda row: trustor_min_rating(row["user_id"], row["item_id"]), axis=1)

#test_df
test_df["trustor_count"] = test_df.apply(
    lambda row: trustor_count(row["user_id"], row["item_id"]), axis=1)

test_df["trustor_avg_rating"] = test_df.apply(
    lambda row: trustor_avg_rating(row["user_id"], row["item_id"]), axis=1)

# test_df["trustor_std_rating"] = test_df.apply(
#     lambda row: trustor_std_rating(row["user_id"], row["item_id"]), axis=1)

# test_df["trustor_max_rating"] = test_df.apply(
#     lambda row: trustor_max_rating(row["user_id"], row["item_id"]), axis=1)

# test_df["trustor_min_rating"] = test_df.apply(
#     lambda row: trustor_min_rating(row["user_id"], row["item_id"]), axis=1)

In [None]:
def trustee_count(user_id, item_id):
    trustees = trustee_dict.get(user_id, set())
    if not trustees:
        return 0
    seen = ratings_df[ratings_df["user_id"].isin(trustees)]
    return seen["user_id"].nunique()

def trustee_std_rating(user_id, item_id):
    trustees = trustee_dict.get(user_id, set())
    if not trustees:
        return -1
    ratings = ratings_df[ratings_df["user_id"].isin(trustees)]["label"]
    if ratings.empty:
        return -1
    if len(ratings) == 1:
        return 0
    return ratings.std()

def trustee_avg_rating(user_id, item_id):
    trustees = trustee_dict.get(user_id, set())
    if not trustees:
        return -1
    ratings = ratings_df[ratings_df["user_id"].isin(trustees)]["label"]
    if ratings.empty:
        return -1
    return ratings.mean() 

def trustee_max_rating(user_id, item_id):
    trustees = trustee_dict.get(user_id, set())
    if not trustees:
        return -1
    ratings = ratings_df[ratings_df["user_id"].isin(trustees)]["label"]
    if ratings.empty:
        return -1
    return ratings.max() 

def trustee_min_rating(user_id, item_id):
    trustees = trustee_dict.get(user_id, set())
    if not trustees:
        return -1
    ratings = ratings_df[ratings_df["user_id"].isin(trustees)]["label"]
    if ratings.empty:
        return -1
    return ratings.min() 


# ratings_df
ratings_df["trustee_count"] = ratings_df.apply(
    lambda row: trustee_count(row["user_id"], row["item_id"]), axis=1)

ratings_df["trustee_avg_rating"] = ratings_df.apply(
    lambda row: trustee_avg_rating(row["user_id"], row["item_id"]), axis=1)

# ratings_df["trustee_std_rating"] = ratings_df.apply(
#     lambda row: trustee_std_rating(row["user_id"], row["item_id"]), axis=1)

# ratings_df["trustee_max_rating"] = ratings_df.apply(
#     lambda row: trustee_max_rating(row["user_id"], row["item_id"]), axis=1)

# ratings_df["trustee_min_rating"] = ratings_df.apply(
#     lambda row: trustee_min_rating(row["user_id"], row["item_id"]), axis=1)

# test_df
test_df["trustee_count"] = test_df.apply(
    lambda row: trustee_count(row["user_id"], row["item_id"]), axis=1)

test_df["trustee_avg_rating"] = test_df.apply(
    lambda row: trustee_avg_rating(row["user_id"], row["item_id"]), axis=1)

# test_df["trustee_std_rating"] = test_df.apply(
#     lambda row: trustee_std_rating(row["user_id"], row["item_id"]), axis=1)

# test_df["trustee_max_rating"] = test_df.apply(
#     lambda row: trustee_max_rating(row["user_id"], row["item_id"]), axis=1)

# test_df["trustee_min_rating"] = test_df.apply(
#     lambda row: trustee_min_rating(row["user_id"], row["item_id"]), axis=1)

In [None]:
ratings_df["trustor_avg_rating_bias"] = ratings_df["user_avg_rating"] - ratings_df["trustor_avg_rating"]
test_df["trustor_avg_rating_bias"] = test_df["user_avg_rating"] - test_df["trustor_avg_rating"]

ratings_df["trustee_avg_rating_bias"] = ratings_df["user_avg_rating"] - ratings_df["trustee_avg_rating"]
test_df["trustee_avg_rating_bias"] = test_df["user_avg_rating"] - test_df["trustee_avg_rating"]

ratings_df["trustor_coverage_item_seen"] = ratings_df["trustor_count_for_movie"] / (ratings_df["trustor_count"] + 1e-5)
test_df["trustor_coverage_item_seen"] = test_df["trustor_count_for_movie"] / (test_df["trustor_count"] + 1e-5)

ratings_df["trustee_coverage_item_seen"] = ratings_df["trustee_count_for_movie"] / (ratings_df["trustee_count"] + 1e-5)
test_df["trustee_coverage_item_seen"] = test_df["trustee_count_for_movie"] / (test_df["trustee_count"] + 1e-5)

ratings_df["user_weighted_trust"] = ratings_df["trustor_count_for_movie"] * ratings_df["user_avg_rating"]
test_df["user_weighted_trust"] = test_df["trustor_count_for_movie"] * test_df["user_avg_rating"]

ratings_df["user_avg_similar_item_rating"] = ratings_df.groupby("user_id")["item_avg_rating"].transform("mean")
test_df["user_avg_similar_item_rating"] = test_df.groupby("item_id")["user_avg_rating"].transform("mean")

ratings_df["item_avg_active_user_interaction"] = ratings_df.groupby("user_id")["item_avg_rating"].transform("mean")
test_df["item_avg_active_user_interaction"] = test_df.groupby("item_id")["user_avg_rating"].transform("mean")

In [None]:
ratings_df

Unnamed: 0,user_id,item_id,label,user_avg_rating,item_avg_rating,user_count_rating,item_count_rating,user_std_rating,item_std_rating,user_max_rating,...,trustor_avg_rating,trustee_count,trustee_avg_rating,trustor_avg_rating_bias,trustee_avg_rating_bias,trustor_coverage_item_seen,trustee_coverage_item_seen,user_weighted_trust,user_avg_similar_item_rating,item_avg_active_user_interaction
0,1,1,2.0,3.416667,2.978339,12,831,0.668558,0.855957,4.0,...,-1.000000,0,-1.00000,4.416667,4.416667,0.000000,0.000000,0.000000,3.108422,3.108422
1,1,2,4.0,3.416667,3.190286,12,875,0.668558,0.830323,4.0,...,-1.000000,0,-1.00000,4.416667,4.416667,0.000000,0.000000,0.000000,3.108422,3.108422
2,1,3,3.5,3.416667,3.045519,12,703,0.668558,0.821318,4.0,...,-1.000000,0,-1.00000,4.416667,4.416667,0.000000,0.000000,0.000000,3.108422,3.108422
3,1,4,3.0,3.416667,3.192969,12,640,0.668558,0.907212,4.0,...,-1.000000,0,-1.00000,4.416667,4.416667,0.000000,0.000000,0.000000,3.108422,3.108422
4,1,5,4.0,3.416667,3.230030,12,676,0.668558,0.802293,4.0,...,-1.000000,0,-1.00000,4.416667,4.416667,0.000000,0.000000,0.000000,3.108422,3.108422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34290,1508,669,1.0,2.844828,3.250000,29,4,1.018608,1.500000,4.0,...,2.738872,2,2.72381,0.105955,0.121018,0.000000,0.000000,0.000000,2.955034,2.955034
34291,1508,686,2.5,2.844828,2.400000,29,5,1.018608,0.894427,4.0,...,2.738872,2,2.72381,0.105955,0.121018,0.333332,0.000000,2.844828,2.955034,2.955034
34292,1508,693,3.5,2.844828,2.687500,29,8,1.018608,1.066955,4.0,...,2.738872,2,2.72381,0.105955,0.121018,0.333332,0.000000,2.844828,2.955034,2.955034
34293,1508,751,1.0,2.844828,1.000000,29,3,1.018608,0.500000,4.0,...,2.738872,2,2.72381,0.105955,0.121018,0.333332,0.499998,2.844828,2.955034,2.955034


In [None]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34295 entries, 0 to 34294
Data columns (total 41 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   user_id                           34295 non-null  int64  
 1   item_id                           34295 non-null  int64  
 2   label                             34295 non-null  float64
 3   user_avg_rating                   34295 non-null  float64
 4   item_avg_rating                   34295 non-null  float64
 5   user_count_rating                 34295 non-null  int64  
 6   item_count_rating                 34295 non-null  int64  
 7   user_std_rating                   34295 non-null  float64
 8   item_std_rating                   34295 non-null  float64
 9   user_max_rating                   34295 non-null  float64
 10  user_min_rating                   34295 non-null  float64
 11  item_min_rating                   34295 non-null  float64
 12  item

In [None]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1716 entries, 0 to 1715
Data columns (total 41 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   user_id                           1716 non-null   int64  
 1   item_id                           1716 non-null   int64  
 2   id                                1716 non-null   int64  
 3   user_avg_rating                   1716 non-null   float64
 4   item_avg_rating                   1716 non-null   float64
 5   user_count_rating                 1716 non-null   int64  
 6   item_count_rating                 1716 non-null   int64  
 7   user_std_rating                   1716 non-null   float64
 8   item_std_rating                   1716 non-null   float64
 9   user_max_rating                   1716 non-null   float64
 10  user_min_rating                   1716 non-null   float64
 11  item_min_rating                   1716 non-null   float64
 12  item_m

In [None]:

corr_matrix = ratings_df.corr(numeric_only=True)

high_corr = (
    corr_matrix.where(~np.eye(corr_matrix.shape[0], dtype=bool)) 
    .stack()
    .reset_index()
)
high_corr.columns = ["feature_1", "feature_2", "correlation"]
high_corr = high_corr[high_corr["correlation"].abs() > 0.95]


high_corr["pair"] = high_corr.apply(lambda row: tuple(sorted([row["feature_1"], row["feature_2"]])), axis=1)
high_corr = high_corr.drop_duplicates(subset="pair").drop(columns="pair")


features_to_drop = set(high_corr["feature_2"])
ratings_df = ratings_df.drop(columns=features_to_drop)


print("Features dropped due to high correlation:")
print(features_to_drop)

high_corr



Features dropped due to high correlation:
{'trustee_avg_rating_bias', 'user_weighted_trust', 'item_avg_active_user_interaction', 'trustor_avg_rating_bias'}


Unnamed: 0,feature_1,feature_2,correlation
997,trustor_count_for_movie,user_weighted_trust,0.992813
1273,trustor_avg_rating,trustor_avg_rating_bias,-0.971962
1354,trustee_avg_rating,trustee_avg_rating_bias,-0.972717
1599,user_avg_similar_item_rating,item_avg_active_user_interaction,1.0


In [None]:
X = ratings_df.drop(columns=["label", "user_id", "item_id"])
y = ratings_df["label"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

param_dist = {
    "n_estimators": randint(200, 400),
    "max_depth": randint(5, 15),
    "learning_rate": uniform(0.01, 0.2),
    "subsample": uniform(0.7, 0.3),
    "colsample_bytree": uniform(0.7, 0.3)
}

xgb = XGBRegressor(random_state=42)
search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=50,
    scoring="neg_mean_squared_error",
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

search.fit(X_train, y_train)
best_model_xgb = search.best_estimator_

y_pred = best_model_xgb.predict(X_val)

mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R²:", r2)

MSE: 0.00010323451591317836
RMSE: 0.010160438765780657
MAE: 0.0025007643336751994
R²: 0.999874547701817


In [None]:
# X = ratings_df.drop(columns=["label", "user_id", "item_id"])
# y = ratings_df["label"]

# scaler = MinMaxScaler()
# X_scaled = scaler.fit_transform(X)

# X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# knn = KNeighborsRegressor()
# param_grid = {
#     "n_neighbors": [3, 5, 7, 10, 15, 20],
#     "weights": ["uniform", "distance"],
#     "metric": ["euclidean", "manhattan"]
# }

# grid = GridSearchCV(
#     estimator=knn,
#     param_grid=param_grid,
#     scoring="neg_mean_squared_error",
#     cv=5,
#     verbose=1,
#     n_jobs=-1
# )

# grid.fit(X_train, y_train)

# best_model_knn = grid.best_estimator_

# y_pred = best_model_knn.predict(X_val)
# print("MSE:", mean_squared_error(y_val, y_pred))
# print("RMSE:", np.sqrt(mean_squared_error(y_val, y_pred)))
# print("MAE:", mean_absolute_error(y_val, y_pred))
# print("R²:", r2_score(y_val, y_pred))

In [None]:
# feature_columns = X.columns.tolist()
# X_test = scaler.transform(test_df[feature_columns])

# test_df["label"] = best_model_knn.predict(X_test)

# submission = test_df[["label"]].copy()
# submission["id"] = range(1, len(submission) + 1)
# submission = submission[["id", "label"]]
# submission.to_csv("knn_submission_.csv", index=False)


In [None]:
feature_columns = X.columns.tolist()
X_test = test_df[feature_columns]

test_df["label"] = best_model_xgb.predict(X_test)

submission = test_df[["label"]].copy()
submission["id"] = range(1, len(submission) + 1)  
submission = submission[["id", "label"]]
submission.to_csv("xgb_submission.csv", index=False)