In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error



In [2]:
RATING_FILE = 'train_data_movie_rate.csv'
TRUST_FILE  = 'train_data_movie_trust.csv'
TEST_FILE   = 'test_data.csv'  
OUT_FILE    = 'predictions.csv'


In [3]:
ratings = pd.read_csv(RATING_FILE)
trust   = pd.read_csv(TRUST_FILE) 
test_df = pd.read_csv(TEST_FILE)   


In [4]:
ratings.info

<bound method DataFrame.info of           id  user_id  item_id  label
0          1        1        1    2.0
1          2        1        2    4.0
2          3        1        3    3.5
3          4        1        4    3.0
4          5        1        5    4.0
...      ...      ...      ...    ...
34293  34294     1508       84    3.5
34294  34295     1508       17    4.0
34295  34296     1508      669    1.0
34296  34297     1508      686    2.5
34297  34298     1508      806    3.5

[34298 rows x 4 columns]>

In [5]:
ratings = ratings[~ratings[["user_id", "item_id"]].duplicated()]

In [6]:
rating_matrix = ratings.pivot(index="user_id", columns="item_id", values="label").fillna(-1)

In [7]:
rating_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,2062,2063,2064,2065,2066,2067,2068,2069,2070,2071
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,4.0,3.5,3.0,4.0,3.5,3.5,3.0,2.5,4.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,-1.0,-1.0,-1.0,-1.0,-1.0,0.5,-1.0,-1.0,-1.0,3.5,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,3.0,-1.0,2.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
6,-1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1504,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.5,-1.0,-1.0,3.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3.0
1505,4.0,3.0,2.0,3.0,4.0,-1.0,4.0,3.0,4.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1506,-1.0,-1.0,-1.0,1.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1507,-1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [8]:
trust_matrix = np.full((1508, 1508), 0, dtype=np.float64)

for _, row in trust.iterrows():
    i = row['user_id_trustor'] - 1
    j = row['user_id_trustee'] - 1
    if(i >= 1508 or j >= 1508):
        continue
    trust_matrix[i, j] = 1
    
np.fill_diagonal(trust_matrix, 1)

trust_matrix = pd.DataFrame(trust_matrix[:1508, :1508], index=range(1, 1509), columns=range(1, 1509))


In [None]:
def make_features(x):
    user_id = x['user_id']
    item_id = x['item_id']
    
    t_full = trust_matrix.loc[user_id].values.flatten()

    r_series = rating_matrix[item_id].copy()
    y = r_series.loc[user_id]  

    r_series.loc[user_id] = -1  
    r_series = r_series.fillna(-1)

    r = r_series.values
    t = t_full[r_series.index.to_numpy() - 1]  

    valid_mask = (r != -1) & (t != 0)
    if np.sum(t[valid_mask]) > 0:
        weighted_avg_rating = np.dot(r[valid_mask], t[valid_mask]) / np.sum(t[valid_mask])
    else:
        weighted_avg_rating = -1

    stats = np.array([t.mean(), t.std(), weighted_avg_rating])

    return np.concatenate([r, t, stats], axis=0), y


In [18]:
xy = ratings.apply(make_features, axis=1)
X, y = zip(*xy)


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42, shuffle=True)


In [20]:
SEED = 42

xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=150, scale_pos_weight=1, subsample=0.8, missing=-1)
xgb_model.fit(X_train, y_train)


In [22]:
test_feats = test_df.apply(lambda row: make_features(row)[0], axis=1)
X_test = np.vstack(test_feats)

test_df['label'] = xgb_model.predict(X_test)

test_df.reset_index(drop=True, inplace=True)
test_df['id'] = test_df.index + 1
out = test_df[['id', 'label']]
out.to_csv(OUT_FILE, index=False)
