In [5]:
#在这一部分，我们接上之前最好的模型结构构造推荐系统，因此分为两阶段
#第一阶段，重新训练并搭建表现最好的模型
#第二阶段，基于该模型进行推荐系统搭建
#搭建原理：穷举遍历，回归预测，高低排序，字典对应
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from math import sqrt
import pandas as pd
import numpy as np

In [4]:
#GBDT重训练
data1 = pd.read_csv("data1.csv")
data1.drop(data1.columns[0], axis=1, inplace=True)
y = data1['rating']
X = data1.drop(columns=['rating'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
scaler_means = scaler.mean_
scaler_stds = scaler.scale_
param_grid = {
    'n_estimators': [100, 250, 500, 750],
    'max_depth': [3, 4, 5]
}
model = GradientBoostingRegressor()
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)
print(f'Best parameters found: {grid_search.best_params_}')
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Best parameters found: {'max_depth': 5, 'n_estimators': 750}
Mean Squared Error: 0.8804840261712744


In [17]:
ref = pd.read_csv("merge2.csv")
def recommend_top_n(user_id, n=5):
    user_data = data1[data1['user id'] == user_id]
    
    if user_data.empty:
        return f"User ID {user_id} not found."

    user_data = user_data.drop(columns=['rating'])
    user_data_scaled = (user_data - scaler_means) / scaler_stds
    user_predictions = best_model.predict(user_data_scaled)
    user_data['predicted_rating'] = user_predictions
    user_data = user_data.drop_duplicates(subset='item id', keep='first')
    top_n_indices = user_data.nlargest(n, 'predicted_rating').index
    recommended_items = user_data.loc[top_n_indices, 'item id'].values
    recommended_scores = user_data.loc[top_n_indices, 'predicted_rating'].values
    recommended_titles = ref[ref['item id'].isin(recommended_items)].drop_duplicates(subset='item id', keep='first')['title'].values
    recommendations = [(title, score) for title, score in zip(recommended_titles, recommended_scores)]
    return recommendations

user_id = 1
recommendations = recommend_top_n(user_id, n=5)
for title, score in recommendations:
    print(f"Movie: {title}, Predicted Rating: {score:.2f}")

Movie: Empire Strikes Back, The, Predicted Rating: 4.80
Movie: Wrong Trousers, The, Predicted Rating: 4.72
Movie: Star Wars, Predicted Rating: 4.61
Movie: Wallace & Gromit: The Best of Aardman Animation, Predicted Rating: 4.60
Movie: Haunted World of Edward D. Wood Jr., The, Predicted Rating: 4.59




In [40]:
ref = pd.read_csv("merge2.csv")
user_id=1
user_data = data1[data1['user id'] == user_id]
user_features = user_data[['user id', 'age', 'gender', 'occupation', 'zip code']].drop_duplicates()
user_features

Unnamed: 0,user id,age,gender,occupation,zip code
191,1,2,1,1,70


In [49]:
item_data = data1.drop(columns=['rating', 'user id', 'age', 'gender', 'occupation', 'zip code'])
item_data['timestamp'] = item_data.groupby('item id')['timestamp'].transform('mean')
item_data = item_data.drop_duplicates(subset='item id')
item_data

Unnamed: 0,item id,timestamp,release_date,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,242,8.168142,1996,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,302,9.472222,1997,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
2,377,6.230769,1994,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,51,8.259740,1994,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
4,346,12.714286,1997,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93594,1674,16.000000,1962,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
94753,1640,13.000000,1996,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95495,1637,13.000000,1996,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95699,1630,12.000000,1994,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
user_item_data = pd.concat([user_features] * len(item_data), ignore_index=True)
user_item_data = pd.concat([user_item_data, item_data.reset_index(drop=True)], axis=1)
user_item_data_scaled = (user_item_data - scaler_means) / scaler_stds
user_item_data_scaled

Unnamed: 0,user id,age,gender,occupation,zip code,item id,timestamp,release_date,Action,Adventure,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,-1.743636,-1.280432,-1.332633,-1.179537,158.088536,47.078840,-1.377940,0.587690,-0.585088,-0.39767,...,-0.116743,-0.131834,-0.236438,-0.229482,-0.236125,-0.492519,-0.380001,-0.527399,-0.322722,-0.138278
1,-1.743636,-1.280432,-1.332633,-1.179537,158.088536,59.098815,-1.327446,0.657826,-0.585088,-0.39767,...,-0.116743,7.585319,-0.236438,-0.229482,4.235042,-0.492519,-0.380001,1.896096,-0.322722,-0.138278
2,-1.743636,-1.280432,-1.332633,-1.179537,158.088536,74.123785,-1.452956,0.447417,-0.585088,-0.39767,...,-0.116743,-0.131834,-0.236438,-0.229482,-0.236125,-0.492519,-0.380001,-0.527399,-0.322722,-0.138278
3,-1.743636,-1.280432,-1.332633,-1.179537,158.088536,8.815251,-1.374394,0.447417,-0.585088,-0.39767,...,-0.116743,-0.131834,-0.236438,-0.229482,-0.236125,2.030378,-0.380001,-0.527399,3.098639,7.231809
4,-1.743636,-1.280432,-1.332633,-1.179537,158.088536,67.913464,-1.201911,0.657826,-0.585088,-0.39767,...,-0.116743,-0.131834,-0.236438,-0.229482,-0.236125,-0.492519,-0.380001,-0.527399,-0.322722,-0.138278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1673,-1.743636,-1.280432,-1.332633,-1.179537,158.088536,333.955592,-1.074686,-1.796945,-0.585088,-0.39767,...,-0.116743,-0.131834,-0.236438,-0.229482,-0.236125,-0.492519,-0.380001,-0.527399,-0.322722,-0.138278
1674,-1.743636,-1.280432,-1.332633,-1.179537,158.088536,327.144272,-1.190848,0.587690,-0.585088,-0.39767,...,-0.116743,-0.131834,-0.236438,-0.229482,-0.236125,-0.492519,-0.380001,-0.527399,-0.322722,-0.138278
1675,-1.743636,-1.280432,-1.332633,-1.179537,158.088536,326.543274,-1.190848,0.587690,-0.585088,-0.39767,...,-0.116743,-0.131834,-0.236438,-0.229482,-0.236125,-0.492519,-0.380001,-0.527399,-0.322722,-0.138278
1676,-1.743636,-1.280432,-1.332633,-1.179537,158.088536,325.140943,-1.229568,0.447417,-0.585088,-0.39767,...,-0.116743,-0.131834,-0.236438,-0.229482,-0.236125,-0.492519,-0.380001,-0.527399,-0.322722,-0.138278


In [59]:
user_predictions = best_model.predict(user_item_data_scaled)
user_item_data['predicted_rating'] = user_predictions
top_n = 10
top_n_indices = user_item_data.nlargest(top_n, 'predicted_rating').index
recommended_items = user_item_data.loc[top_n_indices, 'item id'].values
recommended_scores = user_item_data.loc[top_n_indices, 'predicted_rating'].values
recommended_titles = ref[ref['item id'].isin(recommended_items)].drop_duplicates(subset='item id', keep='first')['title'].values
recommendations = [(title, score) for title, score in zip(recommended_titles, recommended_scores)]
for title, score in recommendations:
    print(f"Movie: {title}, Predicted Rating: {score:.2f}")

Movie: Monty Python's Life of Brian, Predicted Rating: 5.12
Movie: Close Shave, A, Predicted Rating: 4.82
Movie: Return of the Pink Panther, The, Predicted Rating: 4.82
Movie: Monty Python and the Holy Grail, Predicted Rating: 4.81
Movie: Sleeper, Predicted Rating: 4.80
Movie: Being There, Predicted Rating: 4.79
Movie: Manhattan, Predicted Rating: 4.77
Movie: Harold and Maude, Predicted Rating: 4.76
Movie: Ruling Class, The, Predicted Rating: 4.76
Movie: Private Benjamin, Predicted Rating: 4.76




In [None]:
#后续的思考修改
#很可惜，不小心把id标准化掉了，这个不应该参与训练，后续修改
#embedding的问题，可能是导致高级模型表现反而差的原因
#双塔模型，以及推荐逻辑（现在的推荐逻辑是遍历，组装新表，已经接近双塔的思想了，但是id处理有问题）
#因为item id和user id的标准化问题，导致处理不是很正确，重新标准化了测试集，可能导致数据泄露