In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD

In [2]:
submission_sample = pd.read_csv('submission_sample.csv')

In [3]:
submission_sample.head()

Unnamed: 0,user_id,item_id
0,0,0 1 2 3 4 5 6 7 8 9
1,1,0 1 2 3 4 5 6 7 8 9
2,2,0 1 2 3 4 5 6 7 8 9
3,3,0 1 2 3 4 5 6 7 8 9
4,4,0 1 2 3 4 5 6 7 8 9


In [4]:
events = pd.read_csv('events.csv')

In [5]:
events.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,1505,4,0
1,0,3669,3,1
2,0,584,4,2
3,0,3390,3,3
4,0,2885,4,4


In [6]:
items = pd.read_csv('item_features.csv')

In [7]:
items.head()

Unnamed: 0,item_id,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17
0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
4,4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [8]:
users = pd.read_csv('user_features.csv')

In [9]:
users.head()

Unnamed: 0,user_id,gender,age
0,4855,F,1
1,4065,M,56
2,3331,M,25
3,5373,M,45
4,2032,M,25


In [10]:
events.isna().sum()

Unnamed: 0,0
user_id,0
item_id,0
rating,0
timestamp,0


In [11]:
users.isna().sum()

Unnamed: 0,0
user_id,0
gender,0
age,0


In [12]:
items.isna().sum()

Unnamed: 0,0
item_id,0
genre_0,0
genre_1,0
genre_2,0
genre_3,0
genre_4,0
genre_5,0
genre_6,0
genre_7,0
genre_8,0


In [13]:
events.rating.unique()

array([4, 3, 5, 2, 1])

In [14]:
add_features_to_user = events.groupby('user_id').agg(
    average_rating = ('rating', 'mean'),
    num_rated=('rating', 'count'),
    high_rating_ratio=('rating', lambda x: (x >= 4).mean()),
    max_rating=('rating', 'max'),
    min_rating=('rating', 'min'),
).reset_index()

In [15]:
user_features = pd.merge(users, add_features_to_user, on='user_id', how='left')

In [16]:
user_features.head()

Unnamed: 0,user_id,gender,age,average_rating,num_rated,high_rating_ratio,max_rating,min_rating
0,4855,F,1,4.162791,43,0.813953,5,3
1,4065,M,56,3.677966,118,0.559322,5,1
2,3331,M,25,3.933333,45,0.755556,5,1
3,5373,M,45,4.176471,17,0.823529,5,1
4,2032,M,25,3.169591,171,0.421053,5,1


In [17]:
user_features.age.unique()

array([ 1, 56, 25, 45, 50, 35, 18])

In [18]:
user_features['age_group'] = pd.cut(user_features['age'], bins=[0, 18, 35, 50, 100],
                                labels=['1-18', '19-35', '36-50', '51+'])

In [19]:
user_features.head()

Unnamed: 0,user_id,gender,age,average_rating,num_rated,high_rating_ratio,max_rating,min_rating,age_group
0,4855,F,1,4.162791,43,0.813953,5,3,1-18
1,4065,M,56,3.677966,118,0.559322,5,1,51+
2,3331,M,25,3.933333,45,0.755556,5,1,19-35
3,5373,M,45,4.176471,17,0.823529,5,1,36-50
4,2032,M,25,3.169591,171,0.421053,5,1,19-35


In [20]:
genre_count = len(items.columns) - 1
genre_count

18

In [21]:
genre_interaction = events.merge(items, on='item_id', how='left')
genre_interaction = genre_interaction.groupby(['user_id']).agg(
    **{f'genre_interaction_{i}_count': ('genre_' + str(i), 'sum') for i in range(genre_count)}
).reset_index()

In [22]:
genre_interaction.head()

Unnamed: 0,user_id,genre_interaction_0_count,genre_interaction_1_count,genre_interaction_2_count,genre_interaction_3_count,genre_interaction_4_count,genre_interaction_5_count,genre_interaction_6_count,genre_interaction_7_count,genre_interaction_8_count,genre_interaction_9_count,genre_interaction_10_count,genre_interaction_11_count,genre_interaction_12_count,genre_interaction_13_count,genre_interaction_14_count,genre_interaction_15_count,genre_interaction_16_count,genre_interaction_17_count
0,0,95,61,26,43,105,20,0,71,19,4,22,18,13,54,47,65,17,4
1,1,85,34,7,9,68,32,3,69,7,5,32,5,9,24,61,105,14,5
2,2,83,41,8,8,35,11,1,30,4,1,11,3,4,15,42,44,9,2
3,3,136,62,9,16,84,17,0,30,9,0,18,4,6,16,76,66,13,4
4,4,43,12,4,4,39,10,0,36,1,0,2,3,2,13,19,28,6,3


In [23]:
user_features = genre_interaction.merge(user_features, on='user_id', how='left')

In [24]:
user_features.head()

Unnamed: 0,user_id,genre_interaction_0_count,genre_interaction_1_count,genre_interaction_2_count,genre_interaction_3_count,genre_interaction_4_count,genre_interaction_5_count,genre_interaction_6_count,genre_interaction_7_count,genre_interaction_8_count,...,genre_interaction_16_count,genre_interaction_17_count,gender,age,average_rating,num_rated,high_rating_ratio,max_rating,min_rating,age_group
0,0,95,61,26,43,105,20,0,71,19,...,17,4,M,35,3.979094,287,0.735192,5,1,19-35
1,1,85,34,7,9,68,32,3,69,7,...,14,5,M,18,3.64751,261,0.701149,5,1,1-18
2,2,83,41,8,8,35,11,1,30,4,...,9,2,M,25,3.797203,143,0.72028,5,1,19-35
3,3,136,62,9,16,84,17,0,30,9,...,13,4,M,18,3.350649,231,0.450216,5,1,1-18
4,4,43,12,4,4,39,10,0,36,1,...,6,3,M,18,4.046729,107,0.728972,5,2,1-18


In [25]:
item_features = items.copy()

In [26]:
item_features['num_genres'] = item_features.iloc[:, 1:].sum(axis=1)
item_features['average_rating'] = events.groupby('item_id')['rating'].mean()

In [27]:
item_features.head()

Unnamed: 0,item_id,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,...,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,num_genres,average_rating
0,0,0,1,0,1,1,0,0,0,1,...,0,0,0,1,0,0,0,0,5,2.395522
1,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,3.285714
2,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,3.655963
3,3,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,2,3.966667
4,4,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,3.590909


In [28]:
user_item_matrix = events.pivot(index='user_id', columns='item_id', values='rating').fillna(0)

In [29]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(user_item_matrix)

In [30]:
def get_knn_recommendations(user_id, user_item_matrix, model_knn, n_recommendations=10):
    user_index = user_item_matrix.index.get_loc(user_id)
    distances, indices = model_knn.kneighbors(user_item_matrix.iloc[user_index, :].values.reshape(1, -1), n_neighbors=11)
    similar_users_indices = indices.flatten()[1:]
    similar_users_ratings = user_item_matrix.iloc[similar_users_indices]
    recommendations = similar_users_ratings.sum(axis=0)

    already_rated = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index

    recommendations = recommendations.drop(already_rated)
    top_recommendations = recommendations.nlargest(n_recommendations).index.tolist()

    return top_recommendations

In [31]:
def get_content_based_recommendations(user_id, user_item_matrix, item_data, n_recommendations=10):
    user_interactions = user_item_matrix.loc[user_id]

    rated_items = user_interactions[user_interactions > 0].index
    rated_genres = item_data[item_data['item_id'].isin(rated_items)].iloc[:, 1:-1].values

    if rated_genres.shape[0] == 0:
        return []

    user_genre_vector = rated_genres.mean(axis=0).reshape(1, -1)

    genre_data = item_data.iloc[:, 1:-1].values
    genre_similarity = cosine_similarity(user_genre_vector, genre_data)

    recommendations = pd.Series(genre_similarity.flatten(), index=item_data['item_id'])
    already_rated = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index
    recommendations = recommendations.drop(already_rated)
    top_recommendations = recommendations.nlargest(n_recommendations).index.tolist()

    return top_recommendations

In [32]:
recommendations = {}

for user_id in user_item_matrix.index:
    knn_recs = get_knn_recommendations(user_id, user_item_matrix, model_knn)
    content_recs = get_content_based_recommendations(user_id, user_item_matrix, item_features)
    combined_recs = list(set(knn_recs + content_recs))
    recommendations[user_id] = combined_recs[:10]

In [39]:
results = [(user_id, ' '.join(map(str, recs))) for user_id, recs in recommendations.items()]

In [40]:
recommendations_df = pd.DataFrame(results, columns=['user_id', 'item_id'])

In [41]:
recommendations_df

Unnamed: 0,user_id,item_id
0,0,1543 1551 1811 3349 2968 2602 3371 2350 1454 2480
1,1,1548 1039 2960 1551 1811 1044 36 680 169 1067
2,2,640 1039 1044 2968 3358 2210 1956 293 3238 1831
3,3,2328 3358 799 2342 52 183 3002 3005 3390 3529
4,4,1030 146 3350 1814 1560 2076 3358 37 2862 942
...,...,...
6035,6035,515 1044 3349 2968 3358 680 1067 2993 563 1855
6036,6036,2688 1030 1039 146 1560 1304 796 36 2862 942
6037,6037,2688 772 1030 2828 146 1304 2603 942 3638 573
6038,6038,640 0 1543 1808 146 2067 20 2221 942 1583


In [42]:
recommendations_df.to_csv('recommendations.csv', index=False)