In [1]:
import pandas as pd
import numpy as np

import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds

from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv(r"D:\Рабочий стол\Мэг_питон\ML\База данных\ratings_df_sample_2.csv")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,54,2,3.0,974918176
1,54,32,5.0,974836809
2,54,47,4.0,974837760
3,54,50,4.0,974837760
4,54,223,5.0,974840217


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040099 entries, 0 to 6040098
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 184.3 MB


In [4]:
df.isna().sum()
# пропусков не обнаружено

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [5]:
# новый целевой признак «факт просмотра фильма пользователем», равный 1 для всех
df['view_fact'] = 1
df.sample(7)

Unnamed: 0,userId,movieId,rating,timestamp,view_fact
1364865,56687,48780,4.0,1232035816,1
3773537,107009,1278,3.0,975370373,1
2262777,94357,1408,4.5,1123936700,1
65056,2558,1219,4.0,1263841557,1
1597742,66825,2959,5.0,1107863583,1
2184754,91080,1405,3.5,1167162642,1
636472,25431,6807,5.0,1221011389,1


In [6]:
%%time
# масштабируем идентификаторы фильмов,
# чтобы они начинались с 0 и заканчивались на n_movies-1
movies_values = df['movieId'].unique()
df['movieId'] = df['movieId'].apply(lambda f: np.where(movies_values == f)[0][0])

CPU times: total: 51.7 s
Wall time: 54.5 s


In [7]:
# масштабируем идентификаторы пользователей (от 0 до n_movies-1)
users_values = df['userId'].unique()
df['userId'] = df['userId'].apply(lambda f: np.where(users_values == f)[0][0])

In [8]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,view_fact
0,0,0,3.0,974918176,1
1,0,1,5.0,974836809,1
2,0,2,4.0,974837760,1
3,0,3,4.0,974837760,1
4,0,4,5.0,974840217,1


2. Генерация пар с нулевыми значениями:

In [9]:
users = df['userId'].unique()
movies = df['movieId'].unique()

In [10]:
# генерируем случайные пары пользователь*фильм
random_pairs = []
for i in range (6040099*2):
    pair = (np.random.choice(users), np.random.choice(movies))
    random_pairs.append(pair)

In [11]:
random_pairs = pd.DataFrame(random_pairs)
random_pairs.columns=['userId', 'movieId']
random_pairs.head()

Unnamed: 0,userId,movieId
0,2175,314
1,2118,79
2,7581,912
3,15069,731
4,8959,18


In [12]:
random_pairs.shape

(12080198, 2)

In [13]:
random_pairs.duplicated().sum()

3014748

In [14]:
random_pairs.drop_duplicates(inplace=True)

In [15]:
random_pairs.shape

(9065450, 2)

In [16]:
pairs_1 = df[['userId', 'movieId']].copy()

In [17]:
random_pairs = set(tuple(x) for x in random_pairs.values)
pairs_1 = set(tuple(x) for x in pairs_1.values)

In [18]:
random_pairs_0 = pd.DataFrame(random_pairs.difference(pairs_1))
# возвращаем пары, кот. нет в датафрейме с view_fact=1

In [19]:
random_pairs_0.shape

(6327402, 2)

In [20]:
random_pairs_0 = random_pairs_0.sample(6040099)
random_pairs_0.columns= ['userId', 'movieId']
random_pairs_0 

Unnamed: 0,userId,movieId
1359668,19540,524
1205751,16546,802
5753317,7981,730
5888129,11522,140
609068,6276,30
...,...,...
4335453,18066,819
1355215,13302,111
1573299,3117,142
5685308,2470,625


In [21]:
df = pd.concat([df, random_pairs_0], ignore_index=True)

In [22]:
df[['userId', 'movieId']].duplicated().sum()
# дубликатов нет

0

In [23]:
df.shape

(12080198, 5)

In [24]:
df = df.fillna(0) # заполняем пропуски и view_fact нулями 

In [25]:
df.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp,view_fact
6864217,18704,423,0.0,0.0,0.0
9636997,7404,938,0.0,0.0,0.0
969157,2804,317,3.5,1137602000.0,1.0
8504139,16156,49,0.0,0.0,0.0
431031,1253,529,0.5,1365912000.0,1.0


4. Pазделение данных на обучающую и тестовую части

In [26]:
train_data, test_data = train_test_split(df, test_size=0.01, random_state=42)

In [27]:
target_train = train_data['view_fact']
features_train = train_data.drop(['view_fact'], axis=1)
target_test = test_data['view_fact']
features_test = test_data.drop(['view_fact'], axis=1)

5. Создание простой модели

In [28]:
def dummy_model(features):
    return [np.random.random() for i in range(len(features))]

In [29]:
dummy_predictions = dummy_model(features_test)
print("AUC-ROC простой модели: %.3f"% roc_auc_score(target_test, dummy_predictions))

AUC-ROC простой модели: 0.499


AUC-ROC случайной модели равна 0.5.

6. Алгоритм коллаборативной фильтрации

### User-based

In [30]:
n_users = train_data['userId'].nunique()
n_movies = train_data['movieId'].nunique()
n_users, n_movies

(20000, 1000)

In [31]:
# формируем матрицу user-item
train_data_matrix = np.array(pd.pivot_table(train_data,
               index = 'userId',
               columns = 'movieId',
               values = 'view_fact', fill_value=0))

In [32]:
train_data_matrix.shape

(20000, 1000)

In [33]:
train_data_matrix

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [34]:
# считаем попарное косинусное расстояние для пользователей (строк матрицы)
user_similarity = cosine_distances(train_data_matrix)

In [35]:
# считаем попарное косинусное расстояние для фильмов (столбцов матрицы)
movie_similarity = cosine_distances(train_data_matrix.T)

In [36]:
user_similarity

array([[0.        , 0.46999486, 0.43312502, ..., 0.86898606, 0.99594839,
        0.9428738 ],
       [0.46999486, 0.        , 0.39631106, ..., 0.74610396, 0.77258523,
        0.89907003],
       [0.43312502, 0.39631106, 0.        , ..., 0.80626137, 0.75734966,
        0.88891936],
       ...,
       [0.86898606, 0.74610396, 0.80626137, ..., 0.        , 0.67107049,
        0.7451764 ],
       [0.99594839, 0.77258523, 0.75734966, ..., 0.67107049, 0.        ,
        0.78722869],
       [0.9428738 , 0.89907003, 0.88891936, ..., 0.7451764 , 0.78722869,
        0.        ]])

In [37]:
movie_similarity

array([[0.        , 0.39217953, 0.38115528, ..., 0.75528843, 0.59046783,
        0.7806608 ],
       [0.39217953, 0.        , 0.21273945, ..., 0.66536621, 0.53116134,
        0.68171548],
       [0.38115528, 0.21273945, 0.        , ..., 0.67375632, 0.48847154,
        0.72370895],
       ...,
       [0.75528843, 0.66536621, 0.67375632, ..., 0.        , 0.77905991,
        0.64830551],
       [0.59046783, 0.53116134, 0.48847154, ..., 0.77905991, 0.        ,
        0.84429621],
       [0.7806608 , 0.68171548, 0.72370895, ..., 0.64830551, 0.84429621,
        0.        ]])

In [38]:
user_similarity.shape, movie_similarity.shape

((20000, 20000), (1000, 1000))

In [39]:
# Для каждого пользователя находим топ 10 ближайших соседей, исключая себя самого
top=10
top_similar_users = []
for i in range(n_users):
    neighbors = (user_similarity[i]).argsort()[1:top + 1]
    top_similar_users.append(
        train_data_matrix[neighbors]
    )
top_similar_users = np.array(top_similar_users)

In [40]:
top_similar_users.shape

(20000, 10, 1000)

In [41]:
predicted_view_fact_user_based = top_similar_users.mean(1) 
#создаем матрицу предсказаний используя среднее знач просмотров топ-10

In [42]:
predicted_view_fact_user_based.shape

(20000, 1000)

In [43]:
test_data['predict_user_based'] = test_data.apply(
    lambda f: (predicted_view_fact_user_based[int(f['userId']),
     int(f['movieId'])]), axis = 1)

In [44]:
test_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,view_fact,predict_user_based
3120342,8984,62,4.5,1075840000.0,1.0,1.0
6066966,14190,177,0.0,0.0,0.0,0.5
6186099,9716,252,0.0,0.0,0.0,1.0
1619138,4675,759,3.5,1234074000.0,1.0,0.6
5162803,16085,325,3.0,984794400.0,1.0,1.0


In [45]:
prediction = np.where(test_data['predict_user_based'] >= 0.5, 1, 0)
print("AUC-ROC user-based_топ10: %.3f"% roc_auc_score(test_data['view_fact'], 
prediction))

AUC-ROC user-based_топ10: 0.776


In [46]:
top=5
top_similar_users_5 = []
for i in range(n_users):
    neighbors = (user_similarity[i]).argsort()[1:top + 1]
    top_similar_users_5.append(
        train_data_matrix[neighbors]
    )
top_similar_users_5 = np.array(top_similar_users_5)


predicted_view_fact_user_based_5 = top_similar_users_5.mean(1) 

test_data['predict_user_based_5'] = test_data.apply(
    lambda f: (predicted_view_fact_user_based_5[int(f['userId']),
     int(f['movieId'])]), axis = 1)


prediction_2 = np.where(test_data['predict_user_based_5'] >= 0.5, 1, 0)
print("AUC-ROC user-based_топ5: %.3f"% roc_auc_score(test_data['view_fact'], 
prediction_2))

AUC-ROC user-based_топ5: 0.773


In [47]:
test_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,view_fact,predict_user_based,predict_user_based_5
3120342,8984,62,4.5,1075840000.0,1.0,1.0,1.0
6066966,14190,177,0.0,0.0,0.0,0.5,0.6
6186099,9716,252,0.0,0.0,0.0,1.0,1.0
1619138,4675,759,3.5,1234074000.0,1.0,0.6,0.8
5162803,16085,325,3.0,984794400.0,1.0,1.0,1.0


In [49]:
top=25
top_similar_users_25 = []
for i in range(n_users):
    neighbors = (user_similarity[i]).argsort()[1:top + 1]
    top_similar_users_25.append(
        train_data_matrix[neighbors]
    )
top_similar_users_25 = np.array(top_similar_users_25)


predicted_view_fact_user_based_25 = top_similar_users_25.mean(1) 

test_data['predict_user_based_25'] = test_data.apply(
    lambda f: (predicted_view_fact_user_based_25[int(f['userId']), 
    int(f['movieId'])]), axis = 1)


prediction_3 = np.where(test_data['predict_user_based_25'] >= 0.5, 1, 0)
print("AUC-ROC user-based_топ25: %.3f"% roc_auc_score(test_data['view_fact'], 
prediction_3))

AUC-ROC user-based_топ25: 0.781


Качество user_based улучшается при увеличении количества похожих пользователей

### item-based

Для каждого фильма находим топ 10 ближайших соседей, исключая себя 

In [50]:
top=10
top_similar_movie = []
for i in range(n_movies):
    neighbors = (movie_similarity[i]).argsort()[1:top + 1]
    top_similar_movie.append(
        train_data_matrix.T[neighbors]
    )
    
top_similar_movie = np.array(top_similar_movie)

In [51]:
top_similar_movie.shape

(1000, 10, 20000)

In [52]:
predicted_view_fact_movie_based = top_similar_movie.mean(1).T

In [53]:
predicted_view_fact_movie_based.shape

(20000, 1000)

In [54]:
test_data['predict_item_based'] = test_data.apply(
    lambda f: (predicted_view_fact_movie_based[int(f['userId']), 
    int(f['movieId'])]), axis = 1)

In [55]:
test_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,view_fact,predict_user_based,predict_user_based_5,predict_user_based_20,predict_user_based_25,predict_item_based
3120342,8984,62,4.5,1075840000.0,1.0,1.0,1.0,1.0,0.96,1.0
6066966,14190,177,0.0,0.0,0.0,0.5,0.6,0.6,0.64,0.5
6186099,9716,252,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.8
1619138,4675,759,3.5,1234074000.0,1.0,0.6,0.8,0.7,0.64,0.9
5162803,16085,325,3.0,984794400.0,1.0,1.0,1.0,0.9,0.88,0.8


In [56]:
prediction = np.where(test_data['predict_item_based'] >= 0.5, 1, 0)
print("AUC-ROC item-based_топ10: %.3f"% roc_auc_score(test_data['view_fact'], 
        prediction))

AUC-ROC item-based_топ10: 0.746


In [None]:
top=25
top_similar_movie_25 = []
for i in range(n_movies):
    neighbors = (movie_similarity[i]).argsort()[1:top + 1]
    top_similar_movie_25.append(
        train_data_matrix.T[neighbors]
    )
    
top_similar_movie_25 = np.array(top_similar_movie_25)

predicted_view_fact_movie_based_25 = top_similar_movie_25.mean(1).T

test_data['predict_item_based_25'] = test_data.apply(
    lambda f: (predicted_view_fact_movie_based_25[int(f['userId']), 
    int(f['movieId'])]), axis = 1)

In [None]:
prediction_2i = np.where(test_data['predict_item_based_25'] >= 0.5, 1, 0)
print("AUC-ROC item-based_топ25: %.3f"% roc_auc_score(test_data['view_fact'], 
prediction_2i))

AUC-ROC item-based_топ25: 0.745


На item-based увеличение количества фильмов не влияет, тк по сравнению с пользователями не такая большая выборка

### Матричная факторизация

In [None]:
train_data_matrix.shape

(20000, 1000)

In [None]:
# SVD
train_data_matrix = train_data_matrix.astype(float)
u, s, vh = svds(train_data_matrix, k=20)
s_diag_matrix = np.diag(s)

users = np.dot(u, s_diag_matrix)
items = vh.T

In [None]:
np.diag(s).shape, users.shape, items.shape

((20, 20), (20000, 20), (1000, 20))

In [None]:
test_data['svd_predictions'] = test_data.apply(
    lambda f: (np.dot(users[int(f['userId'])], items[int(f['movieId'])])), 
    axis = 1)

In [None]:
prediction = np.where(test_data['svd_predictions'] >= 0.5, 1, 0)
print("AUC-ROC matrix factorization_20: %.3f"% roc_auc_score(
    test_data['view_fact'], prediction))

AUC-ROC matrix factorization_20: 0.762


In [None]:
train_data_matrix = train_data_matrix.astype(float)
u, s, vh = svds(train_data_matrix, k=40)
s_diag_matrix = np.diag(s)

users = np.dot(u, s_diag_matrix)
items = vh.T


test_data['svd_predictions'] = test_data.apply(
    lambda f: (np.dot(users[int(f['userId'])], items[int(f['movieId'])])),
    axis = 1)

prediction = np.where(test_data['svd_predictions'] >= 0.5, 1, 0)
print("AUC-ROC matrix factorization_40: %.3f"% roc_auc_score(
    test_data['view_fact'], prediction))

AUC-ROC matrix factorization_40: 0.772


При увеличении кол. элементов в матрице - улучшается качество (до определенного момента, при разложении выше 60 - снова падает)

Наиболее эффективной оказалась user_based рекомендация при выделении топ 25 пользователей - AUC-ROC = 0.781  
На втором месте матричное разложение с AUC-ROC = 0.772  
AUC-ROC item-based = 0.746  
AUC-ROC простой модели = 0.5