In [8]:
import os

from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy import Integer, String, Column

# The connection URL (including the password) is read from the environment so
# that credentials are never stored in the notebook or its version history.
# Example: DIPLOMA_DB_URL="mysql://<user>:<password>@localhost:3306/diploma"
# NOTE(review): the password that used to be hard-coded here should be rotated.
engine = create_engine(os.environ["DIPLOMA_DB_URL"], connect_args={})

# Session factory bound to the engine; commits and flushes are explicit.
Session = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Declarative base class that the ORM model below inherits from.
Base = declarative_base()

# Single session shared by the queries in the following cells.
session = Session()

class Rating(Base):
    """ORM mapping for one row of the ``ratings`` table."""

    __tablename__ = "ratings"

    # Auto-incremented integer primary key.
    id = Column(Integer, autoincrement=True, primary_key=True, index=True)
    # Id of the rated movie (required).
    movieId = Column(Integer, nullable=False)
    # Id of the rating's user (required).
    userId = Column(Integer, nullable=False)
    # Rating value (required).
    # NOTE(review): declared as Integer, but the query results shown later in
    # this notebook contain values such as 4.0 - confirm the real column type.
    rating = Column(Integer, nullable=False)


In [9]:
import numpy as np
from scipy.sparse import csr_matrix, identity
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import ndcg_score
import pandas as pd

from logger import logging


class EASE:
    """Item-item recommender trained in closed form (EASE-style).

    ``fit`` builds a user x item matrix, inverts its regularised Gram matrix
    and derives an item-item weight matrix ``self.w`` with a zero diagonal.
    ``predict`` multiplies one user's rating vector by ``self.w`` and returns
    the best-scoring items the user has not rated yet.

    Attributes:
        enc: label encoder mapping raw item ids to column indices of ``w``.
        w:   dense ``(n_items, n_items)`` weight matrix; only available after
             ``fit`` or ``load``.
    """

    # Score written over already-rated items so they never rank in the top-k.
    _SEEN_SCORE = -1000000000

    def __init__(self):
        self.enc = LabelEncoder()

    def predict(self, interactions, k=10, item='movieId', value='rating'):
        """Recommend up to ``k`` unseen items for a single user.

        Args:
            interactions: anything ``pd.DataFrame`` accepts, holding the
                user's ratings with an ``item`` and a ``value`` column.
            k: maximum number of recommendations. It is capped at the number
                of items the user has not rated, so an oversized ``k`` neither
                raises nor returns already-rated items.
            item: name of the item-id column.
            value: name of the rating column.

        Returns:
            ``(items, scores, seen_values)`` where ``items`` are raw item ids
            ordered by descending score, ``scores`` are the matching scores
            and ``seen_values`` are the input ratings of the known items.
            When none of the input items is known to the model an empty
            ``list`` is returned instead of a tuple (kept for backward
            compatibility), so callers must check before indexing.
        """
        interactions = pd.DataFrame(interactions)

        # Items that were absent during fit have no row/column in ``self.w``.
        interactions = interactions[interactions[item].isin(self.enc.classes_)]
        if interactions.empty:
            return list()

        items = self.enc.transform(interactions[item])
        X = np.zeros(len(self.enc.classes_))
        X[items] = interactions[value].values

        pred = (X[None] @ self.w)[0]
        pred[items] = self._SEEN_SCORE

        # argpartition raises when k exceeds the array length, and any k above
        # the number of unseen items would pull masked items into the result.
        n_candidates = pred.size - np.unique(items).size
        k = min(k, n_candidates)
        if k <= 0:
            return [], [], X[items]

        top_ids = np.argpartition(pred, -k)[-k:]

        top_items = self.enc.inverse_transform(top_ids)
        top_scores = pred[top_ids]
        sorted_indices = np.argsort(-top_scores)
        sorted_top_items = top_items[sorted_indices]
        sorted_top_scores = top_scores[sorted_indices]

        return list(sorted_top_items), list(sorted_top_scores), X[items]

    def fit(self, data, user='userId', item='movieId', value='rating', lambda_=500.0):
        """Learn the item-item weight matrix from a ratings table.

        Args:
            data: DataFrame with one row per rating.
            user: name of the user-id column.
            item: name of the item-id column.
            value: name of the rating column.
            lambda_: L2 regularisation added to the diagonal of the Gram
                matrix (defaults to the previously hard-coded 500).

        Raises:
            ValueError: if ``data`` has no rows.
        """
        if len(data) == 0:
            raise ValueError('cannot fit EASE on an empty ratings table')

        logging.info('Training model')

        items = self.enc.fit_transform(data[item])
        # Dense ranks start at 1; shift to 0-based row indices so the matrix
        # does not carry an unused first row.
        users = data[user].rank(method='dense').values.astype(int) - 1
        values = data[value].values

        # Repeated (user, item) pairs are summed by the sparse constructor.
        X = csr_matrix((values, (users, items))).astype(np.float32)

        # Regularised Gram matrix, converted to a plain ndarray (not np.matrix).
        G = X.T @ X
        G = G + lambda_ * identity(G.shape[0])
        G = G.toarray()

        P = np.linalg.inv(G)
        # Column j is divided by -P[j, j]; the diagonal is then forced to zero
        # so an item never recommends itself.
        B = -P / np.diag(P)
        np.fill_diagonal(B, 0.)

        self.w = B

        logging.info('Trained')

    def save(self, path='w.npy'):
        """Write the item ids and the weight matrix to ``path`` as one array.

        The item ids are stored as the first row and are therefore cast to the
        float dtype of the weights; this round-trips only for integer ids.
        """
        cat = np.concatenate([self.enc.classes_[None], self.w])
        np.save(path, cat)
        # NOTE(review): nothing is uploaded between these two messages in this
        # class - confirm whether an upload step is missing.
        logging.info('Start uploading')

        logging.info('Uploaded')

    def load(self, path='w.npy'):
        """Restore the item ids and the weight matrix written by ``save``."""
        t = np.load(path)
        # First row holds the item ids (stored as floats by ``save``).
        self.enc.classes_ = t[0].astype(int)
        self.w = t[1:]
        return


In [10]:
# Create the recommender; its weight matrix is only set by fit() or load().
recommender = EASE()

In [11]:

from sqlalchemy import func
import pandas as pd
import logging
def generate_fit_data(min_rating=3, min_ratings_per_user=30, max_users=50000):
    """Load the training interactions from the ``ratings`` table.

    Only ratings strictly above ``min_rating`` are used. Users qualify when
    they have at least ``min_ratings_per_user`` such ratings, and at most
    ``max_users`` users are taken. The query has no ORDER BY, so which users
    fall under the limit is decided by the database.

    Args:
        min_rating: ratings must be strictly greater than this value.
        min_ratings_per_user: minimum number of qualifying ratings per user.
        max_users: maximum number of users to include.

    Returns:
        DataFrame with the columns ``userId``, ``movieId`` and ``rating``.
    """
    logging.info('Gathering user_ids')
    user_ids_with_high_ratings = (
        session.query(Rating.userId)
        .filter(Rating.rating > min_rating)
        .group_by(Rating.userId)
        .having(func.count(Rating.id) >= min_ratings_per_user)
        .limit(max_users)
    )

    user_ids = [user_id for (user_id,) in user_ids_with_high_ratings]
    logging.info('Gathered user_ids, gathering ratings')

    if user_ids:
        # Select just the three needed columns instead of building a full ORM
        # object for every rating row.
        rows = (
            session.query(Rating.userId, Rating.movieId, Rating.rating)
            .filter(Rating.rating > min_rating)
            .filter(Rating.userId.in_(user_ids))
            .all()
        )
    else:
        # No qualifying users: skip the query rather than send an empty IN ().
        rows = []
    logging.info('Gathered ratings, formatting data')

    frame = pd.DataFrame(
        [tuple(row) for row in rows],
        columns=['userId', 'movieId', 'rating'],
    )
    logging.info('Formatted, returning')

    return frame

In [12]:
def split_dataframe_70_30(df):
    """Cut ``df`` by row position into its leading ~70% and the remainder.

    Rows are not shuffled; the first ``int(len(df) * 0.7)`` rows form the
    first part and all following rows form the second.

    Returns:
        ``(head, tail)`` - the two positional slices of ``df``.
    """
    cut = int(len(df) * 0.7)
    return df.iloc[:cut], df.iloc[cut:]

In [13]:
# Pull the training interactions from the database.
train_data = generate_fit_data()

# Fit the item-item weights on those interactions.
recommender.fit(train_data)

[16/May/2024 16:18:10] INFO - Gathering user_ids
[16/May/2024 16:20:09] INFO - Gathered user_ids, gathering ratings
[16/May/2024 16:21:15] INFO - Gathered ratings, formatting data
[16/May/2024 16:21:20] INFO - Formatted, returning
[16/May/2024 16:21:29] INFO - Training model
[16/May/2024 16:23:56] INFO - Trained


In [14]:
# Persist the item ids and the fitted weights to disk.
recommender.save()

[16/May/2024 16:24:06] INFO - Start uploading
[16/May/2024 16:24:06] INFO - Uploaded


In [15]:
# Read the saved item ids and weights back, replacing the in-memory ones.
recommender.load()

In [16]:
# Distinct users present in the training data; used below to pick users outside it.
trained_user_ids = train_data['userId'].unique()
trained_user_ids

array([    1,     2,     3, ..., 80428, 80430, 80431], dtype=int64)

In [17]:
def not_trained_users(trained_user_ids, limit=50, min_rating=3, min_ratings_per_user=30):
    """Return ids of users that qualify for evaluation but were not trained on.

    A user qualifies with at least ``min_ratings_per_user`` ratings strictly
    above ``min_rating``. The query has no ORDER BY, so which users fall under
    ``limit`` is decided by the database.

    Args:
        trained_user_ids: iterable of user ids to exclude (a numpy array works).
        limit: maximum number of user ids to return.
        min_rating: ratings must be strictly greater than this value.
        min_ratings_per_user: minimum number of qualifying ratings per user.

    Returns:
        List of user ids.
    """
    logging.info('Gathering user_ids')

    # Send plain Python ints: numpy integer scalars are not accepted as bind
    # parameters by every database driver.
    excluded_ids = [int(user_id) for user_id in trained_user_ids]

    query = session.query(Rating.userId).filter(Rating.rating > min_rating)
    if excluded_ids:
        # Only add the NOT IN clause when there is something to exclude.
        query = query.filter(Rating.userId.notin_(excluded_ids))
    user_ids_with_high_ratings = (
        query
        .group_by(Rating.userId)
        .having(func.count(Rating.id) >= min_ratings_per_user)
        .limit(limit)
    )

    return [user_id for (user_id,) in user_ids_with_high_ratings]
    
def get_user_ratings(user_id, min_rating=None):
    """Load one user's ratings as a DataFrame.

    Args:
        user_id: id of the user whose ratings are fetched.
        min_rating: when given, only ratings strictly above this value are
            returned. The default ``None`` returns every rating, including
            low ones.

    Returns:
        DataFrame with the columns ``movieId`` and ``rating``.
    """
    # Select just the two needed columns instead of building full ORM objects.
    query = session.query(Rating.movieId, Rating.rating).filter(Rating.userId == user_id)
    if min_rating is not None:
        query = query.filter(Rating.rating > min_rating)
    rows = query.all()
    logging.info('Gathered ratings, formatting data')

    frame = pd.DataFrame([tuple(row) for row in rows], columns=['movieId', 'rating'])
    logging.info('Formatted, returning')

    return frame

In [18]:
# Pick evaluation users that are not part of the training data.
user_ids = not_trained_users(trained_user_ids)
user_ids

[16/May/2024 16:24:08] INFO - Gathering user_ids


[80432,
 80434,
 80436,
 80437,
 80439,
 80442,
 80444,
 80445,
 80446,
 80448,
 80449,
 80450,
 80451,
 80456,
 80459,
 80460,
 80461,
 80462,
 80465,
 80466,
 80468,
 80469,
 80470,
 80473,
 80475,
 80476,
 80479,
 80481,
 80482,
 80483,
 80484,
 80485,
 80488,
 80492,
 80493,
 80494,
 80495,
 80496,
 80498,
 80499,
 80501,
 80505,
 80510,
 80512,
 80513,
 80514,
 80515,
 80516,
 80517,
 80520]

In [19]:
# Work through the evaluation for the first selected user.
user_id = user_ids[0]
user_ratings = get_user_ratings(user_id)
# Largest movie id this user rated; later cells use it to size the dense score arrays.
max_id = user_ratings['movieId'].max()
# NOTE: this bare expression shows nothing - only a cell's last expression is displayed.
max_id
# Leading ~70% of the rows are the model input; the remaining rows are held out.
predict_score, test_score = split_dataframe_70_30(user_ratings)
predict_score

[16/May/2024 16:26:34] INFO - Gathered ratings, formatting data
[16/May/2024 16:26:34] INFO - Formatted, returning


Unnamed: 0,movieId,rating
0,25,5.0
1,39,4.0
2,58,4.0
3,247,5.0
4,260,4.0
...,...,...
65,1961,4.0
66,2020,4.0
67,2028,5.0
68,2064,4.0


In [20]:
# Held-out rows (the tail of the split), used as ground truth.
test_score

Unnamed: 0,movieId,rating
70,2132,4.0
71,2186,4.0
72,2204,4.0
73,2208,4.0
74,2243,4.0
75,2303,5.0
76,2396,4.0
77,2398,3.0
78,2455,4.0
79,2729,4.0


In [68]:
# Recommend from the user's input rows; the result is (item ids, scores, input ratings).
prediction = recommender.predict(predict_score, k=1500)
prediction[0]

[357,
 260,
 2396,
 2291,
 515,
 539,
 1183,
 1968,
 58,
 1225,
 912,
 1639,
 838,
 2020,
 2193,
 345,
 471,
 2248,
 1394,
 2144,
 25,
 253,
 1275,
 39,
 1408,
 222,
 1569,
 1247,
 151,
 1259,
 440,
 1249,
 232,
 543,
 1357,
 920,
 1188,
 224,
 1721,
 2502,
 3108,
 339,
 1380,
 1270,
 1680,
 1278,
 2243,
 1059,
 377,
 916,
 1967,
 541,
 2406,
 1704,
 1277,
 50,
 1257,
 898,
 6711,
 446,
 534,
 2125,
 1711,
 1101,
 1290,
 2355,
 924,
 1674,
 171,
 1240,
 2324,
 2959,
 28,
 1611,
 527,
 1321,
 1250,
 2340,
 802,
 1542,
 587,
 1198,
 1541,
 1748,
 2134,
 1009,
 1224,
 555,
 1683,
 104,
 3210,
 272,
 36,
 105,
 1222,
 2140,
 356,
 923,
 2721,
 21,
 1673,
 2297,
 2690,
 1784,
 5782,
 1343,
 705,
 899,
 1416,
 1914,
 1235,
 551,
 477,
 1093,
 2542,
 2729,
 337,
 1535,
 2065,
 2105,
 5060,
 1219,
 46,
 289,
 1285,
 930,
 111,
 1177,
 2424,
 2282,
 1617,
 2321,
 3578,
 597,
 1185,
 1023,
 1300,
 4226,
 2067,
 1732,
 364,
 3793,
 1912,
 2644,
 1620,
 3448,
 4014,
 2080,
 1597,
 1645,
 1347,
 60

In [69]:
# The dense arrays are indexed by movie id, so they must also reach the
# largest recommended id.
max_id = max(max_id, max(prediction[0]))

In [70]:
# Scores of the recommended items, highest first.
prediction[1]

[2.297307410175376,
 2.2284900170265685,
 1.8192738291905863,
 1.7195191586962513,
 1.5883777639968395,
 1.5650667191695327,
 1.5235502187717684,
 1.4472032182235226,
 1.4457193969372122,
 1.3995825556105526,
 1.3824892999546277,
 1.317597181126525,
 1.2342305304478554,
 1.2131139727957663,
 1.210336810190768,
 1.1760685543480287,
 1.1700314161788472,
 1.1504882600308888,
 1.138667746994521,
 1.1202575231309804,
 1.110961042303709,
 1.089044688482488,
 1.079206389855414,
 1.0592759247610697,
 1.0504307897612133,
 1.0232126117044853,
 1.020897403345449,
 1.0204335776244946,
 1.0100989233154938,
 1.0067779761843902,
 1.0037100859629937,
 0.9896085214546826,
 0.9735474235919144,
 0.9616382340386411,
 0.9314533112090235,
 0.9237630680704316,
 0.9122110043136736,
 0.9074520302173722,
 0.9069376620146103,
 0.9035483299332583,
 0.8987792966361424,
 0.8888835090235561,
 0.8784187430588819,
 0.8596371529596813,
 0.8481034720622185,
 0.843059231162532,
 0.8403718952436655,
 0.8207555124214971,
 

In [71]:
# The user's input ratings for the items the model knows.
prediction[2]

array([4., 5., 4., 5., 5., 4., 5., 5., 4., 4., 5., 3., 5., 2., 5., 5., 3.,
       5., 5., 3., 2., 4., 5., 4., 4., 5., 4., 4., 4., 4., 4., 4., 5., 4.,
       2., 4., 3., 4., 4., 5., 5.])

In [72]:
# Dense score vector indexed by (movie id - 1); movies that were not
# recommended keep a score of 0.
predict_zero_array = np.zeros(max_id)
# One vectorised assignment instead of a Python loop over every recommendation.
predict_zero_array[np.asarray(prediction[0], dtype=int) - 1] = prediction[1]
predict_zero_array

array([0.32071141, 0.2812265 , 0.3852114 , ..., 0.        , 0.        ,
       0.10032579])

In [73]:
# Dense ground-truth vector indexed by (movie id - 1); movies without a
# held-out rating stay at 0.
test_zero_array = np.zeros(max_id)
for movie_id, rating in zip(test_score["movieId"], test_score["rating"]):
    test_zero_array[movie_id - 1] = rating
    print(f'movie: {movie_id},rating: {rating}')

movie: 2243,rating: 4.0
movie: 2248,rating: 5.0
movie: 2291,rating: 5.0
movie: 2369,rating: 5.0
movie: 2396,rating: 4.0
movie: 2424,rating: 4.0
movie: 2502,rating: 5.0
movie: 2671,rating: 4.0
movie: 2690,rating: 5.0
movie: 2702,rating: 1.0
movie: 2706,rating: 2.0
movie: 2718,rating: 5.0
movie: 2797,rating: 4.0
movie: 3108,rating: 4.0
movie: 3155,rating: 4.0
movie: 3259,rating: 4.0
movie: 3261,rating: 4.0
movie: 3426,rating: 4.0


In [74]:
# Ground-truth vector: held-out ratings at (movie id - 1), zeros elsewhere.
test_zero_array

array([0., 0., 0., ..., 0., 0., 0.])

In [75]:
# NDCG of the predicted scores against the held-out ratings over every movie id
# up to max_id; unrated and unrecommended movies count as 0.
accuracy = ndcg_score([test_zero_array], [predict_zero_array])
accuracy

0.44840683843809365

In [76]:
# Keep only movies that are both held out AND recommended; every other position
# is zeroed in both arrays (in place - the cells below read the filtered arrays).
# NOTE: after this step NDCG only measures the ordering among the hits and
# ignores missed movies, which is why it is far higher than the unfiltered value.
overlap = (test_zero_array != 0) & (predict_zero_array != 0)
test_zero_array[~overlap] = 0
predict_zero_array[~overlap] = 0

In [77]:
# Ground-truth vector after the overlap filter.
test_zero_array

array([0., 0., 0., ..., 0., 0., 0.])

In [78]:
# Predicted-score vector after the overlap filter.
predict_zero_array

array([0., 0., 0., ..., 0., 0., 0.])

In [79]:
# NDCG recomputed on the filtered arrays (overlapping movies only).
accuracy = ndcg_score([test_zero_array], [predict_zero_array])
accuracy

0.9691223269921904

In [80]:
# Non-zero ground-truth ratings, in movie-id order.
without_zeros_test = [
    test_zero_array[position]
    for position in range(max_id)
    if test_zero_array[position] != 0
]
without_zeros_test

[4.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 4.0, 5.0, 4.0, 4.0, 4.0, 4.0]

In [81]:
# Non-zero predicted scores, in movie-id order.
without_zeros_predict = [
    predict_zero_array[position]
    for position in range(max_id)
    if predict_zero_array[position] != 0
]
without_zeros_predict

[0.8403718952436655,
 1.1504882600308888,
 1.7195191586962513,
 0.3640336433753488,
 1.8192738291905863,
 0.45121418892387627,
 0.9035483299332583,
 0.3438950642171954,
 0.5068625175023045,
 0.33128262677441195,
 0.8987792966361424,
 0.20018674210800305,
 0.27349746923592066]

In [82]:
# NDCG over the compacted lists (zero entries removed).
accuracy_without_zeros = ndcg_score([without_zeros_test], [without_zeros_predict])
accuracy_without_zeros

0.9691223269921905

In [83]:
# Score the ground truth against itself.
ndcg_score([without_zeros_test], [without_zeros_test])

1.0

In [92]:
# NDCG per evaluation user, scored over every movie id: movies the model did
# not recommend have a predicted score of 0.
accuracies = []
for user_id in user_ids:
    user_ratings = get_user_ratings(user_id)
    predict_score, test_score = split_dataframe_70_30(user_ratings)

    prediction = recommender.predict(predict_score, k=500)
    if not prediction or not prediction[0]:
        # The model knows none of this user's input movies (or has nothing
        # left to recommend), so there is nothing to score.
        continue

    # Arrays are indexed by (movie id - 1), so they must reach the largest id
    # among both the user's ratings and the recommendations.
    max_id = max(user_ratings['movieId'].max(), max(prediction[0]))

    # Take the movie ids from the prediction and put each movie's score at the
    # index that corresponds to its id.
    predict_zero_array = np.zeros(max_id)
    predict_zero_array[np.asarray(prediction[0], dtype=int) - 1] = prediction[1]

    # Same layout for the held-out ground-truth ratings.
    test_zero_array = np.zeros(max_id)
    test_zero_array[test_score["movieId"].to_numpy(dtype=int) - 1] = test_score["rating"].to_numpy()

    accuracy = ndcg_score([test_zero_array], [predict_zero_array])
    accuracies.append(accuracy)
    print(user_id,'-', accuracy)

[16/May/2024 16:41:23] INFO - Gathered ratings, formatting data
[16/May/2024 16:41:23] INFO - Formatted, returning
80432 - 0.3187901636077012
[16/May/2024 16:41:24] INFO - Gathered ratings, formatting data
[16/May/2024 16:41:24] INFO - Formatted, returning
80434 - 0.35525756060555475
[16/May/2024 16:41:24] INFO - Gathered ratings, formatting data
[16/May/2024 16:41:24] INFO - Formatted, returning
80436 - 0.32714556400512085
[16/May/2024 16:41:24] INFO - Gathered ratings, formatting data
[16/May/2024 16:41:24] INFO - Formatted, returning
80437 - 0.3930476971870292
[16/May/2024 16:41:24] INFO - Gathered ratings, formatting data
[16/May/2024 16:41:24] INFO - Formatted, returning
80439 - 0.42729542004493964
[16/May/2024 16:41:24] INFO - Gathered ratings, formatting data
[16/May/2024 16:41:24] INFO - Formatted, returning
80442 - 0.43092231934479136
[16/May/2024 16:41:24] INFO - Gathered ratings, formatting data
[16/May/2024 16:41:24] INFO - Formatted, returning
80444 - 0.3314531208581765
[1

In [93]:
# Mean NDCG over the evaluated users.
print(np.average(np.array(accuracies)))

0.39274183864428075


In [94]:
# NDCG per evaluation user, restricted to movies that are both held out and
# recommended. This measures only the ordering among the hits and ignores
# missed movies.
accuracies_with_concurrencies = []
for user_id in user_ids:
    user_ratings = get_user_ratings(user_id)
    predict_score, test_score = split_dataframe_70_30(user_ratings)

    prediction = recommender.predict(predict_score, k=500)
    if not prediction or not prediction[0]:
        # The model knows none of this user's input movies (or has nothing
        # left to recommend), so there is nothing to score.
        continue

    # Arrays are indexed by (movie id - 1), so they must reach the largest id
    # among both the user's ratings and the recommendations.
    max_id = max(user_ratings['movieId'].max(), max(prediction[0]))

    # Take the movie ids from the prediction and put each movie's score at the
    # index that corresponds to its id.
    predict_zero_array = np.zeros(max_id)
    predict_zero_array[np.asarray(prediction[0], dtype=int) - 1] = prediction[1]

    # Same layout for the held-out ground-truth ratings.
    test_zero_array = np.zeros(max_id)
    test_zero_array[test_score["movieId"].to_numpy(dtype=int) - 1] = test_score["rating"].to_numpy()

    # Zero every position that is not non-zero in BOTH arrays.
    overlap = (test_zero_array != 0) & (predict_zero_array != 0)
    test_zero_array[~overlap] = 0
    predict_zero_array[~overlap] = 0

    accuracy = ndcg_score([test_zero_array], [predict_zero_array])
    accuracies_with_concurrencies.append(accuracy)
    print(user_id,'-', accuracy)

[16/May/2024 16:41:38] INFO - Gathered ratings, formatting data
[16/May/2024 16:41:38] INFO - Formatted, returning
80432 - 0.9770487554672748
[16/May/2024 16:41:38] INFO - Gathered ratings, formatting data
[16/May/2024 16:41:38] INFO - Formatted, returning
80434 - 0.9490999809806189
[16/May/2024 16:41:38] INFO - Gathered ratings, formatting data
[16/May/2024 16:41:38] INFO - Formatted, returning
80436 - 0.929155906857973
[16/May/2024 16:41:39] INFO - Gathered ratings, formatting data
[16/May/2024 16:41:39] INFO - Formatted, returning
80437 - 0.958130583695234
[16/May/2024 16:41:39] INFO - Gathered ratings, formatting data
[16/May/2024 16:41:39] INFO - Formatted, returning
80439 - 0.9234460791145827
[16/May/2024 16:41:39] INFO - Gathered ratings, formatting data
[16/May/2024 16:41:39] INFO - Formatted, returning
80442 - 0.9202570874329382
[16/May/2024 16:41:39] INFO - Gathered ratings, formatting data
[16/May/2024 16:41:39] INFO - Formatted, returning
80444 - 0.89245802130973
[16/May/20

In [95]:
# Mean NDCG over the evaluated users, overlap-only variant.
print(np.average(np.array(accuracies_with_concurrencies)))

0.9451849123595405
