In [41]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

In [2]:
md = pd.read_csv('movies_metadata.csv')
links_small = pd.read_csv('links_small.csv')

ratings_small = pd.read_csv('ratings_small.csv')
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

In [3]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [4]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: x.year if x != np.nan else np.nan)

In [5]:
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

# Рекомендашка на основе описания фильма и его тегов

In [6]:
#Check EDA Notebook for how and why I got these indices.
md = md.drop([19730, 29503, 35587])
md['id'] = md['id'].astype('int')

In [7]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9099, 25)

In [8]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [9]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0.0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [10]:
tfidf_matrix.shape

(9099, 268124)

In [11]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [14]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [15]:
get_recommendations('Taxi').head(10)

3776                  Bandits
3623                 Loverboy
7838       30 Minutes or Less
763        To Be or Not to Be
849               Normal Life
112            The Star Maker
4274                   Xanadu
4276               Banana Joe
6023    Employee of the Month
1402          The Newton Boys
Name: title, dtype: object

In [16]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')

md['id'] = md['id'].astype('int')
md.shape

(45463, 25)

In [17]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')
smd = md[md['id'].isin(links_small)]
smd.shape

(9219, 28)

In [18]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [19]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [20]:
smd['director'] = smd['crew'].apply(get_director)

In [21]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [22]:
a = smd['director'].iloc[1]

In [23]:
smd['cast'] = smd['cast'].apply(
    lambda x: [i.lower().replace(" ", "") for i in x] if isinstance(x, list) else x
)

In [24]:
smd['director'] = smd['director'].astype(str)
smd.dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                         int32
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
year                     float64
cast                      object
crew                      object
keywords                  object
cast_size                  int64
crew_size                  int64
director  

In [25]:
# smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: x.replace(" ", "") if isinstance(x, str) else x)
smd['director'] = smd['director'].apply(lambda x: [x, x, x])

In [26]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [27]:
s = s[s > 1]

In [28]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [29]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [30]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [31]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [32]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [33]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [34]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [35]:
get_recommendations('The Dark Knight').head(10)

8031         The Dark Knight Rises
6218                 Batman Begins
6623                  The Prestige
2085                     Following
7648                     Inception
4145                      Insomnia
3381                       Memento
8613                  Interstellar
7659    Batman: Under the Red Hood
1134                Batman Returns
Name: title, dtype: object

In [36]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C
5.244896612406511
m = vote_counts.quantile(0.95)
m

425.0

In [37]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [80]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]

    movies = smd.iloc[movie_indices][['id','title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified['id'].tolist()

In [81]:
improved_recommendations('Avatar')

[597, 127585, 54138, 280, 218, 679, 2756, 49521, 36955, 76757]

# Коллаборативная фильтрация

In [43]:
reader = Reader()
ratings = pd.read_csv('ratings_small.csv')
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

best_params_rmse = {'lr_all': 0.012953500641312315, 'n_epochs': 44, 'n_factors': 91, 'reg_all': 0.09408867005221243}
best_params_mae = {'lr_all': 0.012953500641312315, 'n_epochs': 44, 'n_factors': 91, 'reg_all': 0.09408867005221243}

# Инициализация SVD с лучшими параметрами (RMSE)
svd = SVD(
    lr_all=best_params_rmse['lr_all'],
    n_epochs=best_params_rmse['n_epochs'],
    n_factors=best_params_rmse['n_factors'],
    reg_all=best_params_rmse['reg_all']
)

cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

trainset = data.build_full_trainset()

prediction = svd.predict(1, 302, 3)
print(f"SVD Prediction for user 1 and movie 302: {prediction.est:.2f}")

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8800  0.8764  0.8730  0.8746  0.8713  0.8750  0.0030  
MAE (testset)     0.6771  0.6720  0.6695  0.6720  0.6717  0.6724  0.0025  
Fit time          3.03    2.80    2.09    2.14    2.35    2.48    0.37    
Test time         0.28    0.18    0.38    0.17    0.14    0.23    0.09    
SVD Prediction for user 1 and movie 302: 2.77


# Гибридный рекомендатель

Создаем new_df с id фильмов и их названиями

In [93]:
import pandas as pd

# Замените 'film.csv' на реальный путь к вашему файлу
file_path = 'film.csv'  

try:
    # Читаем CSV файл в DataFrame
    df = pd.read_csv(file_path)

    # Проверяем, существуют ли столбцы 'id' и 'title'
    if 'id' not in df.columns or 'title' not in df.columns:
        raise ValueError("Столбцы 'id' и/или 'title' отсутствуют в файле.")

    # Создаем новый DataFrame с выбранными столбцами
    new_df = df[['id', 'title']]

    # Выводим новый DataFrame
    print(new_df)

except FileNotFoundError:
    print(f"Файл {file_path} не найден.")
except pd.errors.EmptyDataError:
    print(f"Файл {file_path} пуст.")
except pd.errors.ParserError:
    print(f"Ошибка при разборе файла {file_path}. Проверьте формат файла.")
except ValueError as e:
    print(e)
except Exception as e:
    print(f"Произошла непредвиденная ошибка: {e}")

           id                        title
0         862                    Toy Story
1        8844                      Jumanji
2       15602             Grumpier Old Men
3       31357            Waiting to Exhale
4       11862  Father of the Bride Part II
...       ...                          ...
45461  439050                       Subdue
45462  111109          Century of Birthing
45463   67758                     Betrayal
45464  227506             Satan Triumphant
45465  461257                     Queerama

[45466 rows x 2 columns]


Ищем для конкретного пользователя фильмы с рейтингом >= 4

In [94]:
def get_high_rated_movies(user_id, ratings_file='ratings_small.csv', rating_threshold=4):
    try:
        ratings = pd.read_csv(ratings_file)
        reader = Reader()
        data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

        user_ratings = ratings[ratings['userId'] == user_id]

        if user_ratings.empty:
            return None # Пользователь не найден

        high_rated_movies = user_ratings[user_ratings['rating'] >= rating_threshold]['movieId'].tolist()
        return high_rated_movies

    except FileNotFoundError:
        print(f"Файл {ratings_file} не найден.")
        return None
    except Exception as e:
        print(f"Произошла ошибка: {e}")
        return None


user_id_to_check = 3
high_rated = get_high_rated_movies(user_id_to_check)

if high_rated:
    print(f"Фильмы с рейтингом 4 и выше для пользователя {user_id_to_check}: {high_rated}")
else:
    print(f"Пользователь {user_id_to_check} не найден или произошла ошибка.")

Фильмы с рейтингом 4 и выше для пользователя 3: [110, 296, 318, 356, 778, 1197, 1235, 1378, 1721, 1884, 2028, 2318, 2841, 2858, 2959, 3510, 3949, 48783, 50068, 84236]


Сопоставляем id фильмов и их названия, чтобы воспользоваться improved_recommendations 

In [95]:
high_rated_df = pd.DataFrame({'id': [str(x) for x in high_rated]})
merged_df = pd.merge(high_rated_df, new_df, on='id', how='left')
merged_df = merged_df.dropna(subset=['title']) # Удаляем строки с NaN в столбце 'title'
titles = merged_df['title'].tolist()
print(titles)

['Three Colors: Red', 'Terminator 3: Rise of the Machines', 'The Million Dollar Hotel', "Monsieur  Hulot's Holiday", 'Shortbus', 'All the Way Boys', 'The Ewok Adventure', 'Say Anything...', 'A Very Long Engagement', 'License to Wed']


Для каждого фильма применяем и находим похожие фильмы (их id)

In [96]:
all_recommendations = []
for title in titles:
    try:
        recommendations = improved_recommendations(title)
        all_recommendations.extend(recommendations)
    except KeyError:
        print(f"Фильм '{title}' не найден в индексе. Проверьте данные.")

print(all_recommendations)

Фильм 'All the Way Boys' не найден в индексе. Проверьте данные.
Фильм 'The Ewok Adventure' не найден в индексе. Проверьте данные.
[228970, 18320, 108, 109, 1600, 2786, 15383, 62204, 11673, 8072, 603, 280, 218, 6479, 49530, 604, 605, 20504, 534, 87101, 197, 1579, 655, 144, 910, 1266, 2086, 10502, 17707, 507, 771, 850, 772, 10437, 10719, 10227, 11395, 13680, 13767, 9745, 258480, 639, 1544, 13403, 1909, 15708, 11240, 24137, 38291, 4599, 786, 1903, 9390, 74465, 9621, 27583, 11068, 16643, 1901, 222936, 194, 228970, 18320, 269, 892, 902, 2786, 10222, 27936, 8078, 6557, 10184, 712, 8874, 11003, 9779, 10030, 9530, 9574, 10521]


Для каждого найденого фильма применяем svd.predict(user_id_to_check, movie_id) и если рейтинг получился >= 3, то записываем фильм в рекомендованые и затем выводим все id таких фильмов по убыванию их рейтингов

In [97]:
high_rated_movies = {} # Словарь для хранения ID и предсказанных рейтингов

for movie_id in all_recommendations:
    try:
        prediction = svd.predict(user_id_to_check, movie_id)
        
        if prediction.est >= 3:
            high_rated_movies[movie_id] = prediction.est # Сохраняем ID и предсказанный рейтинг
    except Exception as e:
        print(f"Ошибка при предсказании для фильма {movie_id}: {e}")


# Создаем DataFrame из словаря
high_rated_df = pd.DataFrame.from_dict(high_rated_movies, orient='index', columns=['predicted_rating'])
high_rated_df['movieId'] = high_rated_df.index

# Сортировка по предсказанному рейтингу
high_rated_df = high_rated_df.sort_values('predicted_rating', ascending=False)

print(high_rated_df)

# Вывод только movieId
print("\nMovie IDs sorted by predicted rating:")
print(high_rated_df['movieId'].tolist())


       predicted_rating  movieId
850            4.113507      850
194            4.005161      194
49530          3.927674    49530
8874           3.867486     8874
280            3.779778      280
...                 ...      ...
639            3.311217      639
902            3.309376      902
786            3.206288      786
507            3.182167      507
1544           3.058656     1544

[76 rows x 2 columns]

Movie IDs sorted by predicted rating:
[850, 194, 49530, 8874, 280, 534, 1903, 910, 108, 269, 605, 603, 1266, 16643, 11068, 27583, 228970, 9621, 1901, 9390, 4599, 38291, 24137, 11240, 74465, 10222, 222936, 13403, 27936, 8078, 6557, 10184, 712, 11003, 9779, 10030, 9530, 9574, 15708, 9745, 10521, 8072, 604, 20504, 87101, 197, 11673, 1579, 655, 62204, 2086, 10502, 17707, 15383, 771, 772, 10437, 10719, 10227, 11395, 13680, 13767, 18320, 109, 258480, 6479, 2786, 892, 144, 218, 1909, 639, 902, 786, 507, 1544]


Удаляем все просмотренные фильмы пользователем из текущего списка

In [105]:
high_rated_df = high_rated_df[~high_rated_df['movieId'].isin(high_rated)]
print(high_rated_df['movieId'].tolist())

[850, 194, 49530, 8874, 280, 534, 1903, 910, 108, 269, 605, 603, 1266, 16643, 11068, 27583, 228970, 9621, 1901, 9390, 4599, 38291, 24137, 11240, 74465, 10222, 222936, 13403, 27936, 8078, 6557, 10184, 712, 11003, 9779, 10030, 9530, 9574, 15708, 9745, 10521, 8072, 604, 20504, 87101, 197, 11673, 1579, 655, 62204, 2086, 10502, 17707, 15383, 771, 772, 10437, 10719, 10227, 11395, 13680, 13767, 18320, 109, 258480, 6479, 2786, 892, 144, 218, 1909, 639, 902, 786, 507, 1544]
