In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

In [None]:
!wget https://files.grouplens.org/datasets/movielens/ml-latest.zip -O MovieLens.zip

--2024-04-12 17:23:23--  https://files.grouplens.org/datasets/movielens/ml-latest.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 350896731 (335M) [application/zip]
Saving to: ‘MovieLens.zip’


2024-04-12 17:23:30 (47.1 MB/s) - ‘MovieLens.zip’ saved [350896731/350896731]



In [None]:
!unzip MovieLens.zip

Archive:  MovieLens.zip
   creating: ml-latest/
  inflating: ml-latest/tags.csv      
  inflating: ml-latest/links.csv     
  inflating: ml-latest/README.txt    
  inflating: ml-latest/ratings.csv   
  inflating: ml-latest/genome-tags.csv  
  inflating: ml-latest/genome-scores.csv  
  inflating: ml-latest/movies.csv    


In [None]:
# The archive is unpacked into the current working directory by the
# `!unzip` cell (it creates `ml-latest/`), so read the CSVs relative to it
# instead of through the Colab-specific absolute path `/content/...`.
DATA_DIR = 'ml-latest'

movies = pd.read_csv(f'{DATA_DIR}/movies.csv')    # movieId, title, genres
ratings = pd.read_csv(f'{DATA_DIR}/ratings.csv')  # userId, movieId, rating, timestamp
tags = pd.read_csv(f'{DATA_DIR}/tags.csv')        # userId, movieId, tag, timestamp

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [None]:
def change_string(s):
    """Turn a pipe-separated genre string into space-separated tokens.

    Spaces and hyphens are removed first, so each genre collapses into a
    single word (``'Sci-Fi'`` becomes ``'SciFi'``); the ``|`` separators are
    then replaced by single spaces.
    """
    squeezed = s.replace(' ', '').replace('-', '')
    return squeezed.replace('|', ' ')

In [None]:
# One normalized, space-separated genre string per row of `movies`.
movie_genres = list(map(change_string, movies['genres'].tolist()))

In [None]:
# Fit a TF-IDF vocabulary on the per-movie genre strings. Row i of the
# resulting sparse matrix is the encoding of movie_genres[i].
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(movie_genres)
X_train_tfidf

<86537x20 sparse matrix of type '<class 'numpy.float64'>'
	with 152288 stored elements in Compressed Sparse Row format>

In [None]:
# Dense genre TF-IDF table: one row per movie, one column per vocabulary term.
generes = pd.DataFrame(
    data=X_train_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
)

In [None]:
# Attach the movie title to each genre row. The Series is aligned on the row
# index, so this relies on `generes` and `movies` sharing the same index
# (assumes both still carry their default 0..n-1 index — TODO confirm).
generes['title'] = movies['title']

In [None]:
# Inner join: one row per (movie, user tag) pair; movies without tags drop out.
movies_with_tags = pd.merge(movies, tags, on='movieId')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,499,animation,1422605756
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,499,friendship,1422605756
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,499,toys,1422605756
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1114,animation,1147410824
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1114,Disney,1147449942


In [None]:
movies_with_tags.tag.unique().shape

(153950,)

In [None]:
# Discard every row that has a missing value in any column.
movies_with_tags = movies_with_tags.dropna()

In [None]:
movies_with_tags.title.unique().shape

(53391,)

In [None]:
# Deliberately NOT named `change_string`: the genre helper of that name is
# defined in an earlier cell, and redefining it here would make a later
# re-run of the genre cell silently use the tag normalizer instead.
def normalize_tag(tag):
    """Collapse one user tag into a single lowercase token.

    Spaces, hyphens and ``=`` are stripped, so a multi-word tag such as
    ``'police car'`` becomes ``'policecar'``. The value is passed through
    ``str`` first (presumably because some tags are not parsed as strings).
    """
    return str(tag).replace(' ', '').replace('-', '').replace('=', '').lower()


# Build one space-separated "document" of normalized tags per movie title.
# The two lists are parallel: tag_strings[i] holds the tags of list_movies[i].
tag_strings = []
list_movies = []

for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join(normalize_tag(t) for t in group.tag.values))
    list_movies.append(movie)

  0%|          | 0/53391 [00:00<?, ?it/s]

In [None]:
# To speed things up, only 500 movies are used (positions 10..509 of the title list).


list_of_movie = list_movies[10:510]

In [None]:
# Apply the same [10:510] slice as for `list_of_movie` so both lists stay
# parallel. Not idempotent: running this cell again slices the already
# shortened list a second time.
tag_strings = tag_strings[10:510]

In [None]:
# Separate TF-IDF vocabulary for the tag documents; row i of the sparse
# matrix is the encoding of tag_strings[i].
tfidf_tag = TfidfVectorizer()
X_train_tfidf_tag = tfidf_tag.fit_transform(tag_strings)
X_train_tfidf_tag

<500x6864 sparse matrix of type '<class 'numpy.float64'>'
	with 11685 stored elements in Compressed Sparse Row format>

In [None]:
# Dense tag TF-IDF table: one row per selected movie, one column per tag token.
all_tags = pd.DataFrame(
    data=X_train_tfidf_tag.toarray(),
    columns=tfidf_tag.get_feature_names_out(),
)

In [None]:
# Titles are attached by position: row i was built from tag_strings[i], which
# was sliced with the same [10:510] range as list_of_movie.
all_tags['title'] = list_of_movie

In [None]:
all_tags

Unnamed: 0,01,02,03,05,06,07,08,09,10,100essentialfemaleperformances,...,zombieoutbreak,zombies,zombiesurvival,zombieviolence,zombification,zoo,zooeydeschanel,zoomzoom,zootsuit,title
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,#Horror (2015)
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,#SCREAMERS (2016)
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,#Stuck (2014)
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,#UNFIT: The Psychology of Donald Trump (2019)
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,#realityhigh (2017)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4 Horror Tales - February 29 (2006)
496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4 Horror Tales - Hidden Floor (2006)
497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4 Horror Tales - Roommates (2006)
498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"4 Horsemen of the Apocalypse, The (1962)"


In [None]:
all_tags['title']

0                                     #Horror (2015)
1                                  #SCREAMERS (2016)
2                                      #Stuck (2014)
3      #UNFIT: The Psychology of Donald Trump (2019)
4                                #realityhigh (2017)
                           ...                      
495              4 Horror Tales - February 29 (2006)
496             4 Horror Tales - Hidden Floor (2006)
497                4 Horror Tales - Roommates (2006)
498         4 Horsemen of the Apocalypse, The (1962)
499                    4 Horsemen: Apocalypse (2022)
Name: title, Length: 500, dtype: object

In [None]:
# Prune uninformative tags.
# For every column, count the cells whose value differs from zero.
non_zero_count = all_tags.where(all_tags != 0).count()

# Drop the columns that have fewer than six non-zero values.
all_tags = all_tags.drop(columns=non_zero_count[non_zero_count.lt(6)].index)

In [None]:
all_tags

Unnamed: 0,01,03,06,10,11,1970s,1980s,2000s,acting,action,...,undercover,usa,violence,voiceovernarration,war,womandirector,writer,xxcriterioncollection,youtube,title
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,#Horror (2015)
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,#SCREAMERS (2016)
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,#Stuck (2014)
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,#UNFIT: The Psychology of Donald Trump (2019)
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,#realityhigh (2017)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,4 Horror Tales - February 29 (2006)
496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,4 Horror Tales - Hidden Floor (2006)
497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,4 Horror Tales - Roommates (2006)
498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.319759,0.0,0.0,0.0,0.0,"4 Horsemen of the Apocalypse, The (1962)"


In [None]:
# Keep only the genre rows of the selected movie titles.
generes = generes.loc[generes['title'].isin(list_of_movie)]

In [None]:
generes

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,...,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western,title
641,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.674173,0.0,0.000000,0.000000,0.496090,0.0,0.0,"301, 302 (301/302) (1995)"
763,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.526215,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.850352,0.000000,0.000000,0.0,0.0,'Til There Was You (1997)
903,0.0,0.657053,0.0,0.000000,0.000000,0.000000,0.0,0.337535,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.674056,0.000000,0.0,0.0,2001: A Space Odyssey (1968)
944,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.358147,0.0,0.000000,...,0.0,0.0,0.752009,0.0,0.000000,0.000000,0.553366,0.0,0.0,"39 Steps, The (1935)"
976,0.0,0.000000,0.0,0.000000,0.000000,0.477085,0.0,0.000000,0.0,0.878857,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,2 Days in the Valley (1996)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82346,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,1.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,24/7 - The Passion of Life (2006)
84533,0.0,0.000000,0.0,0.000000,0.515084,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.857140,0.000000,0.0,0.0,2069: A Sex Odyssey (1974)
84839,0.0,0.000000,0.0,0.795532,0.465484,0.000000,0.0,0.387883,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,13: The Musical (2022)
85108,0.0,0.000000,0.0,0.000000,0.000000,0.615103,0.0,0.339016,0.0,0.000000,...,0.0,0.0,0.711840,0.0,0.000000,0.000000,0.000000,0.0,0.0,18 Kilohertz (2020)


In [None]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119
...,...,...,...,...
33832157,330975,8340,2.0,1091583256
33832158,330975,8493,2.5,1091585709
33832159,330975,8622,4.0,1091581777
33832160,330975,8665,3.0,1091581765


In [None]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119
...,...,...,...,...
33832157,330975,8340,2.0,1091583256
33832158,330975,8493,2.5,1091585709
33832159,330975,8622,4.0,1091581777
33832160,330975,8665,3.0,1091581765


In [None]:
# Mean rating per movie, as a two-column frame (movieId, rating).
average_ratings = ratings.groupby('movieId', as_index=False)['rating'].mean()

In [None]:
# Rename the mean column so it cannot clash with the per-user `rating` column.
average_ratings = average_ratings.rename(columns={'rating': 'average_ratings'})

In [None]:
# Individual ratings without the timestamp column.
full_ratings = ratings.loc[:, ['userId', 'rating', 'movieId']]

In [None]:
full_ratings

Unnamed: 0,userId,rating,movieId
0,1,4.0,1
1,1,4.0,110
2,1,4.0,158
3,1,4.5,260
4,1,5.0,356
...,...,...,...
33832157,330975,2.0,8340
33832158,330975,2.5,8493
33832159,330975,4.0,8622
33832160,330975,3.0,8665


In [None]:
# Add each movie's mean rating to every individual rating row.
full_ratings = full_ratings.merge(average_ratings, on='movieId', how='left')

In [None]:
full_ratings

Unnamed: 0,userId,rating,movieId,average_ratings
0,1,4.0,1,3.893508
1,1,4.0,110,3.996166
2,1,4.0,158,2.888675
3,1,4.5,260,4.092400
4,1,5.0,356,4.068189
...,...,...,...,...
33832157,330975,2.0,8340,3.778485
33832158,330975,2.5,8493,3.565872
33832159,330975,4.0,8622,3.474033
33832160,330975,3.0,8665,3.850105


In [None]:
# Restrict `movies` to the selected titles (this overwrites the full table).
movies = movies.loc[movies['title'].isin(list_of_movie)]

In [None]:
movies

Unnamed: 0,movieId,title,genres
641,652,"301, 302 (301/302) (1995)",Horror|Mystery|Thriller
763,779,'Til There Was You (1997),Drama|Romance
903,924,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi
944,965,"39 Steps, The (1935)",Drama|Mystery|Thriller
976,999,2 Days in the Valley (1996),Crime|Film-Noir
...,...,...,...
82346,276549,24/7 - The Passion of Life (2006),Drama
84533,282267,2069: A Sex Odyssey (1974),Comedy|Sci-Fi
84839,283033,13: The Musical (2022),Children|Comedy|Drama
85108,283705,18 Kilohertz (2020),Crime|Drama|Mystery


In [None]:
# Keep only the ratings that belong to the selected movies.
full_ratings = full_ratings.loc[full_ratings['movieId'].isin(movies['movieId'])]

In [None]:
full_ratings

Unnamed: 0,userId,rating,movieId,average_ratings
23,1,3.5,2085,3.428957
460,7,4.0,3534,3.089913
751,10,3.5,51662,3.554326
811,10,3.0,103883,3.330183
898,12,3.0,1367,3.046248
...,...,...,...,...
33831772,330974,0.5,56949,3.147814
33831784,330974,1.5,72378,2.756709
33831920,330975,3.0,924,3.996565
33832029,330975,1.0,2085,3.428957


In [None]:
# Keep only the tag rows of the selected movie titles.
movies_with_tags = movies_with_tags.loc[movies_with_tags['title'].isin(list_of_movie)]

In [None]:
movies_with_tags

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
131784,652,"301, 302 (301/302) (1995)",Horror|Mystery|Thriller,215490,anorexic,1527403905
131785,652,"301, 302 (301/302) (1995)",Horror|Mystery|Thriller,215490,chef,1527403905
131786,652,"301, 302 (301/302) (1995)",Horror|Mystery|Thriller,215490,neighbor,1527403905
142505,779,'Til There Was You (1997),Drama|Romance,215490,addiction,1528351593
142506,779,'Til There Was You (1997),Drama|Romance,215490,apartment,1528351593
...,...,...,...,...,...,...
2326452,284575,091: Police speaking (1960),Crime|Drama,126357,police,1677897653
2326453,284575,091: Police speaking (1960),Crime|Drama,126357,police car,1677897648
2326454,284575,091: Police speaking (1960),Crime|Drama,126357,police officer,1677897644
2326455,284575,091: Police speaking (1960),Crime|Drama,126357,Spain,1677897686


In [None]:
# Temporary id/title lookup used as the left side of the genre join.
cols = ['movieId', 'title']
test = movies.loc[:, cols]

In [None]:
test

Unnamed: 0,movieId,title
641,652,"301, 302 (301/302) (1995)"
763,779,'Til There Was You (1997)
903,924,2001: A Space Odyssey (1968)
944,965,"39 Steps, The (1935)"
976,999,2 Days in the Valley (1996)
...,...,...
82346,276549,24/7 - The Passion of Life (2006)
84533,282267,2069: A Sex Odyssey (1974)
84839,283033,13: The Musical (2022)
85108,283705,18 Kilohertz (2020)


In [None]:
# Attach the genre TF-IDF columns to each (movieId, title) row.
# NOTE(review): the join key is `title`; presumably titles are unique within
# the selected subset — confirm, otherwise rows get duplicated.
result = test.merge(generes, on='title', how='left')
del test

In [None]:
result

Unnamed: 0,movieId,title,action,adventure,animation,children,comedy,crime,documentary,drama,...,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,652,"301, 302 (301/302) (1995)",0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.547161,0.0,0.0,0.674173,0.0,0.000000,0.000000,0.496090,0.0,0.0
1,779,'Til There Was You (1997),0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.526215,...,0.000000,0.0,0.0,0.000000,0.0,0.850352,0.000000,0.000000,0.0,0.0
2,924,2001: A Space Odyssey (1968),0.0,0.657053,0.0,0.000000,0.000000,0.000000,0.0,0.337535,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.674056,0.000000,0.0,0.0
3,965,"39 Steps, The (1935)",0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.358147,...,0.000000,0.0,0.0,0.752009,0.0,0.000000,0.000000,0.553366,0.0,0.0
4,999,2 Days in the Valley (1996),0.0,0.000000,0.0,0.000000,0.000000,0.477085,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,276549,24/7 - The Passion of Life (2006),0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,1.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0
496,282267,2069: A Sex Odyssey (1974),0.0,0.000000,0.0,0.000000,0.515084,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.857140,0.000000,0.0,0.0
497,283033,13: The Musical (2022),0.0,0.000000,0.0,0.795532,0.465484,0.000000,0.0,0.387883,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0
498,283705,18 Kilohertz (2020),0.0,0.000000,0.0,0.000000,0.000000,0.615103,0.0,0.339016,...,0.000000,0.0,0.0,0.711840,0.0,0.000000,0.000000,0.000000,0.0,0.0


In [None]:
# Attach the tag TF-IDF columns. Some tag tokens coincide with genre names
# (for example `action` and `war`), so both sides contain columns with the
# same name. Label those explicitly instead of relying on pandas' default
# `_x` / `_y` suffixes, which do not say which feature set a column is from.
result = pd.merge(
    result,
    all_tags,
    on='title',
    how='left',
    suffixes=('_genre', '_tag'),
)

In [None]:
result.shape

(500, 272)

In [None]:
# One row per individual rating, extended with that movie's feature columns.
data = full_ratings.merge(result, on='movieId', how='left')

In [None]:
# Remove the identifier columns so that only the target and features remain.
data = data.drop(columns=['userId', 'movieId', 'title'])

In [None]:
data

Unnamed: 0,rating,average_ratings,action_x,adventure_x,animation,children_x,comedy_x,crime_x,documentary_x,drama_x,...,truestory,undercover,usa,violence,voiceovernarration,war_y,womandirector,writer,xxcriterioncollection,youtube
0,3.5,3.428957,0.000000,0.559550,0.582532,0.589542,0.000000,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0
1,4.0,3.089913,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,1.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0
2,3.5,3.554326,0.322021,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.007316,0.004918,0.21146,0.000000,0.000000,0.0,0.0
3,3.0,3.330183,0.603800,0.000000,0.000000,0.000000,0.439753,0.664863,0.0,0.000000,...,0.0,0.029212,0.053413,0.021730,0.000000,0.00000,0.000000,0.000000,0.0,0.0
4,3.0,3.046248,0.000000,0.633708,0.000000,0.667674,0.390672,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407435,0.5,3.147814,0.000000,0.000000,0.000000,0.000000,0.596203,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.023206,0.000000,0.0,0.0
407436,1.5,2.756709,0.518724,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.314810,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.006673,0.0,0.0
407437,3.0,3.996565,0.000000,0.657053,0.000000,0.000000,0.000000,0.000000,0.0,0.337535,...,0.0,0.000000,0.001675,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0
407438,1.0,3.428957,0.000000,0.559550,0.582532,0.589542,0.000000,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0


In [None]:
#data.to_csv('data.csv', index=False)

In [None]:
#data = pd.read_csv('/content/data.csv')

In [None]:
# Separate the target from the features. `data` itself is left untouched so
# that this cell can be re-run: dropping `rating` from `data` in place would
# make a second run fail with KeyError.
y = data['rating']
X = data.drop(columns=['rating'])

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Hold out 20% of the rating rows for evaluation; the seed fixes the split.
# NOTE(review): `average_ratings` was computed from ALL ratings before this
# split, so every held-out rating already contributes to one of its own
# features. The error reported below is therefore likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Ordinary least squares on the training rows (`fit` returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Score on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [None]:
mse

0.9421503275107974

In [None]:
rmse

0.9706442847463727