In [48]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [49]:
# Load the MovieLens dataset
movies = pd.read_csv('https://raw.githubusercontent.com/codeheroku/Introduction-to-Machine-Learning/master/Building%20a%20Movie%20Recommendation%20Engine/movie_dataset.csv')

# Display the first few rows of the dataset
print(movies.head())

   index     budget                                    genres  \
0      0  237000000  Action Adventure Fantasy Science Fiction   
1      1  300000000                  Adventure Fantasy Action   
2      2  245000000                    Action Adventure Crime   
3      3  250000000               Action Crime Drama Thriller   
4      4  260000000          Action Adventure Science Fiction   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  culture clash future space war space colony so...                en   
1  ocean drug abuse exotic island east india trad...                en   
2         spy based on novel sec

In [50]:
movies = movies[['title', 'overview']].dropna().reset_index(drop=True)

In [51]:
# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the movie overviews
tfidf_matrix = vectorizer.fit_transform(movies['overview'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [52]:
def get_recommendations(title, cosine_sim=cosine_sim, to_return=10):
    idx = movies.index[movies['title'] == title].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:to_return]  # Получаем 10 самых похожих фильмов
    movie_indices = [i[0] for i in sim_scores]
    return movies['genre'].iloc[movie_indices]

In [55]:
print(get_recommendations('The Matrix', to_return=10))

1281                       Hackers
2995    Mad Max Beyond Thunderdome
2088                         Pulse
1341          The Inhabited Island
333                  Transcendence
0                           Avatar
261          Live Free or Die Hard
775                      Supernova
125            The Matrix Reloaded
Name: title, dtype: object


In [61]:
# Do the same thing but with genres
movies = pd.read_csv('https://raw.githubusercontent.com/codeheroku/Introduction-to-Machine-Learning/master/Building%20a%20Movie%20Recommendation%20Engine/movie_dataset.csv')

# Preprocess the genres column
movies['genres'] = movies['genres'].fillna('').apply(lambda x: ' '.join(x.lower().replace(' ', '').split('|')))

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the movie content
tfidf_matrix = vectorizer.fit_transform(movies['genres'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

def get_recommendations(title, cosine_sim=cosine_sim, to_return=10):
    idx = movies.index[movies['title'] == title].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:to_return]  # Получаем 10 самых похожих фильмов
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]

print(get_recommendations('The Matrix', to_return=10))

223         The Chronicles of Riddick
224                           RoboCop
266                          I, Robot
415                             Dredd
582               Battle: Los Angeles
634                        The Matrix
854                          Æon Flux
1153                             Lucy
1270    Universal Soldier: The Return
Name: title, dtype: object
