# Movie recommendation app
## Concatenate movie dataframes

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Concatenate dataframes
movies = pd.read_csv("../data/tmdb_5000_movies.csv")
credits = pd.read_csv("../data/tmdb_5000_credits.csv")
df = pd.concat((movies, credits[["cast", "crew"]]), axis=1)

# Save to disk
df.to_csv("../data/tmdb_5000_all.csv")

## Format movie data

In [2]:
# Format movie data so that each movie title is associated with a string reporting its info.
import utils.helper as helper
import importlib

importlib.reload(helper)

df = helper.format_data("../data/tmdb_5000_all.csv")


## Build the similarity matrix

In [3]:
import nltk
from nltk.corpus import stopwords
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity

# Download nltk to get stopwords english dictionary
nltk.download('stopwords')

# Use TfIdf to build the vector space
stopwords.words('english')
stop_words = ["id", "name"]
stop_words = stop_words.append(stopwords.words('english'))
vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=300)
X = vectorizer.fit_transform(df["info"])

# Build the sparse matrix
sparse = pd.DataFrame.sparse.from_spmatrix(X)

# Build the cosine similarity matrix and save it to disk
cos_sim = cosine_similarity(sparse.values, sparse.values)
cos_df = pd.DataFrame(index=df["index"], columns=df["index"],data=cos_sim)
cos_df.to_csv("../data/movie_similarity.csv") 


[nltk_data] Downloading package stopwords to /home/mattia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
movies = pd.read_csv("../data/movie_similarity.csv")

In [9]:
tmp = cos_df["Avatar"].sort_values(ascending=False)[1:11]
for i in tmp.index.values:
    print(i)

The Hunger Games: Mockingjay - Part 1
Jupiter Ascending
The Martian
The Amazing Spider-Man
Spectre
Captain America: The First Avenger
2012
The Girl with the Dragon Tattoo
Battle: Los Angeles
300: Rise of an Empire
