# Movie recommendation app
## Concatenate movie dataframes

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Concatenate dataframes
movies = pd.read_csv("../data/tmdb_5000_movies.csv")
credits = pd.read_csv("../data/tmdb_5000_credits.csv")
df = pd.concat((movies, credits[["cast", "crew"]]), axis=1)

# Save to disk
df.to_csv("../data/tmdb_5000_all.csv")

## Format movie data

In [2]:
# Format movie data so that each movie title is associated with a string reporting its info.
import utils.helper as helper
import importlib

importlib.reload(helper)

df = helper.format_data("../data/tmdb_5000_all.csv")


## Build the similarity matrix

In [3]:
import nltk
from nltk.corpus import stopwords
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity

# Download nltk to get stopwords english dictionary
nltk.download('stopwords')

# Use TfIdf to build the vector space
stopwords.words('english')
stop_words = ["id", "name"]
stop_words = stop_words.append(stopwords.words('english'))
vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=300)
X = vectorizer.fit_transform(df["info"])

# Build the sparse matrix
sparse = pd.DataFrame.sparse.from_spmatrix(X)

# Build the cosine similarity matrix and save it to disk
cos_sim = cosine_similarity(sparse.values, sparse.values)
cos_df = pd.DataFrame(index=df["index"], columns=df["index"],data=cos_sim)
cos_df.to_csv("../data/movie_similarity.csv") 


[nltk_data] Downloading package stopwords to /home/mattia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
movies = pd.read_csv("../data/movie_similarity.csv")

In [18]:
movies

Unnamed: 0,index,Avatar,Pirates of the Caribbean: At World's End,Spectre,The Dark Knight Rises,John Carter,Spider-Man 3,Tangled,Avengers: Age of Ultron,Harry Potter and the Half-Blood Prince,...,On The Downlow,Sanctuary: Quite a Conundrum,Bang,Primer,Cavite,El Mariachi,Newlyweds,"Signed, Sealed, Delivered",Shanghai Calling,My Date with Drew
0,Avatar,1.000000,0.905727,0.981379,0.946912,0.917639,0.836681,0.929121,0.936361,0.891369,...,0.807459,0.777603,0.759685,0.829812,0.222995,0.831986,0.863172,0.859034,0.627765,0.704060
1,Pirates of the Caribbean: At World's End,0.905727,1.000000,0.914234,0.924296,0.877692,0.864977,0.856243,0.960873,0.956901,...,0.910790,0.859213,0.843405,0.929433,0.258891,0.893447,0.910006,0.923213,0.697707,0.777417
2,Spectre,0.981379,0.914234,1.000000,0.935647,0.935860,0.816609,0.934548,0.946347,0.899780,...,0.818356,0.784650,0.768305,0.839208,0.234065,0.846698,0.877438,0.873338,0.646884,0.712834
3,The Dark Knight Rises,0.946912,0.924296,0.935647,1.000000,0.900298,0.918598,0.866292,0.950642,0.916833,...,0.845031,0.802608,0.794252,0.861880,0.229695,0.848672,0.883448,0.872854,0.656633,0.726299
4,John Carter,0.917639,0.877692,0.935860,0.900298,1.000000,0.747916,0.882316,0.899253,0.852566,...,0.738225,0.740451,0.713819,0.770119,0.243995,0.822869,0.846844,0.860805,0.600162,0.671729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4795,El Mariachi,0.831986,0.893447,0.846698,0.848672,0.822869,0.775440,0.821083,0.888845,0.870696,...,0.830175,0.833022,0.821107,0.868120,0.374723,1.000000,0.855963,0.886767,0.685875,0.757489
4796,Newlyweds,0.863172,0.910006,0.877438,0.883448,0.846844,0.803026,0.839482,0.909894,0.891059,...,0.829663,0.819754,0.801576,0.863504,0.258590,0.855963,1.000000,0.908517,0.680705,0.738050
4797,"Signed, Sealed, Delivered",0.859034,0.923213,0.873338,0.872854,0.860805,0.796448,0.846614,0.906378,0.892382,...,0.842015,0.853523,0.819745,0.880609,0.303951,0.886767,0.908517,1.000000,0.697820,0.769536
4798,Shanghai Calling,0.627765,0.697707,0.646884,0.656633,0.600162,0.657684,0.596187,0.701011,0.714090,...,0.692003,0.680769,0.694993,0.719408,0.325914,0.685875,0.680705,0.697820,1.000000,0.637923


In [21]:
tmp = movies[["index", "Avatar"]].sort_values(by="Avatar", ascending=False)[1:11]
for i in tmp["index"].values:
    print(i)

The Hunger Games: Mockingjay - Part 1
Jupiter Ascending
The Martian
The Amazing Spider-Man
Spectre
Captain America: The First Avenger
2012
The Girl with the Dragon Tattoo
Battle: Los Angeles
300: Rise of an Empire
