In [1]:
import pandas as pd
import numpy as np

cred = pd.read_csv('./data/tmdb_5000_credits.csv')
movies = pd.read_csv('./data/tmdb_5000_movies.csv')

In [2]:
cred.columns = ['id','tittle','cast','crew']
df= movies.merge(cred,on='id')

In [19]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [16]:
df['id']

0        19995
1          285
2       206647
3        49026
4        49529
         ...  
4798      9367
4799     72766
4800    231617
4801    126186
4802     25975
Name: id, Length: 4803, dtype: int64

In [18]:
df[df['id'] == 206647]['title']

2    Spectre
Name: title, dtype: object

In [32]:
C = df['vote_average'].mean()
C

6.092171559442016

In [33]:
m = df['vote_count'].quantile(0.9)
m

1838.4000000000015

In [34]:
q_movies = df.copy().loc[df['vote_count'] >= m]
q_movies.shape

(481, 23)

In [35]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']

    return (v/(v+m) * R) + (m/(m+v) * C)

In [36]:
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)

In [37]:
q_movies = q_movies.sort_values('score', ascending=False)

In [38]:
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(10)

Unnamed: 0,title,vote_count,vote_average,score
1881,The Shawshank Redemption,8205,8.5,8.059258
662,Fight Club,9413,8.3,7.939256
65,The Dark Knight,12002,8.2,7.92002
3232,Pulp Fiction,8428,8.3,7.904645
96,Inception,13752,8.1,7.863239
3337,The Godfather,5893,8.4,7.851236
95,Interstellar,10867,8.1,7.809479
809,Forrest Gump,7927,8.2,7.803188
329,The Lord of the Rings: The Return of the King,8064,8.1,7.727243
1990,The Empire Strikes Back,5879,8.2,7.697884


In [39]:
df['overview'].head()

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
tfidf = TfidfVectorizer(stop_words='english')
df['overview'] = df['overview'].fillna('')
tfidf_matrix = tfidf.fit_transform(df['overview'])
tfidf_matrix.shape

(4803, 20978)

In [46]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [47]:
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [55]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [56]:
get_recommendations('The Dark Knight Rises')

65                              The Dark Knight
299                              Batman Forever
428                              Batman Returns
1359                                     Batman
3854    Batman: The Dark Knight Returns, Part 2
119                               Batman Begins
2507                                  Slow Burn
9            Batman v Superman: Dawn of Justice
1181                                        JFK
210                              Batman & Robin
Name: title, dtype: object

In [None]:
def _weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

def cal_cosine_sim(df):
    C = df['vote_average'].mean()
    m = df['vote_count'].quantile(0.9)
    q_movies = df.copy().loc[df['vote_count'] >= m]
    q_movies['score'] = q_movies.apply(_weighted_rating, axis=1)
    q_movies = q_movies.sort_values('score', ascending=False)
    tfidf = TfidfVectorizer(stop_words='english')
    df['overview'] = df['overview'].fillna('')
    tfidf_matrix = tfidf.fit_transform(df['overview'])
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    indices = pd.Series(df.index, index=df['title']).drop_duplicates()
    return indices, cosine_sim

def get_recommendations(title, indices, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [65]:
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(literal_eval)

In [66]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [68]:
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [69]:
df['director'] = df['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(get_list)

In [70]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [71]:
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df[feature] = df[feature].apply(clean_data)

In [72]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

df['soup'] = df.apply(create_soup, axis=1)

In [74]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [75]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [77]:
df = df.reset_index()
indices = pd.Series(df.index, index=df['title'])

In [None]:
get_recommendations('The Dark Knight Rises', cosine_sim2)

In [7]:
response = {
  "dates": {
    "maximum": "2023-12-06",
    "minimum": "2023-10-25"
  },
  "page": 1,
  "results": [
    {
      "adult": False,
      "backdrop_path": "/t5zCBSB5xMDKcDqe91qahCOUYVV.jpg",
      "genre_ids": [
        27,
        9648
      ],
      "id": 507089,
      "original_language": "en",
      "original_title": "Five Nights at Freddy's",
      "overview": "Recently fired and desperate for work, a troubled young man named Mike agrees to take a position as a night security guard at an abandoned theme restaurant: Freddy Fazbear's Pizzeria. But he soon discovers that nothing at Freddy's is what it seems.",
      "popularity": 815.968,
      "poster_path": "/j9mH1pr3IahtraTWxVEMANmPSGR.jpg",
      "release_date": "2023-11-15",
      "title": "Five Nights at Freddy's",
      "video": False,
      "vote_average": 7.852,
      "vote_count": 2588
    },
    {
      "adult": False,
      "backdrop_path": "/f1AQhx6ZfGhPZFTVKgxG91PhEYc.jpg",
      "genre_ids": [
        18,
        36,
        10752
      ],
      "id": 753342,
      "original_language": "en",
      "original_title": "Napoleon",
      "overview": "An epic that details the checkered rise and fall of French Emperor Napoleon Bonaparte and his relentless journey to power through the prism of his addictive, volatile relationship with his wife, Josephine.",
      "popularity": 547.172,
      "poster_path": "/jE5o7y9K6pZtWNNMEw3IdpHuncR.jpg",
      "release_date": "2023-12-06",
      "title": "Napoleon",
      "video": False,
      "vote_average": 6.456,
      "vote_count": 398
    },
    {
      "adult": False,
      "backdrop_path": "/5a4JdoFwll5DRtKMe7JLuGQ9yJm.jpg",
      "genre_ids": [
        28,
        10749,
        18
      ],
      "id": 695721,
      "original_language": "en",
      "original_title": "The Hunger Games: The Ballad of Songbirds & Snakes",
      "overview": "64 years before he becomes the tyrannical president of Panem, Coriolanus Snow sees a chance for a change in fortunes when he mentors Lucy Gray Baird, the female tribute from District 12.",
      "popularity": 468.282,
      "poster_path": "/mBaXZ95R2OxueZhvQbcEWy2DqyO.jpg",
      "release_date": "2023-11-15",
      "title": "The Hunger Games: The Ballad of Songbirds & Snakes",
      "video": False,
      "vote_average": 7.315,
      "vote_count": 483
    },
    {
      "adult": False,
      "backdrop_path": "/feSiISwgEpVzR1v3zv2n2AU4ANJ.jpg",
      "genre_ids": [
        878,
        12,
        28
      ],
      "id": 609681,
      "original_language": "en",
      "original_title": "The Marvels",
      "overview": "Carol Danvers, aka Captain Marvel, has reclaimed her identity from the tyrannical Kree and taken revenge on the Supreme Intelligence. But unintended consequences see Carol shouldering the burden of a destabilized universe. When her duties send her to an anomalous wormhole linked to a Kree revolutionary, her powers become entangled with that of Jersey City super-fan Kamala Khan, aka Ms. Marvel, and Carol’s estranged niece, now S.A.B.E.R. astronaut Captain Monica Rambeau. Together, this unlikely trio must team up and learn to work in concert to save the universe.",
      "popularity": 422.261,
      "poster_path": "/Ag3D9qXjhJ2FUkrlJ0Cv1pgxqYQ.jpg",
      "release_date": "2023-11-08",
      "title": "The Marvels",
      "video": False,
      "vote_average": 6.6,
      "vote_count": 576
    },
    {
      "adult": False,
      "backdrop_path": "/iiXliCeykkzmJ0Eg9RYJ7F2CWSz.jpg",
      "genre_ids": [
        28,
        53,
        80
      ],
      "id": 762430,
      "original_language": "en",
      "original_title": "Retribution",
      "overview": "When a mysterious caller puts a bomb under his car seat, Matt Turner begins a high-speed chase across the city to complete a specific series of tasks- all with his kids trapped in the back seat.",
      "popularity": 308.888,
      "poster_path": "/oUmmY7QWWn7OhKlcPOnirHJpP1F.jpg",
      "release_date": "2023-12-06",
      "title": "Retribution",
      "video": False,
      "vote_average": 6.988,
      "vote_count": 694
    }
  ],
  "total_pages": 5,
  "total_results": 93
}

In [9]:
response.keys()

dict_keys(['dates', 'page', 'results', 'total_pages', 'total_results'])

In [27]:
total_result = pd.DataFrame()
for idx in response['results']:
    sub_result = pd.DataFrame({'movie_id' : [idx['id']],
                  'overview' : [idx['overview']]})
    
    total_result = pd.concat([total_result, sub_result], axis=0)

In [28]:
total_result

Unnamed: 0,movie_id,overview
0,507089,"Recently fired and desperate for work, a troub..."
0,753342,An epic that details the checkered rise and fa...
0,695721,64 years before he becomes the tyrannical pres...
0,609681,"Carol Danvers, aka Captain Marvel, has reclaim..."
0,762430,When a mysterious caller puts a bomb under his...


In [13]:
movie_id = response['results'][0]['id']
overview = response['results'][0]['overview']

"Recently fired and desperate for work, a troubled young man named Mike agrees to take a position as a night security guard at an abandoned theme restaurant: Freddy Fazbear's Pizzeria. But he soon discovers that nothing at Freddy's is what it seems."

In [23]:
for i in range(2, response['total_pages']+1):
    print(i)

2
3
4
5
