In [16]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import regex as re
import ipywidgets as widgets
from IPython.display import display
from sklearn.neighbors import NearestNeighbors

moviedf = pd.read_csv("rotten_tomatoes_movies.csv")
criticsdf = pd.read_csv("rotten_tomatoes_critic_reviews.csv")

In [17]:
def clean_title(a):
    return re.sub("[^a-zA-Z0-9 ]","",a)

def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = moviedf.iloc[indices].iloc[::-1]
    
    return results

moviedf['genres'] = moviedf['genres'].fillna('Unknown')
print(moviedf['genres'].isna().sum())

0


In [18]:
moviedf["Clean Titles"] = moviedf["movie_title"].apply(clean_title)
moviedf['genres'] = moviedf['genres'].str.split(',')
moviedf['genres'] = moviedf['genres'].apply(lambda x: [genre.replace(' ', '') for genre in x])
moviedf['genres'] = moviedf['genres'].apply(lambda x: '-'.join(x))
moviedf['genres'] = moviedf['genres'].apply(lambda x: x.split('-'))
moviedf

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count,Clean Titles
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"[Action&Adventure, Comedy, Drama, ScienceFicti...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,...,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,Percy Jackson the Olympians The Lightning Thief
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,[Comedy],Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,...,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19,Please Give
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"[Comedy, Romance]",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,...,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8,10
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"[Classics, Drama]",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,...,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0,12 Angry Men Twelve Angry Men
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"[Action&Adventure, Drama, Kids&Family]",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,...,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3,20000 Leagues Under The Sea
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17707,m/zoot_suit,Zoot Suit,Mexican-American gangster Henry Reyna (Daniel ...,,R,"[Drama, Musical&PerformingArts]",Luis Valdez,Luis Valdez,"Daniel Valdez, Edward James Olmos, Charles Aid...",1981-10-02,...,Rotten,56.0,9.0,Upright,74.0,1195.0,2,5,4,Zoot Suit
17708,m/zootopia,Zootopia,From the largest elephant to the smallest shre...,The brilliantly well-rounded Zootopia offers a...,PG,"[Action&Adventure, Animation, Comedy]","Byron Howard, Rich Moore, Jared Bush","Jared Bush, Phil Johnston","J.K. Simmons, Kristen Bell, Octavia Spencer, A...",2016-03-04,...,Certified-Fresh,98.0,291.0,Upright,92.0,101511.0,50,285,7,Zootopia
17709,m/zorba_the_greek,Zorba the Greek,Traveling to inspect an abandoned mine his fat...,,NR,"[Action&Adventure, ArtHouse&International, Cla...",,,"Anthony Quinn, Alan Bates, Irene Papas, Lila K...",1964-12-17,...,Fresh,80.0,10.0,Upright,86.0,7146.0,0,8,2,Zorba the Greek
17710,m/zulu,Zulu,"In 1879, the Zulu nation hands colonial Britis...",Zulu patiently establishes a cast of colorful ...,PG,"[Classics, Drama]","Cy Endfield, Cyril Endfield","Cy Endfield, John Prebble","Stanley Baker, Jack Hawkins, Ulla Jacobsson, J...",1964-06-17,...,Fresh,96.0,23.0,Upright,91.0,30193.0,6,22,1,Zulu


In [4]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(moviedf["Clean Titles"])

In [5]:
equivalents = {
    "A+": 5,
    "A": 4.7058823528,
    "A-": 4.41176470575,
    "B+": 4.1176470587,
    "B": 3.82352941165,
    "B-": 3.5294117646,
    "C+": 3.23529411755,
    "C": 2.9411764705,
    "C-": 2.64705882345,
    "D+": 2.3529411764,
    "D": 2.05882352935,
    "D-": 1.7647058823,
    "E+": 1.47058823525,
    "E": 1.1764705882,
    "E-": 0.88235294115,
    "F+": 0.5882352941,
    "F": 0
    }

def standardize(rev):
    if "/" in str(rev):
        return float(rev[:rev.index("/")])
    else:
        score = equivalents.get(rev)
        if score is not None:
            return score
        else:
            return None

criticsdf.dropna(subset="review_score",inplace=True)
criticsdf["review_score"] = criticsdf["review_score"].apply(standardize)

In [6]:
criticsdf["top_critic"].value_counts()
#criticsdf = criticsdf[criticsdf["top_critic"] == True]

False    638064
True     186017
Name: top_critic, dtype: int64

In [7]:
criticsdf["rotten_tomatoes_link"].value_counts()

m/star_wars_the_rise_of_skywalker     672
m/solo_a_star_wars_story              634
m/star_wars_the_last_jedi             628
m/spider_man_far_from_home            624
m/ready_player_one                    606
                                     ... 
m/end_game_2018                         1
m/bad_blood_the_movie                   1
m/memories_of_the_sword                 1
m/men_of_boys_town                      1
m/everybody_knows_elizabeth_murray      1
Name: rotten_tomatoes_link, Length: 17676, dtype: int64

In [8]:
#In progress search engine. Streamlit has its own native search bar though, so most likely this code will never
#be needed and therefore will remain abandoned.


movie_input = widgets.Text(
    value='The Big Lebowski',
    description='Movie Title:',
    disabled=False
)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')


display(movie_input, movie_list)

Text(value='The Big Lebowski', description='Movie Title:')

Output()

In [9]:
movieid = "falling_down"
similar_users = criticsdf[(criticsdf["rotten_tomatoes_link"]==movieid) | (criticsdf["review_score"] > 1)]["critic_name"].unique()
similar_users

array(['Ben McEachen', 'Bill Goodykoontz', 'Jordan Hoffman', ...,
       'Linda McGee', 'Jerry Bokamper', 'Tom Mansell'], dtype=object)

In [10]:
similar_userecs = criticsdf[(criticsdf["critic_name"].isin(similar_users)) & (criticsdf["review_score"] > 1)]["rotten_tomatoes_link"]
similar_userecs = similar_userecs.value_counts() / len(similar_users)
similar_userecs

m/star_wars_the_rise_of_skywalker            0.096594
m/star_wars_the_last_jedi                    0.092190
m/solo_a_star_wars_story                     0.091603
m/spider_man_far_from_home                   0.091016
m/star_wars_episode_vii_the_force_awakens    0.087199
                                               ...   
m/altina                                     0.000147
m/dawn_patrol_2015                           0.000147
m/surf_nazis_must_die                        0.000147
m/atlas_shrugged_who_is_john_galt            0.000147
m/1194893-salaam_namaste                     0.000147
Name: rotten_tomatoes_link, Length: 17658, dtype: float64

In [11]:
all_users = criticsdf[(criticsdf["rotten_tomatoes_link"].isin(similar_userecs.index)) & (criticsdf["review_score"] > 1)]
all_user_recs = all_users["rotten_tomatoes_link"].value_counts() / len(all_users["critic_name"].unique())
all_user_recs

m/star_wars_the_rise_of_skywalker            0.096594
m/star_wars_the_last_jedi                    0.092190
m/solo_a_star_wars_story                     0.091603
m/spider_man_far_from_home                   0.091016
m/star_wars_episode_vii_the_force_awakens    0.087199
                                               ...   
m/altina                                     0.000147
m/dawn_patrol_2015                           0.000147
m/surf_nazis_must_die                        0.000147
m/atlas_shrugged_who_is_john_galt            0.000147
m/1194893-salaam_namaste                     0.000147
Name: rotten_tomatoes_link, Length: 17658, dtype: float64

In [12]:
w=1   #W is for Weight
rec_percentages = pd.concat([similar_userecs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [13]:
rec_percentages

Unnamed: 0,similar,all,score
m/star_wars_the_rise_of_skywalker,0.096594,0.096594,1.0
m/big_bad_mama,0.001615,0.001615,1.0
m/the_decline_of_western_civilization_1988,0.001615,0.001615,1.0
m/three_men_and_a_little_lady,0.001615,0.001615,1.0
m/never_apologize_a_personal_visit_with_lindsay_anderson_2007,0.001615,0.001615,1.0
...,...,...,...
m/kevin_hart_let_me_explain,0.005578,0.005578,1.0
m/sleep_dealer,0.005578,0.005578,1.0
m/see_you_yesterday,0.005578,0.005578,1.0
m/conversation,0.005578,0.005578,1.0


In [14]:
similar_userecs.head(5)
#similar_userecs = similar_userecs[similar_userecs > 0.001]

m/star_wars_the_rise_of_skywalker            0.096594
m/star_wars_the_last_jedi                    0.092190
m/solo_a_star_wars_story                     0.091603
m/spider_man_far_from_home                   0.091016
m/star_wars_episode_vii_the_force_awakens    0.087199
Name: rotten_tomatoes_link, dtype: float64