In [30]:
#import packages
import requests
from bs4 import BeautifulSoup
from dateutil.parser import parse
import concurrent.futures
import pandas as pd

In [31]:
# Maximum number of threads 
MAX_THREADS = 50

#create empty lists
movie_title_arr = []
movie_year_arr = []
movie_genre_arr = []
movie_synopsis_arr =[]

#### Following functions for scraping of IMDB comes from: https://python.plainenglish.io/how-to-scrape-imdb-data-9d7535b98576 (Shiv Kumar Ganesh)

In [33]:
#get title
def getMovieTitle(header):
    try:
        return header[0].find("a").getText()
    except:
        return 'NA'
#get release year
def getReleaseYear(header):
    try:
        return header[0].find("span",  {"class": "lister-item-year text-muted unbold"}).getText()
    except:
        return 'NA'
#get genre
def getGenre(muted_text):
    try:
        return muted_text.find("span",  {"class":  "genre"}).getText()
    except:
        return 'NA'
#get synopsis
def getsynopsys(movie):
    try:
        return movie.find_all("p", {"class":  "text-muted"})[1].getText()
    except:
        return 'NA'

In [34]:
#https://python.plainenglish.io/how-to-scrape-imdb-data-9d7535b98576 (continued)
#scrape imdb with beautifulsoup
def main(imdb_url):
    response = requests.get(imdb_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Movie Name
    movies_list  = soup.find_all("div", {"class": "lister-item mode-advanced"})
    
    for movie in movies_list:
        header = movie.find_all("h3", {"class":  "lister-item-header"})
        muted_text = movie.find_all("p", {"class":  "text-muted"})[0]
        
        #  Movie Title
        movie_title =  getMovieTitle(header)
        movie_title_arr.append(movie_title)
        
        #  Movie release year
        year = getReleaseYear(header)
        movie_year_arr.append(year)
        
        #  Genre  of movie
        genre = getGenre(muted_text)
        movie_genre_arr.append(genre)
        
        # Movie Synopsys
        synopsis = getsynopsys(movie)
        movie_synopsis_arr.append(synopsis)
        
        

In [35]:
#https://python.plainenglish.io/how-to-scrape-imdb-data-9d7535b98576 (continued)
# Create list to store all the URLs that are being queried
imageArr = []

# Maximum number of pages to iterate over
MAX_PAGE =51

# Loop to generate all the URLS.
for i in range(0,MAX_PAGE):
    totalRecords = 0 if i==0 else (250*i)+1
    print(totalRecords)
    imdb_url = f'https://www.imdb.com/search/title/?release_date=2020-01-02,2021-02-01&user_rating=4.0,10.0&languages=en&count=250&start={totalRecords}&ref_=adv_nxt'
    imageArr.append(imdb_url)

0
251
501
751
1001
1251
1501
1751
2001
2251
2501
2751
3001
3251
3501
3751
4001
4251
4501
4751
5001
5251
5501
5751
6001
6251
6501
6751
7001
7251
7501
7751
8001
8251
8501
8751
9001
9251
9501
9751
10001
10251
10501
10751
11001
11251
11501
11751
12001
12251
12501


In [36]:
#https://python.plainenglish.io/how-to-scrape-imdb-data-9d7535b98576 (continued)
def download_stories(story_urls):
    threads = min(MAX_THREADS, len(story_urls))
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        executor.map(main, story_urls)

In [37]:
#https://python.plainenglish.io/how-to-scrape-imdb-data-9d7535b98576 (continued)
# Call the download function with the array of URLS called imageArr
download_stories(imageArr)

# Attach all the data to the pandas dataframe. 
df = pd.DataFrame({
    "Title": movie_title_arr,
    "Release_Year": movie_year_arr,
    "Genre": movie_genre_arr,
    "Synopsis": movie_synopsis_arr,
})

print('--------- Download Complete CSV Formed --------')


df.head()

--------- Download Complete CSV Formed --------


Unnamed: 0,Title,Release_Year,Genre,Synopsis
0,Gangs of London,(2020– ),"\nAction, Crime, Drama",\nTells the story of London being torn apart b...
1,Ted Lasso,(2020– ),"\nComedy, Drama, Sport",\nAmerican college football coach Ted Lasso he...
2,Big Sky,(2020– ),"\nCrime, Drama, Mystery",\nA private detective teams up with an ex-cop ...
3,Avenue 5,(2020– ),"\nComedy, Sci-Fi","\nThe troubled crew of Avenue 5, a space cruis..."
4,Hubie Halloween,(2020),"\nComedy, Mystery",\nDespite his devotion to his hometown of Sale...


#### The next part of the notebook follows Emma Grimaldi's work from https://towardsdatascience.com/how-to-build-from-scratch-a-content-based-movie-recommender-with-natural-language-processing-25ad400eb243

In [38]:
# making the genre column into a list of words 
df['Genre'] = df['Genre'].map(lambda x: x.lower().split(','))


In [39]:
df.head()

Unnamed: 0,Title,Release_Year,Genre,Synopsis
0,Gangs of London,(2020– ),"[\naction, crime, drama ]",\nTells the story of London being torn apart b...
1,Ted Lasso,(2020– ),"[\ncomedy, drama, sport ]",\nAmerican college football coach Ted Lasso he...
2,Big Sky,(2020– ),"[\ncrime, drama, mystery ]",\nA private detective teams up with an ex-cop ...
3,Avenue 5,(2020– ),"[\ncomedy, sci-fi ]","\nThe troubled crew of Avenue 5, a space cruis..."
4,Hubie Halloween,(2020),"[\ncomedy, mystery ]",\nDespite his devotion to his hometown of Sale...


In [40]:
#import more packages
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# creating a new column called Key Words
df['Key_words'] = ""

for index, row in df.iterrows():
    plot = row['Synopsis']
    
    # Rake, by default, uses English stopwords from NLTK package
    # and discards all punctuation 
    r = Rake()

    # passing the synopsis text in order to extract key words 
    r.extract_keywords_from_text(plot)

    # getting a dictionary with key words and their scores
    key_words_dict_scores = r.get_word_degrees()
    
    # putting the key words in the new column
    row['Key_words'] = list(key_words_dict_scores.keys())

# dropping the synopsis column
df.drop(columns = ['Synopsis'], inplace = True)

In [41]:
df.head()

Unnamed: 0,Title,Release_Year,Genre,Key_words
0,Gangs of London,(2020– ),"[\naction, crime, drama ]","[tells, story, london, torn, apart, turbulent,..."
1,Ted Lasso,(2020– ),"[\ncomedy, drama, sport ]","[american, college, football, coach, ted, lass..."
2,Big Sky,(2020– ),"[\ncrime, drama, mystery ]","[private, detective, teams, ex, cop, solve, ki..."
3,Avenue 5,(2020– ),"[\ncomedy, sci-fi ]","[troubled, crew, avenue, 5, space, cruise, shi..."
4,Hubie Halloween,(2020),"[\ncomedy, mystery ]","[despite, devotion, hometown, salem, halloween..."


In [42]:
#set Title as index
#df.set_index('Title', inplace = True)
#df.head()

Unnamed: 0_level_0,Release_Year,Genre,Key_words
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Gangs of London,(2020– ),"[\naction, crime, drama ]","[tells, story, london, torn, apart, turbulent,..."
Ted Lasso,(2020– ),"[\ncomedy, drama, sport ]","[american, college, football, coach, ted, lass..."
Big Sky,(2020– ),"[\ncrime, drama, mystery ]","[private, detective, teams, ex, cop, solve, ki..."
Avenue 5,(2020– ),"[\ncomedy, sci-fi ]","[troubled, crew, avenue, 5, space, cruise, shi..."
Hubie Halloween,(2020),"[\ncomedy, mystery ]","[despite, devotion, hometown, salem, halloween..."


In [43]:
#we'll use the 'bag of words' method for content based recommendations.

#create new column for bag of words
#words in the columns get put together in this new column
df['bag_of_words'] = ''
columns = df.columns
for index, row in df.iterrows():
    words = ''
    for col in columns:
        if col != 'Release_Year':
            words = words + ' '.join(row[col])+ ' '
        else:
            words = words + row[col]+ ' '
    row['bag_of_words'] = words
    
df.drop(columns = [col for col in df.columns if col!= 'bag_of_words'], inplace = True)

In [61]:
df.head()

Unnamed: 0_level_0,bag_of_words
Title,Unnamed: 1_level_1
Gangs of London,(2020– ) \naction crime drama te...
Ted Lasso,(2020– ) \ncomedy drama sport am...
Big Sky,(2020– ) \ncrime drama mystery p...
Avenue 5,(2020– ) \ncomedy sci-fi troubled...
Hubie Halloween,(2020) \ncomedy mystery despite d...


In [52]:
# creating the the count matrix for the bag of words
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

# creating a series for the movie titles so they are associated with an ordered numerical list
# which will be used use later to match the indexes
indices = pd.Series(df.index)
indices

0                 Gangs of London
1                       Ted Lasso
2                         Big Sky
3                        Avenue 5
4                 Hubie Halloween
                   ...           
12745           The World to Come
12746    Ma Rainey's Black Bottom
12747      Black Panther: Origins
12748                  Love Sarah
12749                      Mayday
Name: Title, Length: 12750, dtype: object

In [46]:
# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.13693064, 0.22645541, ..., 0.20412415, 0.13693064,
        0.04166667],
       [0.13693064, 1.        , 0.12403473, ..., 0.0559017 , 0.2       ,
        0.04564355],
       [0.22645541, 0.12403473, 1.        , ..., 0.06933752, 0.12403473,
        0.1132277 ],
       ...,
       [0.20412415, 0.0559017 , 0.06933752, ..., 1.        , 0.0559017 ,
        0.        ],
       [0.13693064, 0.2       , 0.12403473, ..., 0.0559017 , 1.        ,
        0.04564355],
       [0.04166667, 0.04564355, 0.1132277 , ..., 0.        , 0.04564355,
        1.        ]])

In [47]:
# function that takes in movie title as input and returns the top 10 recommended movies
def recommendations(title, cosine_sim = cosine_sim):
    
    recommended_movies = []
    
    # gettin the index of the movie that matches the title
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_movies.append(list(df.index)[i])
        
    return recommended_movies

In [72]:
#example - If I enter Psycho, the top 10 movie recs are below
recommendations('Psycho')

['Jump',
 'The Campaign Against the Climate',
 'Valentines Date',
 'Al Hakawati',
 'Ashens and the Polybius Heist',
 'Sunburn',
 'Oh Deer',
 'Indestructible: No Place to Hide',
 'Madam',
 'The Hunt']