In [3]:

import streamlit as st
import pandas as pd

def prepare_data():
    links_df = pd.read_csv("datasets/ml-latest-small/links.csv")
    movies_df = pd.read_csv("datasets/ml-latest-small/movies.csv")
    ratings_df = pd.read_csv("datasets/ml-latest-small/ratings.csv")
    tags_df = pd.read_csv("datasets/ml-latest-small/tags.csv")
    return (links_df, movies_df, ratings_df, tags_df)

In [4]:
links_df, movies_df, ratings_df, tags_df = prepare_data()

In [64]:
links_df

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [15]:
movies_df.query("movieId == 193581")

Unnamed: 0,movieId,title,genres
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy


In [48]:
DEFAULT_MOOD = "Choose a Mood"
RANDOM_MOOD = "Surprise Me!"
N_MOVIES = 5
def n_top_movies_weighted(names_df, ratings_df, n_top=10, mode=RANDOM_MOOD):

    rating_info = (
        ratings_df
        .groupby("movieId")
        .agg(rate_mean=("rating", "mean"), rate_count=("rating", "count"))
        .reset_index()
    )
    rating_info["weighted_score"] = (
        rating_info.rate_count/rating_info.rate_count.sum()) * rating_info.rate_mean

    rating_info = rating_info.merge(names_df, how="left")
    if mode != RANDOM_MOOD:
        rating_info = rating_info.query("genres.str.contains(@mode)")

    rating_info = rating_info.nlargest(n_top, "weighted_score")

    # return rating_info[["title", "genres", "weighted_score", "rate_mean", "rate_count"]].reset_index().drop(columns="index")
    return rating_info[["title", "genres", "movieId"]].reset_index().drop(columns="index")

def construct_imdb_url(movie_id):
    #movie_id = movie_row.movieId
    imdb_tag = int(links_df.query("movieId == @movie_id").imdbId)
    imdb_tag = str(imdb_tag).rjust(7,"0")
    imdb_tag = "tt"+imdb_tag
    
    url= "https://www.imdb.com/title/"+imdb_tag
    return(url)

#construct_imdb_url(193581)

In [31]:
n_top_movies_weighted(movies_df,ratings_df,mode="Romance")

Unnamed: 0,title,genres,movieId
0,Forrest Gump (1994),Comedy|Drama|Romance|War,356
1,American Beauty (1999),Drama|Romance,2858
2,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...,4306
3,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller,380
4,Speed (1994),Action|Romance|Thriller,377
5,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,1197
6,Good Will Hunting (1997),Drama|Romance,1704
7,Groundhog Day (1993),Comedy|Fantasy|Romance,1265
8,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,595
9,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,7361


In [32]:
top_movies_list = n_top_movies_weighted(
            n_top=N_MOVIES*2, names_df=movies_df, ratings_df=ratings_df, mode="Romance")
top_movies_list

Unnamed: 0,title,genres,movieId
0,Forrest Gump (1994),Comedy|Drama|Romance|War,356
1,American Beauty (1999),Drama|Romance,2858
2,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...,4306
3,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller,380
4,Speed (1994),Action|Romance|Thriller,377
5,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,1197
6,Good Will Hunting (1997),Drama|Romance,1704
7,Groundhog Day (1993),Comedy|Fantasy|Romance,1265
8,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,595
9,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,7361


In [None]:
top_movies_list["title"] = top_movies_list.apply(
            lambda x: f'<a href={construct_imdb_url(x.movieId)}>{x.title}</a>')

In [37]:
top_movies_list

Unnamed: 0,title,genres,movieId
0,Forrest Gump (1994),Comedy|Drama|Romance|War,356
1,American Beauty (1999),Drama|Romance,2858
2,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...,4306
3,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller,380
4,Speed (1994),Action|Romance|Thriller,377
5,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,1197
6,Good Will Hunting (1997),Drama|Romance,1704
7,Groundhog Day (1993),Comedy|Fantasy|Romance,1265
8,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,595
9,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,7361


In [62]:
list_temp = top_movies_list.copy()

In [65]:
construct_imdb_url(1)

'https://www.imdb.com/title/tt0114709'

In [63]:
list_temp.title=list_temp.apply(lambda x: f'<a href="{construct_imdb_url(x.movieId)}">{x.title}</a>',axis=1)
list_temp.style.format()


Unnamed: 0,title,genres,movieId
0,Forrest Gump (1994),Comedy|Drama|Romance|War,356
1,American Beauty (1999),Drama|Romance,2858
2,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Romance,4306
3,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller,380
4,Speed (1994),Action|Romance|Thriller,377
5,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,1197
6,Good Will Hunting (1997),Drama|Romance,1704
7,Groundhog Day (1993),Comedy|Fantasy|Romance,1265
8,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,595
9,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,7361


In [42]:
top_movies_list.movieId

0     356
1    2858
2    4306
3     380
4     377
5    1197
6    1704
7    1265
8     595
9    7361
Name: movieId, dtype: int64

In [46]:
top_movies_list.apply(construct_imdb_url,axis=1)

<class 'pandas.core.series.Series'>
title           Forrest Gump (1994)
genres     Comedy|Drama|Romance|War
movieId                         356
Name: 0, dtype: object
<class 'pandas.core.series.Series'>
title      American Beauty (1999)
genres              Drama|Romance
movieId                      2858
Name: 1, dtype: object
<class 'pandas.core.series.Series'>
title                                           Shrek (2001)
genres     Adventure|Animation|Children|Comedy|Fantasy|Ro...
movieId                                                 4306
Name: 2, dtype: object
<class 'pandas.core.series.Series'>
title                              True Lies (1994)
genres     Action|Adventure|Comedy|Romance|Thriller
movieId                                         380
Name: 3, dtype: object
<class 'pandas.core.series.Series'>
title                 Speed (1994)
genres     Action|Romance|Thriller
movieId                        377
Name: 4, dtype: object
<class 'pandas.core.series.Series'>
title          

0    https://www.imdb.com/title/tt0109830
1    https://www.imdb.com/title/tt0169547
2    https://www.imdb.com/title/tt0126029
3    https://www.imdb.com/title/tt0111503
4    https://www.imdb.com/title/tt0111257
5    https://www.imdb.com/title/tt0093779
6    https://www.imdb.com/title/tt0119217
7    https://www.imdb.com/title/tt0107048
8    https://www.imdb.com/title/tt0101414
9    https://www.imdb.com/title/tt0338013
dtype: object

In [11]:
links_df

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [4]:
chosen_movie_title = "Star Trek (2009)"

In [8]:
int(movies_df.query("title == @chosen_movie_title").movieId)

68358

In [9]:
rating = (
            ratings_df
            .groupby("movieId")
            .agg(rate_mean=("rating", "mean"), rate_count=("rating", "count"))
            .reset_index())
m= rating.rate_count.quantile(0.95)
c= rating.rate_mean.mean()

In [66]:
rating

Unnamed: 0,movieId,rate_mean,rate_count
0,1,3.920930,215
1,2,3.431818,110
2,3,3.259615,52
3,4,2.357143,7
4,5,3.071429,49
...,...,...,...
9719,193581,4.000000,1
9720,193583,3.500000,1
9721,193585,3.500000,1
9722,193587,3.500000,1


In [25]:
len(movies_df.movieId.unique())

9742

In [26]:
import streamlit as st
import pandas as pd
import numpy as np
import time
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
import requests

url="https://www.imdb.com/title/tt0109830/"
headers = {'Accept-Language': 'en-US,en;q=0.8','User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers = headers)

response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

In [27]:
image_url = (soup
 .select("div.ipc-poster.ipc-poster--baseAlt.ipc-poster--dynamic-width.sc-30a29d44-0.dktfIa.celwidget.ipc-sub-grid-item")[0]
 .select("a")[0]
 .get("href"))
image_url = "https://www.imdb.com"+image_url
image_url

'https://www.imdb.com/title/tt0109830/mediaviewer/rm1954748672/?ref_=tt_ov_i'

In [33]:

response2 = requests.get(image_url, headers = headers)
response2.raise_for_status()
soup2 = BeautifulSoup(response2.content, "html.parser")
soup2.select("div.media-viewer")[0].select("img")[0].get("srcset").split(" ")[0]

'https://m.media-amazon.com/images/M/MV5BNWIwODRlZTUtY2U3ZS00Yzg1LWJhNzYtMmZiYmEyNmU1NjMzXkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_FMjpg_UY463_.jpg'

In [20]:
import urllib.request
import requests
from bs4 import BeautifulSoup


def construct_imdb_url(movie_id):
    imdb_tag = int(links_df.query("movieId == @movie_id").imdbId)
    imdb_tag = str(imdb_tag).rjust(7, "0")
    imdb_tag = "tt"+imdb_tag

    url = "https://www.imdb.com/title/"+imdb_tag
    return (url)
#-----------------------------------------------------------------------------------------#
def scrap_movie_data (movie_id):
    url = construct_imdb_url(movie_id)
    headers = {'Accept-Language': 'en-US,en;q=0.8','User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers = headers)

    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    image_url = (soup
                 .select("div.ipc-poster.ipc-poster--baseAlt.ipc-poster--dynamic-width.sc-30a29d44-0.dktfIa.celwidget.ipc-sub-grid-item")[0]
                 .select("a")[0]
                 .get("href"))
    image_url = "https://www.imdb.com"+image_url
    response2 = requests.get(image_url, headers = headers)
    response2.raise_for_status()
    soup2 = BeautifulSoup(response2.content, "html.parser")
    image_jpg_url = soup2.select("div.media-viewer")[0].select("img")[0].get("src")
    file_path = "movies_images/"+str(movie_id)+".jpg"
    urllib.request.urlretrieve(image_jpg_url, file_path)
    #return st.image(image_jpg,width=100)
    #return st.markdown("![image_title](image_jpg)")
    #return image_jpg


In [21]:
for movie_id in movies_df.movieId:
    scrap_movie_data(movie_id)

KeyboardInterrupt: 

In [36]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [40]:
movie_test=movies_df.copy()
movie_test.rename(columns={"movieId":"MovieID"})


Unnamed: 0,MovieID,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [39]:
movie_test=movies_df.copy()
movie_test.columns = ["MovieId","Title","Genres"]
movie_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MovieId  9742 non-null   int64 
 1   Title    9742 non-null   object
 2   Genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
