**NB01-Data-Collection**


**Scraping data for the top 97 movies from the Rotten Tomatoes editorial guide page**

In [1]:
%load_ext sql

import sqlite3
import uuid
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Source page: the Rotten Tomatoes editorial guide that this notebook scrapes
url = "https://editorial.rottentomatoes.com/guide/oscars-best-and-worst-best-pictures/"

# Browser-like User-Agent sent with the request
# (presumably needed so the site does not reject the default client -- TODO confirm)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Upper bound (seconds) on how long the request may wait for the server;
# without it, requests waits indefinitely and the cell can hang forever.
REQUEST_TIMEOUT_SECONDS = 30

# Fetch the webpage
response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
response.raise_for_status()  # Raise on 4xx/5xx instead of parsing an error page

# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Collect one <div class="row countdown-item"> element per ranked movie
rows = soup.find_all('div', {"class": "row countdown-item"})

# Fail loudly if nothing matched: otherwise the rest of the notebook would
# silently build and save empty tables (e.g. after a page layout change).
if not rows:
    raise RuntimeError(
        "No 'row countdown-item' elements found; the page layout may have changed."
    )

# Movie-level columns: one entry per scraped movie
ranks = []
titles = []
rel_years = []
ratings = []
movie_key_hdr = []

# Actor-level columns: one entry per (movie, actor) pair
movie_key_dtl = []
actors = []

# Sequential integer key that links the movie table to the actor table
movie_uuid = 0

# Loop through each countdown item and extract its fields
for row in rows:
    rank = row.find('div', {'class': 'countdown-index'}).text.strip()
    title = row.find("div", {"class": "article_movie_title"}).find('a').text.strip()

    # Characters 1-4 hold the year; assumes the text is wrapped in
    # parentheses, e.g. "(2019)" -- TODO confirm against the live page
    rel_year = row.find('span', {'class': 'subtle start-year'}).text.strip()
    rel_year = int(rel_year[1:5])

    # Score text is presumably a percentage such as "99%". Strip the sign
    # instead of slicing two characters, so "100%" parses as 100 (not 10)
    # and a one-digit score such as "5%" no longer raises ValueError.
    rating = row.find('span', {'class': 'tMeterScore'}).text.strip()
    rating = int(rating.rstrip('%'))

    # A movie without a cast block contributes no actor rows instead of
    # aborting the whole scrape with an AttributeError on None.
    cast_div = row.find('div', {'class': 'info cast'})
    if cast_div is None:
        movie_actors = []
    else:
        movie_actors = [link.text.strip() for link in cast_div.find_all('a')]

    ranks.append(rank)
    titles.append(title)
    rel_years.append(rel_year)
    ratings.append(rating)
    movie_key_hdr.append(movie_uuid)

    # Repeat the movie key once per actor so both lists stay aligned
    actors.extend(movie_actors)
    movie_key_dtl.extend([movie_uuid] * len(movie_actors))

    movie_uuid = movie_uuid + 1


# Movie-level table: one row per movie, keyed by movie_key
movie_columns = {
    'Rank': ranks,
    'Title': titles,
    'Rel_Year': rel_years,
    'Rating': ratings,
    'movie_key': movie_key_hdr,
}
top_movies_df = pd.DataFrame(movie_columns)

# Movie-to-actor link table: one row per (movie_key, actor) pair
movie_actor_columns = {
    'movie_key': movie_key_dtl,
    'actor': actors,
}
top_mov_actors_df = pd.DataFrame(movie_actor_columns)

Unnamed: 0,Rank,Title,Rel_Year,Rating,movie_key
0,#1,Parasite,2019,99,0
1,#2,Casablanca,1942,99,1
2,#3,All About Eve,1950,99,2
3,#4,On the Waterfront,1954,99,3
4,#5,Moonlight,2016,98,4
...,...,...,...,...,...
92,#93,Cavalcade,1933,66,92
93,#94,Out of Africa,1985,62,93
94,#95,The Greatest Show on Earth,1952,50,94
95,#96,Cimarron,1931,50,95


**List of the top 97 movies with rank, title, year of release, and rating**

In [5]:
# Render the movie-level table built above
# (columns: Rank, Title, Rel_Year, Rating, movie_key)
display(top_movies_df)

Unnamed: 0,Rank,Title,Rel_Year,Rating,movie_key
0,#1,Parasite,2019,99,0
1,#2,Casablanca,1942,99,1
2,#3,All About Eve,1950,99,2
3,#4,On the Waterfront,1954,99,3
4,#5,Moonlight,2016,98,4
...,...,...,...,...,...
92,#93,Cavalcade,1933,66,92
93,#94,Out of Africa,1985,62,93
94,#95,The Greatest Show on Earth,1952,50,94
95,#96,Cimarron,1931,50,95


**List of the actors in each of the top 97 movies, with movie key and actor name**

In [2]:
# Render the movie-to-actor link table (columns: movie_key, actor)
display(top_mov_actors_df)

Unnamed: 0,movie_key,actor
0,0,Song Kang-ho
1,0,Lee Sun-kyun
2,0,Jo Yeo-jeong
3,0,Choi Woo-sik
4,1,Humphrey Bogart
...,...,...
383,95,Nance O'Neil
384,96,Bessie Love
385,96,Anita Page
386,96,Charles King


**List of unique actors who appear in the top-listed movies**

In [3]:
# Actor lookup table: one row per distinct actor name, in order of first
# appearance, with a sequential integer Act_id starting at 0
actor_names = top_mov_actors_df['actor']
actors_df = pd.DataFrame({
    'Act_id': range(actor_names.nunique()),
    'Act_name': actor_names.unique(),
})
actors_df

Unnamed: 0,Act_id,Act_name
0,0,Song Kang-ho
1,1,Lee Sun-kyun
2,2,Jo Yeo-jeong
3,3,Choi Woo-sik
4,4,Humphrey Bogart
...,...,...
336,336,Nance O'Neil
337,337,Bessie Love
338,338,Anita Page
339,339,Charles King


**DataFrames saved as CSV files in the raw data folder**

In [4]:
# Create the output folder first: to_csv does not create missing parent
# directories, so on a fresh checkout the export would otherwise fail.
raw_data_dir = Path('../data/raw')
raw_data_dir.mkdir(parents=True, exist_ok=True)

# Write each table to its own CSV file. The DataFrame index is written as the
# first (unnamed) column, which is pandas' default and is kept unchanged here.
top_mov_actors_df.to_csv(raw_data_dir / 'top_mov_actors.csv')
top_movies_df.to_csv(raw_data_dir / 'top_movie.csv')
actors_df.to_csv(raw_data_dir / 'actors_list.csv')
