In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from google.colab import files

files.upload()

In [None]:
!ls -lha kaggle.json


In [4]:
!pip install -q kaggle

In [5]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

In [10]:
!kaggle datasets download -d akshaypawar7/millions-of-movies

Downloading millions-of-movies.zip to /content
 96% 158M/164M [00:02<00:00, 83.2MB/s]
100% 164M/164M [00:02<00:00, 66.5MB/s]


In [4]:
import pandas as pd 
import datetime
import regex as re
import concurrent.futures
import requests
import math
import zipfile

In [None]:
file_path = "millions-of-movies.zip"
zip_file = zipfile.ZipFile(file_path, 'r')
zip_file.extractall()
zip_file.close()


In [5]:
movie_df = pd.read_csv('movies.csv')

In [6]:
movie_df.drop(['id','keywords','poster_path','backdrop_path','recommendations'],axis =1 ,inplace=True)

In [7]:
movie_df['release_date'] =pd.to_datetime(movie_df['release_date'])
today = pd.to_datetime(datetime.date.today())
filtered_movie_df = movie_df.query('release_date >= 2018 and release_date <= @today and original_language.isin(["en","zh"]) and status == "Released" and genres.isna() == False').drop_duplicates(['title'])

In [8]:
filtered_movie_df['title_rename'] = filtered_movie_df['title'].astype(str).apply(lambda title: re.sub(r'[^A-Za-z0-9 ]+', '', title).replace(' ','_').lower())
filtered_movie_df.reset_index(drop=True, inplace=True)

In [14]:
session = requests.Session()

def fetch_scores(title):
    try:
        url = f'https://rotten-tomatoes-api.ue.r.appspot.com/movie/{title}'
        response = session.get(url)
        response.raise_for_status()
        js = response.json()
        return js['name'],js['tomatometer'], js['audience_score'],js['weighted_score'],js['genres'],js['year']
    except (KeyError, requests.HTTPError, ValueError):
        return None, None, None, None, None, None

batch_size = 1000
num_batches = math.ceil(len(filtered_movie_df) / batch_size)

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    for i in range(num_batches):
        start_index = i * batch_size
        end_index = (i + 1) * batch_size
        batch_df = filtered_movie_df[start_index:end_index]

        results = list(executor.map(fetch_scores, batch_df['title_rename']))

        name,tomatometer_scores, audience_scores,weighted_score,rotten_tomato_genres,year = zip(*results)
        batch_df['movie_name'] = name
        batch_df['tomatometer'] = tomatometer_scores
        batch_df['audience_score'] = audience_scores
        batch_df['weighted_score'] = weighted_score
        batch_df['rotten_tomato_genres'] = rotten_tomato_genres
        batch_df['year'] = year

        batch_df.to_csv(f'batch_{i+1}.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch_df['movie_name'] = name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch_df['tomatometer'] = tomatometer_scores
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch_df['audience_score'] = audience_scores
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [29]:
batch_dfs = {i: pd.read_csv(f'batch_{i}.csv') for i in range(1, num_batches + 1)}
tmdb_rotten_df = pd.concat([batch_dfs[i] for i in range(1, num_batches + 1)])


In [33]:
tmdb_rotten_df.drop(['genres','title_rename','title','year'],axis=1,inplace=True)
tmdb_rotten_df.query('tomatometer.isna() == False & audience_score.isna() == False & rotten_tomato_genres.isna() == False', inplace=True)

In [None]:
tmdb_rotten_df.to_csv('movies_with_rotten_tomatoes_v1.csv', index=False)