### Import Required Libraries and Set Up Environment Variables

In [1]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json

In [2]:
# Set environment variables from the .env in the local environment
load_dotenv()

nyt_api_key = os.getenv("NYT_API_KEY")
tmdb_api_key = os.getenv("TMDB_API_KEY")

### Access the New York Times API

In [3]:
# Set the base URL
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'

# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"

# Build URL
movie_data_url = f"{url}nyt_api_key={nyt_api_key}&fq={filter_query}&sort={sort}&fl={field_list}&begin_date={begin_date}&end_date={end_date}"
movie_data_url

'https://api.nytimes.com/svc/search/v2/articlesearch.json?nyt_api_key=SXopIKwZrvDQWFSKAK90XBaaAvfFh5L2&fq=section_name:"Movies" AND type_of_material:"Review" AND headline:"love"&sort=newest&fl=headline,web_url,snippet,source,keywords,pub_date,byline,word_count&begin_date=20130101&end_date=20230531'

In [4]:
# Create an empty list to store the reviews
reviews_list = []

# loop through pages 0-19
for page in range(20):
    # create query with a page number
    movie_data_url_page = f"{url}api-key={nyt_api_key}&fq={filter_query}&sort={sort}&fl={field_list}&begin_date={begin_date}&end_date={end_date}&page={page}"
    
    # Send the GET request
    response = requests.get(movie_data_url_page)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the JSON response
        data = response.json()
        
        # Extract the articles
        articles = data['response']['docs']
        
        # Process each article
        for article in articles:
            # Store the article headline and URL in a dictionary
            review = {
                'headline': article['headline']['main'],
                'url': article['web_url']
            }
            # Append the dictionary to the reviews_list
            reviews_list.append(review)
        
        # Print the page number that was just retrieved
        print(f"Page {page + 1} retrieved successfully.")
        
        # Add a twelve-second interval between queries to stay within API query limits
        time.sleep(12)
    
    elif response.status_code == 429:
        # Handle rate limiting (status code 429) by waiting for a minute and then retrying
        print("Rate limit exceeded. Waiting for a minute...")
        time.sleep(60)
        # Retry the same page
        page -= 1  # Retry the current page
    else:
        # Print an error message if the request was not successful
        print(f"Error: Unable to retrieve data for page {page + 1}")


# Print the reviews_list
print(reviews_list)

# Convert the list to a Pandas DataFrame
df_reviews = pd.DataFrame(reviews_list)

# Display basic information about the DataFrame
print(df_reviews.info())

Page 1 retrieved successfully.
Page 2 retrieved successfully.
Page 3 retrieved successfully.
Page 4 retrieved successfully.
Page 5 retrieved successfully.
Page 6 retrieved successfully.
Page 7 retrieved successfully.
Page 8 retrieved successfully.
Page 9 retrieved successfully.
Page 10 retrieved successfully.
Page 11 retrieved successfully.
Page 12 retrieved successfully.
Page 13 retrieved successfully.
Page 14 retrieved successfully.
Page 15 retrieved successfully.
Page 16 retrieved successfully.
Page 17 retrieved successfully.
Page 18 retrieved successfully.
Page 19 retrieved successfully.
Page 20 retrieved successfully.
[{'headline': '‘The Attachment Diaries’ Review: Love, Sick', 'url': 'https://www.nytimes.com/2023/05/25/movies/the-attachment-diaries-review.html'}, {'headline': 'Review: ‘What’s Love Got to Do With It?’ Probably a Lot', 'url': 'https://www.nytimes.com/2023/05/04/movies/whats-love-got-to-do-with-it-review.html'}, {'headline': '‘You Can Live Forever’ Review: Do You Lo

In [6]:
df_reviews.info()

Unnamed: 0,headline,url
0,"‘The Attachment Diaries’ Review: Love, Sick",https://www.nytimes.com/2023/05/25/movies/the-...
1,Review: ‘What’s Love Got to Do With It?’ Proba...,https://www.nytimes.com/2023/05/04/movies/what...
2,‘You Can Live Forever’ Review: Do You Love Me ...,https://www.nytimes.com/2023/05/04/movies/you-...
3,‘A Tourist’s Guide to Love’ Review: A Wearying...,https://www.nytimes.com/2023/04/21/movies/a-to...
4,‘Other People’s Children’ Review: True Romance,https://www.nytimes.com/2023/04/20/movies/othe...
...,...,...
195,Review: A Combustible Pair Find Love in ‘The O...,https://www.nytimes.com/2017/03/09/movies/the-...
196,"Review: Love as the World Wars, in ‘The Ottoma...",https://www.nytimes.com/2017/03/09/movies/revi...
197,Review: It’s All Mirth and Taxes in ‘Love & Ta...,https://www.nytimes.com/2017/03/02/movies/love...
198,"Review: ‘Everybody Loves Somebody,’ a Rom-Com ...",https://www.nytimes.com/2017/02/16/movies/ever...


In [10]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
# Assuming reviews_list is a list of dictionaries
first_5_results = reviews_list[:5]

# Use json.dumps to format the data with an indentation of 4 spaces
formatted_results = json.dumps(first_5_results, indent=4)

# Print the formatted results
print(formatted_results)

[
    {
        "headline": "\u2018The Attachment Diaries\u2019 Review: Love, Sick",
        "url": "https://www.nytimes.com/2023/05/25/movies/the-attachment-diaries-review.html"
    },
    {
        "headline": "Review: \u2018What\u2019s Love Got to Do With It?\u2019 Probably a Lot",
        "url": "https://www.nytimes.com/2023/05/04/movies/whats-love-got-to-do-with-it-review.html"
    },
    {
        "headline": "\u2018You Can Live Forever\u2019 Review: Do You Love Me Now?",
        "url": "https://www.nytimes.com/2023/05/04/movies/you-can-live-forever-review.html"
    },
    {
        "headline": "\u2018A Tourist\u2019s Guide to Love\u2019 Review: A Wearyingly Familiar Trip",
        "url": "https://www.nytimes.com/2023/04/21/movies/a-tourists-guide-to-love-review.html"
    },
    {
        "headline": "\u2018Other People\u2019s Children\u2019 Review: True Romance",
        "url": "https://www.nytimes.com/2023/04/20/movies/other-peoples-children-review.html"
    }
]


In [11]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()
import pandas as pd
from pandas import json_normalize

# Assuming reviews_list is a list of dictionaries
df_reviews = json_normalize(reviews_list)

# Display the DataFrame
print(df_reviews)


                                              headline  \
0          ‘The Attachment Diaries’ Review: Love, Sick   
1    Review: ‘What’s Love Got to Do With It?’ Proba...   
2    ‘You Can Live Forever’ Review: Do You Love Me ...   
3    ‘A Tourist’s Guide to Love’ Review: A Wearying...   
4       ‘Other People’s Children’ Review: True Romance   
..                                                 ...   
195  Review: A Combustible Pair Find Love in ‘The O...   
196  Review: Love as the World Wars, in ‘The Ottoma...   
197  Review: It’s All Mirth and Taxes in ‘Love & Ta...   
198  Review: ‘Everybody Loves Somebody,’ a Rom-Com ...   
199  Review: Cute Cats of ‘Kedi,’ Rekindling a ‘Lov...   

                                                   url  
0    https://www.nytimes.com/2023/05/25/movies/the-...  
1    https://www.nytimes.com/2023/05/04/movies/what...  
2    https://www.nytimes.com/2023/05/04/movies/you-...  
3    https://www.nytimes.com/2023/04/21/movies/a-to...  
4    https://www.n

In [13]:
# Extract the title from the "headline.main" column and
# save it to a new column "title"
# Title is between unicode characters \u2018 and \u2019. 
# End string should include " Review" to avoid cutting title early
df_reviews['title'] = df_reviews['headline'].str.extract(r'\u2018(.*?)(?:\u2019|$)').fillna('') + ' Review'

# Display the DataFrame
print(df_reviews[['headline', 'title']].head())


                                            headline  \
0        ‘The Attachment Diaries’ Review: Love, Sick   
1  Review: ‘What’s Love Got to Do With It?’ Proba...   
2  ‘You Can Live Forever’ Review: Do You Love Me ...   
3  ‘A Tourist’s Guide to Love’ Review: A Wearying...   
4     ‘Other People’s Children’ Review: True Romance   

                           title  
0  The Attachment Diaries Review  
1                    What Review  
2    You Can Live Forever Review  
3               A Tourist Review  
4            Other People Review  


Unnamed: 0,headline,url,title
0,"‘The Attachment Diaries’ Review: Love, Sick",https://www.nytimes.com/2023/05/25/movies/the-...,The Attachment Diaries Review
1,Review: ‘What’s Love Got to Do With It?’ Proba...,https://www.nytimes.com/2023/05/04/movies/what...,What Review
2,‘You Can Live Forever’ Review: Do You Love Me ...,https://www.nytimes.com/2023/05/04/movies/you-...,You Can Live Forever Review
3,‘A Tourist’s Guide to Love’ Review: A Wearying...,https://www.nytimes.com/2023/04/21/movies/a-to...,A Tourist Review
4,‘Other People’s Children’ Review: True Romance,https://www.nytimes.com/2023/04/20/movies/othe...,Other People Review
...,...,...,...
195,Review: A Combustible Pair Find Love in ‘The O...,https://www.nytimes.com/2017/03/09/movies/the-...,The Other Half Review
196,"Review: Love as the World Wars, in ‘The Ottoma...",https://www.nytimes.com/2017/03/09/movies/revi...,The Ottoman Lieutenant Review
197,Review: It’s All Mirth and Taxes in ‘Love & Ta...,https://www.nytimes.com/2017/03/02/movies/love...,Love & Taxes Review
198,"Review: ‘Everybody Loves Somebody,’ a Rom-Com ...",https://www.nytimes.com/2017/02/16/movies/ever...,"Everybody Loves Somebody, Review"


In [14]:
# Extract 'name' and 'value' from items in "title" column
def extract_title_info(title):
    try:
        extracted_title = title.split('’', 1)[0].split('‘', 1)[-1]
        return f"{extracted_title} Review"
    except:
        return None

# Apply the function to the "title" column
df_reviews['title'] = df_reviews['headline'].apply(extract_title_info)

# Display the DataFrame with the updated "title" column
print(df_reviews[['headline', 'url', 'title']].head())


                                            headline  \
0        ‘The Attachment Diaries’ Review: Love, Sick   
1  Review: ‘What’s Love Got to Do With It?’ Proba...   
2  ‘You Can Live Forever’ Review: Do You Love Me ...   
3  ‘A Tourist’s Guide to Love’ Review: A Wearying...   
4     ‘Other People’s Children’ Review: True Romance   

                                                 url  \
0  https://www.nytimes.com/2023/05/25/movies/the-...   
1  https://www.nytimes.com/2023/05/04/movies/what...   
2  https://www.nytimes.com/2023/05/04/movies/you-...   
3  https://www.nytimes.com/2023/04/21/movies/a-to...   
4  https://www.nytimes.com/2023/04/20/movies/othe...   

                           title  
0  The Attachment Diaries Review  
1                    What Review  
2    You Can Live Forever Review  
3               A Tourist Review  
4            Other People Review  


In [15]:
# Create a list from the "title" column using to_list()
# These titles will be used in the query for The Movie Database
# Assuming df_reviews is your DataFrame
title_list = df_reviews['title'].to_list()

# Display the list of titles
print(title_list)



['The Attachment Diaries Review', 'What Review', 'You Can Live Forever Review', 'A Tourist Review', 'Other People Review', 'One True Loves Review', 'The Lost Weekend: A Love Story Review', 'A Thousand and One Review', 'Your Place or Mine Review', 'Love in the Time of Fentanyl Review', 'Pamela, a Love Story Review', 'In From the Side Review', 'After Love Review', 'Alcarràs Review', 'Nelly & Nadine Review', 'Lady Chatterley Review', 'The Sound of Christmas Review', 'The Inspection Review', 'Bones and All Review', 'My Policeman Review', 'About Fate Review', 'Waiting for Bojangles Review', 'I Love My Dad Review', 'A Love Song Review', 'Alone Together Review', 'Art of Love Review', 'The Wheel Review', 'Thor: Love and Thunder Review', 'Both Sides of the Blade Review', 'Fire of Love Review', 'Love & Gelato Review', 'Stay Prayed Up Review', 'Benediction Review', 'Dinner in America Review', 'In a New York Minute Review', 'Anaïs in Love Review', 'I Love America Review', 'See You Then Review', 'L

### Access The Movie Database API

In [16]:
TMDB_API_KEY = 'a67a85b6c6adc70be646faae9b9c1509'

# Prepare The Movie Database query
url = "https://api.themoviedb.org/3/search/movie?query="
tmdb_key_string = "&api_key=" + tmdb_api_key

In [17]:
import requests
import time

# Your TMDb API key
TMDB_API_KEY = 'a67a85b6c6adc70be646faae9b9c1509'  # Replace with your actual TMDb API key

# Create an empty list to store the results
tmdb_movies_list = []

# Create a request counter to sleep the requests after a multiple of 50 requests
request_counter = 0

# Loop through the titles
for title in df_reviews['title'].to_list():
    # Check if we need to sleep before making a request
    if request_counter > 0 and request_counter % 50 == 0:
        print(f"Sleeping for 10 seconds to avoid rate limit...")
        time.sleep(10)

    # Add 1 to the request counter
    request_counter += 1

    # Remove the word "Review" from the title
    title_for_search = title.replace('Review', '').strip()

    # Perform a "GET" request for The Movie Database
    url = f'https://api.themoviedb.org/3/search/movie?api_key={TMDB_API_KEY}&query={title_for_search}&include_adult=false'
    response = requests.get(url)

    try:
        # Include a try clause to search for the full movie details.
        # Use the except clause to print out a statement if a movie
        # is not found.
        response.raise_for_status()
        data = response.json()

        # Check if results list is not empty
        if data['results']:
            # Get movie id
            movie_id = data['results'][0]['id']

            # Make a request for the full movie details
            url = f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={TMDB_API_KEY}&language=en-US'
            response = requests.get(url)

            # Extract relevant information and append it to the tmdb_movies_list
            movie_data = response.json()
            genres = [genre['name'] for genre in movie_data['genres']]
            languages = [lang['english_name'] for lang in movie_data['spoken_languages']]
            countries = [country['name'] for country in movie_data['production_countries']]
            tmdb_movies_list.append({
                'title': title,
                'genres': genres,
                'languages': languages,
                'countries': countries
            })

            print(f"Movie found: {title}")
        else:
            print(f"Movie not found: {title}")

    except requests.exceptions.HTTPError as err:
        print(f"Movie not found: {title}")
        print(f"Error: {err}")

# Display the list of movie details
print(tmdb_movies_list)


Movie found: The Attachment Diaries Review
Movie found: What Review
Movie found: You Can Live Forever Review
Movie found: A Tourist Review
Movie found: Other People Review
Movie found: One True Loves Review
Movie found: The Lost Weekend: A Love Story Review
Movie found: A Thousand and One Review
Movie found: Your Place or Mine Review
Movie found: Love in the Time of Fentanyl Review
Movie found: Pamela, a Love Story Review
Movie found: In From the Side Review
Movie found: After Love Review
Movie found: Alcarràs Review
Movie found: Nelly & Nadine Review
Movie found: Lady Chatterley Review
Movie found: The Sound of Christmas Review
Movie found: The Inspection Review
Movie found: Bones and All Review
Movie found: My Policeman Review
Movie found: About Fate Review
Movie found: Waiting for Bojangles Review
Movie found: I Love My Dad Review
Movie found: A Love Song Review
Movie found: Alone Together Review
Movie found: Art of Love Review
Movie found: The Wheel Review
Movie found: Thor: Love a

In [19]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
# Preview the first five results in JSON format
print(json.dumps(tmdb_movies_list[:5], indent=4))


[
    {
        "title": "The Attachment Diaries Review",
        "genres": [
            "Drama",
            "Mystery",
            "Thriller",
            "Horror"
        ],
        "languages": [
            "Spanish"
        ],
        "countries": [
            "Argentina"
        ]
    },
    {
        "title": "What Review",
        "genres": [
            "Comedy"
        ],
        "languages": [],
        "countries": [
            "United States of America"
        ]
    },
    {
        "title": "You Can Live Forever Review",
        "genres": [
            "Drama",
            "Romance"
        ],
        "languages": [
            "English",
            "French"
        ],
        "countries": [
            "Canada",
            "United States of America"
        ]
    },
    {
        "title": "A Tourist Review",
        "genres": [
            "Romance",
            "Comedy"
        ],
        "languages": [
            "English",
            "Vietnamese"
        ],
 

In [101]:
# Convert the results to a DataFrame
import pandas as pd

# Convert the results to a DataFrame
tmdb_df = pd.DataFrame(tmdb_movies_list)

# Display the DataFrame
print(tmdb_df)


                                title                              genres  \
0       The Attachment Diaries Review  [Drama, Mystery, Thriller, Horror]   
1                         What Review                            [Comedy]   
2         You Can Live Forever Review                    [Drama, Romance]   
3                    A Tourist Review                   [Romance, Comedy]   
4                 Other People Review                     [Comedy, Drama]   
..                                ...                                 ...   
191             The Other Half Review                    [Drama, Romance]   
192     The Ottoman Lieutenant Review               [Romance, Drama, War]   
193                 Review: It Review         [Horror, Mystery, Thriller]   
194  Everybody Loves Somebody, Review                   [Romance, Comedy]   
195                      Kedi, Review                           [Romance]   

                 languages                           countries  
0         

### Merge and Clean the Data for Export

In [108]:
# Merge the New York Times reviews and TMDB DataFrames on title
# Merge the New York Times reviews and TMDB DataFrames on title

# Merge the New York Times reviews and TMDB DataFrames on the 'title' column
merged_df = pd.merge(df_reviews, tmdb_df, on='title', how='inner')

# Display the merged DataFrame
print(merged_df.head())


                                            headline  \
0        ‘The Attachment Diaries’ Review: Love, Sick   
1  Review: ‘What’s Love Got to Do With It?’ Proba...   
2  ‘You Can Live Forever’ Review: Do You Love Me ...   
3  ‘A Tourist’s Guide to Love’ Review: A Wearying...   
4     ‘Other People’s Children’ Review: True Romance   

                                                 url  \
0  https://www.nytimes.com/2023/05/25/movies/the-...   
1  https://www.nytimes.com/2023/05/04/movies/what...   
2  https://www.nytimes.com/2023/05/04/movies/you-...   
3  https://www.nytimes.com/2023/04/21/movies/a-to...   
4  https://www.nytimes.com/2023/04/20/movies/othe...   

                           title                              genres  \
0  The Attachment Diaries Review  [Drama, Mystery, Thriller, Horror]   
1                    What Review                            [Comedy]   
2    You Can Live Forever Review                    [Drama, Romance]   
3               A Tourist Review      

In [109]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing


# Create a list of characters to remove


# Loop through the list of columns to fix

    # Convert the column to type 'str'


    # Loop through characters to remove


# Display the fixed DataFrame

# Create a list of the columns that need fixing
columns_to_fix = ['genres', 'languages', 'countries']

# Create a list of characters to remove
characters_to_remove = ['[', ']', "'"]

# Loop through the list of columns to fix
for column in columns_to_fix:
    # Convert the column to type 'str'
    merged_df[column] = merged_df[column].astype(str)
    
    # Loop through characters to remove
    for char in characters_to_remove:
        # Remove the character from the string using the Pandas str.replace() method
        merged_df[column] = merged_df[column].str.replace(char, '')

# Display the fixed DataFrame
print(merged_df.head())



                                            headline  \
0        ‘The Attachment Diaries’ Review: Love, Sick   
1  Review: ‘What’s Love Got to Do With It?’ Proba...   
2  ‘You Can Live Forever’ Review: Do You Love Me ...   
3  ‘A Tourist’s Guide to Love’ Review: A Wearying...   
4     ‘Other People’s Children’ Review: True Romance   

                                                 url  \
0  https://www.nytimes.com/2023/05/25/movies/the-...   
1  https://www.nytimes.com/2023/05/04/movies/what...   
2  https://www.nytimes.com/2023/05/04/movies/you-...   
3  https://www.nytimes.com/2023/04/21/movies/a-to...   
4  https://www.nytimes.com/2023/04/20/movies/othe...   

                           title                            genres  \
0  The Attachment Diaries Review  Drama, Mystery, Thriller, Horror   
1                    What Review                            Comedy   
2    You Can Live Forever Review                    Drama, Romance   
3               A Tourist Review              

In [117]:
# Drop "byline.person" column
# Check the column names in the DataFrame
# Drop the correct column
# Print the column names
print(df_reviews.columns)


Index(['headline', 'url', 'title'], dtype='object')


In [115]:
# Delete duplicate rows and reset index
# Delete duplicate rows and reset index
df_reviews.drop_duplicates(inplace=True)
df_reviews.reset_index(drop=True, inplace=True)

# Display the updated DataFrame
print(df_reviews.head())


                                            headline  \
0        ‘The Attachment Diaries’ Review: Love, Sick   
1  Review: ‘What’s Love Got to Do With It?’ Proba...   
2  ‘You Can Live Forever’ Review: Do You Love Me ...   
3  ‘A Tourist’s Guide to Love’ Review: A Wearying...   
4     ‘Other People’s Children’ Review: True Romance   

                                                 url  \
0  https://www.nytimes.com/2023/05/25/movies/the-...   
1  https://www.nytimes.com/2023/05/04/movies/what...   
2  https://www.nytimes.com/2023/05/04/movies/you-...   
3  https://www.nytimes.com/2023/04/21/movies/a-to...   
4  https://www.nytimes.com/2023/04/20/movies/othe...   

                           title  
0  The Attachment Diaries Review  
1                    What Review  
2    You Can Live Forever Review  
3               A Tourist Review  
4            Other People Review  


In [116]:
# Export data to CSV without the index
# Export data to CSV without the index
df_reviews.to_csv('nytimes_movie_reviews_clean.csv', index=False)
