# Review Scrapping

### import required packages

In [44]:
import requests
import pandas as pd
from tqdm import tqdm
from langdetect import detect
import time
import os
import csv
from transformers import pipeline
from collections import Counter

### Required configuration

In [2]:
API_KEY='35250d55ae36d4d2136a5c928021acc1'    
DISCOVER_PAGES=300         
POPULAR_PAGES=100          
MAX_REVIEW_PAGES=20        
MAX_REVIEWS=60000          
SAVE_EVERY=2000

#### Movie Fatching

In [3]:
def get_discover_movies(api_key, total_pages):
    movies=set()
    for page in tqdm(range(1, total_pages + 1), desc="Fetching discover movies"):
        url='https://api.themoviedb.org/3/discover/movie'
        params={'api_key': api_key,
            'language': 'en-US',
            'sort_by': 'vote_count.desc',
            'vote_count.gte': 50,
            'page': page}
        response=requests.get(url, params=params)
        if response.status_code != 200:
            break
        for movie in response.json().get('results', []):
            movies.add((movie['id'], movie['title']))
        time.sleep(0.1)
    return movies

##### Popular movie fatching

In [4]:
def get_popular_movies(api_key, total_pages):
    movies=set()
    for page in tqdm(range(1, total_pages + 1), desc="Fetching popular movies"):
        url='https://api.themoviedb.org/3/movie/popular'
        params={'api_key': api_key,
            'language': 'en-US',
            'page': page}
        response=requests.get(url, params=params)
        if response.status_code != 200:
            break
        for movie in response.json().get('results', []):
            movies.add((movie['id'], movie['title']))
        time.sleep(0.1)
    return movies

### Movie data Fatching

In [5]:
def get_reviews(movie_id, movie_title, api_key, max_pages):
    reviews=[]
    for page in range(1, max_pages + 1):
        url=f'https://api.themoviedb.org/3/movie/{movie_id}/reviews'
        params={'api_key': api_key,
            'language': 'en-US',
            'page': page}
        response=requests.get(url, params=params)
        if response.status_code != 200:
            break
        results=response.json().get('results', [])
        if not results:
            break
        for r in results:
            try:
                text=r.get('content', '')
                if detect(text) != 'en':
                    continue
                    reviews.append({
                    'movie_id': movie_id,
                    'movie_title': movie_title,
                    'author': r.get('author', ''),
                    'review': text,
                    'rating': r.get('author_details', {}).get('rating', None)
                })
            except:
                continue
        time.sleep(0.1)
    return reviews

### Scrapping

In [6]:
def scrape_reviews(api_key, max_reviews, save_every):
    discover_movies=get_discover_movies(api_key, DISCOVER_PAGES)
    popular_movies=get_popular_movies(api_key, POPULAR_PAGES)
    all_movies=list(discover_movies.union(popular_movies))
    all_reviews=[]
    for movie_id, movie_title in tqdm(all_movies, desc="Scraping reviews"):
        reviews=get_reviews(movie_id, movie_title, api_key, MAX_REVIEW_PAGES)
        all_reviews.extend(reviews)
        if len(all_reviews) >= save_every and len(all_reviews) % save_every < len(reviews):
            df=pd.DataFrame(all_reviews)
            df.drop_duplicates(subset='review', inplace=True)
            filename=f'tmdb_reviews_partial_{len(df)}.csv'
            df.to_csv(filename, index=False)
            print(f"\n Saved: {filename}")
        if len(all_reviews) >= max_reviews:
            break
    df=pd.DataFrame(all_reviews)
    df.drop_duplicates(subset='review', inplace=True)
    df.to_csv('D:/DBDA/Machine_Learning_Project_dataset/tmdb_reviews_dataset2.csv', index=False)

### Run the main file to save the data

In [None]:
if __name__=='__main__':
    scrape_reviews(API_KEY, MAX_REVIEWS, SAVE_EVERY)

## Again scrap reviews for more data 

### Required configuration

In [9]:
API_KEY="35250d55ae36d4d2136a5c928021acc1"
MOVIE_PAGES=500  
REVIEW_PAGES_PER_MOVIE=20
CSV_FILE_NAME="D:/DBDA/Machine_Learning_Project_dataset/tmdb_clean_english_reviews.csv"

### Open the csv file with headers

In [10]:
 with open(CSV_FILE_NAME, 'w', newline='', encoding='utf-8') as f:
        writer=csv.DictWriter(f, fieldnames=["movie_id", "movie_title", "author", "review", "rating"])
        writer.writeheader()

### movie fatching

In [11]:
def get_movies(api_key, total_pages):
    movies=[]
    seen_ids=set()
    for page in tqdm(range(1, total_pages + 1), desc="Fetching movies"):
        url="https://api.themoviedb.org/3/discover/movie"
        params={"api_key": api_key,
            "language": "en-US",
            "sort_by": "popularity.desc",
            "vote_count.gte": 10,
            "page": page}
        response=requests.get(url, params=params)
        if response.status_code != 200:
            break
        for movie in response.json().get("results", []):
            if movie["id"] not in seen_ids:
                movies.append((movie["id"], movie["title"]))
                seen_ids.add(movie["id"])
        time.sleep(0.1)
    return movies

### Fatch movie data

In [12]:
def get_reviews(movie_id, movie_title, api_key, max_pages=20):
    reviews=[]
    for page in range(1, max_pages + 1):
        url = f"https://api.themoviedb.org/3/movie/{movie_id}/reviews"
        params = {"api_key": api_key,
            "language": "en-US",
            "page": page}
        response=requests.get(url, params=params)
        if response.status_code != 200:
            break
        results=response.json().get("results", [])
        if not results:
            break
        for review in results:
            content=review.get("content", "")
            try:
                if detect(content)=="en":
                    reviews.append({"movie_id": movie_id,
                        "movie_title": movie_title,
                        "author": review.get("author", ""),
                        "review": content,
                        "rating": review.get("author_details", {}).get("rating")})
            except:
                continue
        time.sleep(0.1)
    return reviews

### Run main file to download data

In [None]:
movies=get_movies(API_KEY, MOVIE_PAGES)
for movie_id, movie_title in tqdm(movies, desc="Scraping reviews"):
    reviews=get_reviews(movie_id, movie_title, API_KEY, REVIEW_PAGES_PER_MOVIE)
    if reviews:
        with open(CSV_FILE_NAME, mode='a', newline='', encoding='utf-8') as file:
            writer=csv.DictWriter(file, fieldnames=["movie_id", "movie_title", "author", "review", "rating"])
            for review in reviews:
                writer.writerow(review)
print("\nReviews saved to:", CSV_FILE_NAME)

## load the dataset

In [52]:
df1=pd.read_csv('D:/DBDA/Machine_Learning_Project_dataset/tmdb_reviews_dataset2.csv') 
df2=pd.read_csv('D:/DBDA/Machine_Learning_Project_dataset/tmdb_clean_english_reviews.csv') 
df=pd.read_csv('D:/DBDA/Machine_Learning_Project_dataset/IMDB Dataset.csv')

In [22]:
df1.head()

Unnamed: 0,movie_id,author,review,rating,movie_title
0,8452,John Chard,If you really believe that then you should clo...,6.5,The 6th Day
1,8452,Gimly,_The 6th Day_ is a **great** title for a movie...,6.0,The 6th Day
2,17578,ohlalipop,The animation was something else. It looked so...,10.0,The Adventures of Tintin
3,17578,lmao7,Wow…they just took cg/motion/performance captu...,9.0,The Adventures of Tintin
4,351460,.,_Death Note_ - _★★★★_\r\n\r\nWhile different f...,8.0,Death Note


In [23]:
df1_1=df1.drop(['movie_id','author','movie_title'],axis=1)

In [24]:
df2.head(5)

Unnamed: 0,movie_id,movie_title,author,review,rating
0,950387,A Minecraft Movie,tmdb98094809,"Alright, buckle up, because I just saw a movie...",10.0
1,950387,A Minecraft Movie,CinemaSerf,Who doesn’t like a white woolly llama? Well th...,6.0
2,950387,A Minecraft Movie,CinemaSerf,Who doesn’t like a white woolly llama? Well th...,6.0
3,950387,A Minecraft Movie,Jm_15,Chickey Jockey is so fun to watch. I love it s...,
4,574475,Final Destination Bloodlines,CinemaSerf,Imagine if your room-mate kept getting recurri...,7.0


In [30]:
df2_2=df2.drop(['movie_id','movie_title','author'],axis=1)

### Merge all scrap data

In [31]:
df3=pd.concat([df1_1,df2_2])

In [33]:
df4=df3.drop_duplicates()

In [34]:
df4.shape

(17234, 2)

### Use HuggingFace transformer to find sentiment of reviews

In [45]:
sentiment_pipeline=pipeline("sentiment-analysis", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english", device=-1, truncation=True)

def chunk_text(text, chunk_size=500):
    """Splits text into chunks of size 500 characters so that no error occurs as huggingface transformer has limit of 512 only."""
    text = str(text)
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

def analyze_chunks(review):
    try:
        chunks=chunk_text(review)
        sentiments=[]
        for chunk in chunks:
            label=sentiment_pipeline(chunk)[0]['label'].lower()
            sentiments.append(label)
        return Counter(sentiments).most_common(1)[0][0]
    except Exception as e:
        return "error"
df4=df4.reset_index(drop=True)
df6=df4.copy()
df6['sentiment']=df4['review'].progress_apply(analyze_chunks)

Device set to use cpu
Classifying sentiment: 100%|███████████████████████████████████████████████████| 17234/17234 [1:22:07<00:00,  3.50it/s]


In [46]:
df6['sentiment'].value_counts()

sentiment
positive    11245
negative     5989
Name: count, dtype: int64

In [48]:
df6.drop("rating",axis=1,inplace=True)

### Merge all data frame to a single one

In [59]:
df7=pd.concat([df,df6])

In [65]:
df7.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df7.drop_duplicates(inplace=True)


In [66]:
df7.shape

(66797, 2)

### convert data frame to a csv file

In [67]:
df7.to_csv("D:/DBDA/Machine_Learning_Project_dataset/Final_movie_reviews_dataset.csv",index=False)