In [1]:
!cp /content/drive/MyDrive/Top_10000_Movies.csv -d /content

In [2]:
import math
import json
import requests
import itertools
import pandas as pd
import numpy as np
import time
from datetime import datetime, timedelta
import re
import ast

In [3]:
movies_df = pd.read_csv('/content/Top_10000_Movies.csv', lineterminator='\n')
# drop rows with no genre
movies_df = movies_df[movies_df['genre'] != '[]'] # remove movies which have no genres
movies_df = movies_df.reset_index(drop=True)
movies_df = movies_df[movies_df['original_language'] == 'en'] # keep only english movies, we only want one language for our scraping
movies_df = movies_df.reset_index(drop=True)

In [4]:
movies_df.sort_values(by='release_date', ascending = False, inplace=True) # sort by release dates, allows us to scrape more recent movies, making our IR system more relevant
movies_df = movies_df.reset_index(drop=True)
movies_df = movies_df.drop([i for i in range(12)] + [13, 15, 16, 17, 18, 22, 25]) # remove first few as they have yet to be released
movies_df = movies_df.reset_index(drop=True)
movies_df = movies_df[movies_df['release_date'].notna()] # drop additional movies which have not been released
movies_df = movies_df.reset_index(drop=True)

In [5]:
# parse movie titles with : in the name
movies_df['original_title'] = movies_df['original_title'].apply(lambda x: " ".join(x.split()[1:]) if (len(x.split()) != 1 and x.split()[0][-1] == ":")  else " ".join(x.split()[0:])  )
movies_df = movies_df.reset_index(drop=True)
# movies_df['original_title'] = movies_df['original_title'].apply(lambda x: " ".join(x.split()[2:]) if (len(x.split()) > 2 and x.split()[1][-1] == ":")  else " ".join(x.split()[0:])  )
# movies_df = movies_df.reset_index(drop=True)

In [6]:
# convert string representation of a list of dictionaries to a list of dictionaries
movies_df["genre"] = movies_df["genre"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

filter_genre = [['Documentary', 'Musical']]  # remove any documentaries if any 
movies_df = movies_df[~movies_df['genre'].isin(filter_genre)]

In [7]:
movies_df = movies_df[(movies_df['popularity'] > 30) | (movies_df['original_title'].str.count(' ') != 0)] # remove movies titles with only 1 word as they often mean something very general, but if they are popular enough (popularity > 30) just keep them
movies_df = movies_df.reset_index(drop=True)

movies_df = movies_df[(movies_df['popularity'] > 10)] # remove unpopular and obscure movies
movies_df = movies_df.reset_index(drop=True)

# movies_df = movies_df[movies_df['original_title'].str.count(' ') != 0 ] # remove titles with only 1 word, as this causes a lot of ambiguity in comments
# movies_df = movies_df.reset_index(drop=True)
# movies_df.sort_values(by='popularity', ascending = True, inplace=True) # sort by release dates
# movies_df = movies_df.reset_index(drop=True)

movies_df

Unnamed: 0.1,Unnamed: 0,id,original_language,original_title,popularity,release_date,vote_average,vote_count,genre,overview,revenue,runtime,tagline
0,883,76600,en,Avatar 2,58.466,2022-12-14,0.0,0,"[Action, Adventure, Science Fiction, Fantasy]",Twelve years after exploring Pandora and joini...,0,0.0,
1,4917,505642,en,Black Panther: Wakanda Forever,17.849,2022-11-09,0.0,0,"[Action, Adventure, Science Fiction]",The sequel to Black Panther (2018).,0,0.0,
2,6048,436270,en,Black Adam,14.386,2022-07-27,0.0,0,"[Action, Fantasy, Adventure]",Black Adam is an upcoming American superhero f...,0,0.0,The hierarchy of power in the DC Universe is a...
3,3090,616037,en,Love and Thunder,27.361,2022-07-07,0.0,0,"[Action, Adventure, Fantasy, Comedy, Science F...",The fourth installment of the Marvel Studios' ...,0,0.0,
4,1894,438148,en,The Rise of Gru,40.945,2022-06-23,0.0,0,"[Family, Animation, Action, Adventure, Comedy]",A fanboy of a supervillain supergroup known as...,0,0.0,Brace yourself.
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5841,9220,33680,en,Grand Hotel,10.104,1932-05-25,7.0,207,"[Drama, Romance]",Guests at a posh Berlin hotel struggle through...,2594000,112.0,Thank the stars for a great entertainment!
5842,4989,901,en,City Lights,17.014,1931-02-01,8.4,1507,"[Comedy, Drama, Romance]","In this sound-era silent film, a tramp falls i...",4250000,87.0,True Blind Love
5843,8054,962,en,The Gold Rush,12.772,1925-07-12,8.0,1099,"[Adventure, Comedy, Drama]",A lone prospector ventures into Alaska looking...,4000000,89.0,The World's Greatest Laughing Picture!
5844,9564,992,en,Sherlock Jr.,10.763,1924-04-17,8.2,605,"[Action, Comedy, Mystery]","A film projectionist longs to be a detective, ...",0,45.0,Every inch of footage holds such a laugh!


In [8]:
# get the top k most recent movies
movies = movies_df[:1500] 

In [21]:
def get_pushshift_data(query, **kwargs):
    """
    Gets data from the pushshift api.
 
    args are interpreted as payload.
 
    Read more: https://github.com/pushshift/api
    """

    base_url = f'https://api.pushshift.io/reddit/search/comment/?q="{query}"&subreddit=movies&score=2&'
    payload = kwargs
    #request = requests.get(base_url, params=payload)

    def fire_away(base_url):
      request = requests.get(base_url, params=payload)
      assert request.status_code == 200
      return request.json()

      max_retries = 100
      current_tries = 1
      while current_tries < max_retries:
          try:
              time.sleep(3)
              request = fire_away(uri)
              return request
          except:
              time.sleep(3)
              current_tries += 1
    return fire_away(base_url)

In [None]:
data_list = []
result = {'data': []}

for index, row in movies.iterrows():
    movie_title = row["original_title"].lstrip().rstrip()
    movie_genres = row["genre"]

    genre_names = [genre for genre in movie_genres]
    
    genre_string = ', '.join(genre_names)
    
    try:
      response = get_pushshift_data(     
                        query=movie_title,                 
                        after='3650d',           # set search date far enough back for movies made in 2016 etc...
                        size='100')
      time.sleep(3)
      print(len(response['data']))
    except:
      time.sleep(3)
      continue

    for r in response['data']:
        r['movie_name'] = movie_title
        r['genre'] = genre_string
    data_list.append(response)

for data in data_list:
    result['data'].extend(data['data'])

In [40]:
print(movie_title, movie_title, index) # stopped early as enough data was obtained. stopped at index 1068

Our Friend Our Friend 1068


In [39]:
row

Unnamed: 0                                                        1637
id                                                              583903
original_language                                                   en
original_title                                              Our Friend
popularity                                                      31.869
release_date                                                2019-02-25
vote_average                                                       7.2
vote_count                                                         168
genre                                         [Drama, Comedy, Romance]
overview             After learning that his terminally ill wife ha...
revenue                                                              0
runtime                                                          124.0
tagline                                                            NaN
Name: 1068, dtype: object

In [30]:
# for data in data_list:
#     result['data'].extend(data['data'])

In [31]:
rows = [{'movie_name': row['movie_name'],
         'genre': row['genre'],
         'author': row['author'],
         'score': row['score'],
         'body': row['body'],

         'link_id': row['link_id'],
         'utc_datetime_str': row['utc_datetime_str'],
        }for row in result['data']]

In [32]:
reddit = pd.DataFrame(rows)
reddit

Unnamed: 0,movie_name,genre,author,score,body,link_id,utc_datetime_str
0,Avatar 2,"Action, Adventure, Science Fiction, Fantasy",Whale31777,1,“I’m just happy to see you” Avatar 2.,t3_10z5t71,2023-02-11 06:01:19
1,Avatar 2,"Action, Adventure, Science Fiction, Fantasy",Firvulag,1,&gt; tl:dr - the hype was always just about th...,t3_10z61yi,2023-02-11 05:08:32
2,Avatar 2,"Action, Adventure, Science Fiction, Fantasy",njdevils901,1,holy shit someone on /r/movies understands why...,t3_10za4qa,2023-02-11 04:17:46
3,Avatar 2,"Action, Adventure, Science Fiction, Fantasy",Icy-Exchange6457,1,Avatar 2. Absolutely loved the first one and I...,t3_10z2ctw,2023-02-11 03:05:39
4,Avatar 2,"Action, Adventure, Science Fiction, Fantasy",whtsths0,1,"Yeah, it's so dumb. I got an email that ticket...",t3_10yjp2u,2023-02-11 01:15:36
...,...,...,...,...,...,...,...
62384,Greta,"Drama, Mystery, Thriller, Horror",MrShoggoth,1,I was disappointed Greta Gerwig wasn’t bigger ...,t3_100r6hx,2023-01-02 04:05:20
62385,Greta,"Drama, Mystery, Thriller, Horror",Oxy_1993,1,I saw this movie in the theatre the first time...,t3_zzjk0v,2023-01-02 02:53:39
62386,Greta,"Drama, Mystery, Thriller, Horror",Bussaca,1,007 Dr.No \n\nI dont really remember Dr.No doi...,t3_100t4tm,2023-01-02 00:44:09
62387,Greta,"Drama, Mystery, Thriller, Horror",finditplz1,1,"Charlie Hunnam, Carey Mulligan, Gretchen Mol, ...",t3_100r6hx,2023-01-02 00:17:58


In [33]:
# drop rows that contains 'bot' in author column
reddit_no_bots = reddit.loc[~reddit['author'].str.contains('bot', case=False), :]

In [35]:
# drop rows that contains 'automoderator' in author column
reddit_no_bots = reddit_no_bots.loc[~reddit_no_bots['author'].str.contains('AutoModerator', case=False), :]

In [49]:
len(reddit_no_bots)

61993

In [47]:
reddit_no_bots = reddit_no_bots.drop(['link_id','utc_datetime_str', 'author','score'], axis=1) # remove additional columns

In [48]:
reddit_no_bots.to_csv('reddit_pushshift_remove_bots_LargeV2.csv')