In [None]:
pip install PyMovieDb

In [None]:
pip install rottentomatoes-python

In [None]:
pip install selenium

In [None]:
# Mount Google Drive at /content/drive; the later cells read their inputs from, and write their
# scraped batches under, /content/drive/MyDrive/recom.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%%shell

# Prepare apt so that chromium / chromium-driver can be installed from Debian buster:
# register the buster repositories, import their signing keys, and pin chromium* to deb.debian.org.

# Add debian buster
# Each entry below is tied (signed-by=) to one of the keyring files created in the "Add keys" step.
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
# Fetch the three signing keys from keyserver.ubuntu.com.
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key (selected by the last 8 hex digits of the IDs received above) and de-armour it
# into the keyring path that the matching signed-by= entry in debian.list points at.
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
# Priorities written below: 500 for release a=eoan, 300 for everything from deb.debian.org,
# 700 for chromium* from deb.debian.org.
# NOTE(review): `a=eoan` names one specific release; confirm it matches the runtime's base image.
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

In [None]:
!apt-get update
!apt-get install chromium chromium-driver

In [None]:
from PyMovieDb import IMDB
import rottentomatoes as rt
import json as js
import numpy as np
from google.colab import files
import pandas as pd
import time
from tqdm import tqdm, trange
from scipy.sparse import load_npz

# Module-level PyMovieDb client; extract_imdb_data calls its get_by_name / search / get_by_id methods.
imdb = IMDB()

In [None]:

# Load the sparse matrix stored on Drive (presumably the training ratings in CSR form, going by
# the file name — confirm against whatever produced train_csr.npz).
train_csr = load_npz('/content/drive/MyDrive/recom/train_csr.npz')

In [None]:

# Row / column coordinates of the matrix's non-zero entries; reused when the long-format frame is filled.
# NOTE(review): nonzero() skips explicitly stored zeros while .data keeps them — if the two lengths
# ever differ, the later column assignment will fail. Verify the shape shown below matches len(rows).
rows, cols = train_csr.nonzero()
train_csr.data.shape

In [None]:
# Start from an empty frame that only fixes the column order; the values are attached in the next cell.
sparce_df = pd.DataFrame(
    columns=[
        'movie_id',
        'user_id',
        'rank',
    ],
)
sparce_df

In [None]:
# Long ("triplet") form of the sparse matrix: matrix rows become movie_id, matrix columns become
# user_id, and the stored values become rank. Column order stays as declared on the empty frame.
sparce_df = sparce_df.assign(
    user_id=cols,
    movie_id=rows,
    rank=train_csr.data,
)
sparce_df

In [None]:
# Read the title catalogue. The file has no header row, so column names are supplied here.
# NOTE(review): the extra_* columns presumably absorb titles that themselves contain commas, in
# which case movie_name holds only the part before the first comma — confirm against the file.
movie_name_df = pd.read_csv(
    '/content/drive/MyDrive/recom/movie_titles.txt',
    sep=',',
    encoding="ISO-8859-1",
    names=['index_', 'year', 'movie_name', 'extra_1', 'extra_2', 'extra_3'],
)
# Missing years become 0 so the column can be stored as integers.
movie_name_df['year'] = movie_name_df['year'].fillna(0).astype(int)
movie_name_df

# Extracting metadata

In [None]:
# tomatometer rate
def extract_tomatometer_rate(df, start_index, end_index):
  """Look up the Rotten Tomatoes tomatometer score for one slice of titles.

  Args:
    df: frame with a 'movie_name' column.
    start_index: first position (inclusive) of the slice of df['movie_name'] to process.
    end_index: last position (exclusive) of that slice.

  Returns:
    (tom_rate, tom_non_data, tom_rate_name):
      tom_rate      -- whatever rt.tomatometer returned for each title whose lookup succeeded,
      tom_non_data  -- titles whose lookup raised,
      tom_rate_name -- titles aligned one-to-one with tom_rate.
  """
  tom_non_data = []
  tom_rate = []
  tom_rate_name = []
  for movie in tqdm(df['movie_name'][start_index: end_index]):
    try:
      rate = rt.tomatometer(movie)
    except Exception:
      # Best effort: any lookup failure marks the title as "no data". `Exception` (rather than a
      # bare except) lets KeyboardInterrupt through, so the long scrape can still be stopped.
      tom_non_data.append(movie)
    else:
      tom_rate.append(rate)
      tom_rate_name.append(movie)

  return tom_rate, tom_non_data, tom_rate_name

In [None]:
# IMDB data
def extract_imdb_data(df, start_index, end_index):
  """Fetch IMDb metadata for one slice of titles.

  Each title is first looked up directly by name. If that fails, or does not yield a record whose
  'type' is "Movie", the first hit of an IMDb search is fetched by id and checked the same way.

  Args:
    df: frame with a 'movie_name' column.
    start_index: first position (inclusive) of the slice of df['movie_name'] to process.
    end_index: last position (exclusive) of that slice.

  Returns:
    (IMDB_data, IMDB_non_data, IMDB_data_name):
      IMDB_data      -- decoded JSON records of the titles that resolved to a movie,
      IMDB_non_data  -- titles for which neither lookup produced a movie record,
      IMDB_data_name -- titles aligned one-to-one with IMDB_data.
  """
  IMDB_non_data = []
  IMDB_data = []
  IMDB_data_name = []
  for movie in tqdm(df['movie_name'][start_index: end_index]):
    record = None

    # First attempt: direct lookup by name.
    try:
      candidate = js.loads(imdb.get_by_name(movie, tv=False))
      # A payload without a "Movie" type (e.g. a 404 response) is treated as a miss.
      if candidate['type'] == "Movie":
        record = candidate
    except Exception:
      record = None

    # Fallback: take the first search hit and fetch it by id. The record fetched *here* is the one
    # validated and stored (the previous code re-checked the first attempt's variable instead).
    if record is None:
      try:
        res = js.loads(imdb.search(movie, tv=False))
        candidate = js.loads(imdb.get_by_id(res['results'][0]['id']))
        if candidate['type'] == "Movie":
          record = candidate
      except Exception:
        record = None

    if record is None:
      IMDB_non_data.append(movie)
    else:
      IMDB_data.append(record)
      IMDB_data_name.append(movie)

  return IMDB_data, IMDB_non_data, IMDB_data_name

In [None]:

# Scrape Rotten Tomatoes and IMDb ten titles at a time and write every batch to Drive as soon as
# it finishes, so an interrupted run keeps what was already scraped.
# The range stops at the row count (not row count + 1): the extra step could only ever add an
# empty trailing batch, which happened whenever the number of titles was a multiple of 10.
for i in trange(0, movie_name_df.shape[0], 10):
  print(f'\n -----> batch {i} \n')
  tomatometer_rate, non_tomatometer_rate, tom_rate_name = extract_tomatometer_rate(movie_name_df, i, i + 10)

  imdb_data, imdb_non_data, imdb_data_name = extract_imdb_data(movie_name_df, i, i + 10)

  # (destination, payload) pairs; each payload is stored as its Python repr via str().
  batch_outputs = [
    # tomatometer data
    (f'/content/drive/MyDrive/recom/tomatometer_data/batch_{i}_tomatometer.txt', tomatometer_rate),
    (f'/content/drive/MyDrive/recom/tomatometer_data/non_data/batch_{i}_non_tomatometer.txt', non_tomatometer_rate),
    (f'/content/drive/MyDrive/recom/tomatometer_data/data_name/batch_{i}_name_tomatometer.txt', tom_rate_name),
    # imdb data
    (f'/content/drive/MyDrive/recom/imdb_data/batch_{i}_imdb.txt', imdb_data),
    (f'/content/drive/MyDrive/recom/imdb_data/non_data/batch_{i}_non_imdb.txt', imdb_non_data),
    (f'/content/drive/MyDrive/recom/imdb_data/data_name/batch_{i}_name_imdb.txt', imdb_data_name),
  ]
  for out_path, payload in batch_outputs:
    with open(out_path, 'w') as f:
      f.write(str(payload))

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

def web_driver():
    """Create and return a new headless Chrome WebDriver.

    The caller owns the returned driver and is responsible for calling quit() on it.
    """
    # Command-line switches handed to Chrome, applied in this order.
    chrome_flags = (
        "--verbose",
        '--no-sandbox',
        '--headless',
        '--disable-gpu',
        "--window-size=1920, 1200",
        '--disable-dev-shm-usage',
    )
    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)


In [None]:

def extract_google_rate(movie, try_count):
  """Search Google for `movie` and return the leading integer of the rating element's text.

  A fresh browser is started for every attempt and is always shut down afterwards, including
  when start-up itself fails.

  Args:
    movie: query string typed into the Google search box.
    try_count: number of attempts already used. A failed attempt is retried (after a 2 s pause)
      while try_count is below 10, so passing 0 allows up to 11 attempts and passing 9 allows 2.

  Returns:
    The rating as an int, or 0 when every attempt failed.
  """
  import re  # local import so this cell does not depend on an edit to the notebook's import cell

  while True:
    driver = None
    try:
      driver = web_driver()
      driver.get('https://www.google.com')
      element = driver.find_element(By.ID, "APjFqb")
      element.send_keys(movie)
      element.send_keys(Keys.RETURN)
      element = driver.find_element(By.CSS_SELECTOR, 'div.a19vA')
      # Take all leading digits rather than a fixed two characters, so that both "7%..." and
      # "100%..." are read correctly.
      leading_number = re.match(r'\s*(\d+)', element.text)
      if leading_number is None:
        raise ValueError(f'no leading number in rating text: {element.text!r}')
      return int(leading_number.group(1))
    except Exception:
      # Deliberate best effort: any failure (missing element, browser error, unparsable text)
      # just counts as a failed attempt and falls through to the retry logic below.
      pass
    finally:
      # driver is still None when web_driver() itself raised; there is nothing to quit then.
      if driver is not None:
        driver.quit()

    if try_count >= 10:
      return 0
    try_count += 1
    time.sleep(2)


In [None]:
def get_google_data(df, start_index, end_index):
  """Collect Google ratings for one slice of titles.

  Each title is searched as "<title> movie <year>" with the full retry budget; if that yields 0
  it is searched once more as "<title> movie " with a reduced budget (try_count=9).

  Args:
    df: frame with 'movie_name' and 'year' columns.
    start_index: first row position (inclusive) of the slice to process.
    end_index: last row position (exclusive) of that slice.

  Returns:
    (google_rate, google_non_data, movie_name):
      google_rate     -- one rating per title that was processed without an exception,
      google_non_data -- titles with no rating (both searches returned 0, or an exception occurred),
      movie_name      -- titles aligned one-to-one with google_rate.
  """
  google_non_data = []
  google_rate = []
  movie_name = []

  # Slice titles and years together by position so each title is always paired with its own row's
  # year, whatever the frame's index labels are.
  batch = df.iloc[start_index: end_index]
  title_year_pairs = list(zip(batch['movie_name'], batch['year']))

  for movie, year in tqdm(title_year_pairs):
    try:
      rate = extract_google_rate(movie + ' movie ' + str(year), 0)
      if rate == 0:
        rate = extract_google_rate(movie + ' movie ', 9)
        if rate == 0:
          google_non_data.append(movie)

      # NOTE(review): a title with no rating is recorded here with rate 0 *and* listed in
      # google_non_data above. Kept as-is; confirm whether downstream code relies on it.
      google_rate.append(rate)
      movie_name.append(movie)
    except Exception:
      # Best effort per title; `Exception` (not a bare except) keeps the loop interruptible.
      google_non_data.append(movie)

  print(f'rated: {google_rate}')
  print(f'name: {movie_name}')
  print(f'non rated: {google_non_data}')

  return google_rate, google_non_data, movie_name

In [None]:
# Scrape Google ratings ten titles at a time and write every batch to Drive as soon as it finishes.
# The range stops at the row count (not row count + 1): the extra step could only ever add an
# empty trailing batch, which happened whenever the number of titles was a multiple of 10.
for i in trange(0, movie_name_df.shape[0], 10):
  print(f'\n -----> batch {i} \n')
  google_rate, google_non_data, movie_name_google = get_google_data(movie_name_df, i, i + 10)

  # save google data: (destination, payload) pairs, each payload stored as its Python repr via str().
  google_outputs = [
    (f'/content/drive/MyDrive/recom/google_data/batch_{i}_google.txt', google_rate),
    (f'/content/drive/MyDrive/recom/google_data/non_data/batch_{i}_non_google.txt', google_non_data),
    (f'/content/drive/MyDrive/recom/google_data/data_name/batch_{i}_name_google.txt', movie_name_google),
  ]
  for out_path, payload in google_outputs:
    with open(out_path, 'w') as f:
      f.write(str(payload))