In [1]:
# Mount Google Drive at /content/drive so the files used by this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Switch the working directory to the project folder on the mounted Drive, so the
# relative paths used in later cells ('id_.txt', './data/...') resolve against it.
%cd '/content/drive/MyDrive/DS'

/content/drive/MyDrive/DS


In [3]:
!pip install requests_cache



In [4]:
import json
import os
import time
from random import *

import numpy as np
import pandas as pd
import requests as req
import requests_cache

In [5]:
# Turn on requests_cache with its default settings; get_data() and the
# Top250Movies cell read `from_cache` on the responses they get back.
requests_cache.install_cache()

In [6]:
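# Which API key was used for which request:
# k_p01dgqso : the Top250Movies list request; reviews of bottom-rated ids [100, 220)
# k_rjf0le7o : reviews of bottom-rated ids [0, 100)
# k_3hi7jya4 : reviews of top-rated (Top250Movies) ids [0, 100)
# NOTE(review): these keys are credentials; consider loading them from the environment instead of committing them.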

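* Define the `get_data` function, with parameters `api_key`, `reviews`, `lst_id`, `start_` and `end_`: for each id in `lst_id[start_:end_]` it requests that film's reviews using `api_key` and appends them to `reviews`.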

In [7]:
def get_data(api_key,reviews,lst_id,start_=0,end_=100,session=None):
  """Fetch reviews for a slice of film ids and append them to ``reviews``.

  For every id in ``lst_id[start_:end_]`` a GET request is sent to the
  imdb-api.com ``Reviews`` endpoint, the response body is parsed as JSON and
  the entries under its ``items`` key are appended to ``reviews``.

  Parameters
  ----------
  api_key : str
      API key placed in the request URL.
  reviews : list
      List that is extended in place with the fetched review entries.
  lst_id : sequence
      Film ids; only ``lst_id[start_:end_]`` is requested.
  start_, end_ : int
      Slice bounds into ``lst_id`` (defaults: 0 and 100).
  session : object, optional
      Any object exposing ``get(url)`` whose result has a ``text`` attribute
      (for example a ``requests.Session``). Defaults to the module-level
      ``req`` used by the rest of the notebook.

  Returns
  -------
  list
      The same ``reviews`` object that was passed in.
  """
  import warnings  # local import: keeps this cell self-contained

  client = req if session is None else session
  for film_id in lst_id[start_:end_]:
    url = f'https://imdb-api.com/en/API/Reviews/{api_key}/{film_id}'
    response = client.get(url)
    # Only pause after a request that was not answered from the cache. A
    # response without a `from_cache` attribute is treated as not cached.
    if not getattr(response, 'from_cache', False):
      time.sleep(1)
    payload = json.loads(response.text)
    # NOTE(review): the API is presumed to report failures (e.g. an exhausted
    # quota) in an `errorMessage` field -- confirm against the API docs.
    error_message = payload.get('errorMessage')
    if error_message:
      warnings.warn(f'Reviews request for {film_id} reported: {error_message}')
    # A missing or null `items` contributes nothing instead of raising.
    reviews.extend(payload.get('items') or [])
  return reviews

**GET DATA FROM 250 BOTTOM RATED FILMS**

* Read the file 'id_.txt', which stores the ids of the 250 lowest-rated films, one id per line. (Because of some restrictions, we could not get this list by parsing HTML or through the API, so we collected the ids by hand and saved them in a text file.)

In [8]:
# Read the hand-collected film ids, one per line. Surrounding whitespace is
# removed and blank lines are skipped, so a trailing empty line in the file
# cannot turn into a request for an empty id later on.
with open('id_.txt') as f:
  lst_id = [line.strip() for line in f if line.strip()]

* For each id, send a GET request to the IMDb API to fetch that film's reviews, then store these reviews in a list.

In [9]:
# Start from an empty list and collect the reviews for the first 100 ids
# (get_data's default slice [0:100]) read from 'id_.txt'.
reviews = []
reviews = get_data('k_rjf0le7o',reviews,lst_id)

In [10]:
# Continue with the ids at positions 100..219 of the same list, using a different API key.
reviews = get_data('k_p01dgqso',reviews,lst_id,100,220)

**GET DATA FROM 250 TOP RATED FILMS**

* Send a get request to IMDb to get a list of top-rated films.

In [11]:
# Request the Top250Movies list and keep the raw response body for the next cell.
url = 'https://imdb-api.com/en/API/Top250Movies/k_p01dgqso'
r=req.get(url)
data=r.text
# Shown as the cell output: whether this response was served from the cache.
r.from_cache

True

* Keep only the film ids from the result above and store them in the list `id_`.

In [12]:
# Parse the response body and keep just the list stored under 'items'.
data = json.loads(data)['items']
# Collect the 'id' field of every entry, in the order returned.
id_ = [item['id'] for item in data]

* Use only the top 100 films: call the `get_data` function to fetch their reviews and append them to the `reviews` list.

In [13]:
# Keep the first 100 ids of the Top250Movies result.
# NOTE: this rebinds lst_id, which previously held the ids read from 'id_.txt'.
lst_id = id_[:100]

In [14]:
# Append the reviews for lst_id (default slice [0:100]) to the list collected so far.
reviews = get_data('k_3hi7jya4',reviews,lst_id)

**PREPROCESS DATA**

* To keep the data from being skewed, we keep only the first 7100 (approximately 7000) reviews that our team could collect.

In [15]:
# Take the first 7100 collected reviews as a new list; `reviews` itself is not modified.
get_ = reviews[:7100]

* Shuffle the data to break up the clear separation between the clusters of good and bad reviews.

In [16]:
# seed() and shuffle() come from `from random import *`. Seeding with a fixed
# value makes the shuffle order repeatable; shuffle() reorders get_ in place.
seed(4)
shuffle(get_)

* Store the necessary information from each review (title, content, and rate) in corresponding lists; empty fields are stored as NaN.

In [17]:
# Build three parallel lists, one entry per review. An empty string in a field
# is recorded as NaN; a non-empty rate is parsed as an int and stored as a float.
titles, contents, rates = [], [], []
for r in get_:
  rates.append(float(int(r['rate'])) if r['rate'] != '' else np.nan)
  titles.append(r['title'] if r['title'] != '' else np.nan)
  contents.append(r['content'] if r['content'] != '' else np.nan)

* Create DataFrame based on data above, then remove rows that have an empty value.

In [18]:
# Assemble the three parallel lists into one frame (columns in the order
# Title, Content, Rate) and drop every row that has a NaN in any column.
df = pd.DataFrame({
  'Title': titles,
  'Content': contents,
  'Rate': rates,
}).dropna()

In [19]:
# Preview the first 5 rows.
df.head(5)

Unnamed: 0,Title,Content,Rate
0,"A crime against humanity, it's like a form of ...",This film is so bad that it would be preferabl...,1.0
2,4.0 what the hell,how can this film get a higher rating than par...,2.0
3,don't even think about it...,"What shall I say, seeing the trailer was bad e...",1.0
4,It's not that bad,I barely understand why this film was harshly ...,5.0
6,What a goofy film!,"Clumsy comedy with cheesy ""special effects"" (i...",3.0


* Convert the values in the Rate column from numerical to categorical with 2 values: Good for a rate greater than 5 and Bad otherwise.

In [20]:
# Rates that are labelled 'Bad'; anything greater than 5 is labelled 'Good'.
temp = [0,1,2,3,4,5]
# Compare against a numeric view of the column. Entries that are already labels
# ('Good'/'Bad') coerce to NaN and match neither condition, so re-running this
# cell leaves them untouched instead of failing on a str/number comparison.
rate_numeric = pd.to_numeric(df['Rate'], errors='coerce')
# Build the labelled column on an object-dtype copy and assign it in one step,
# rather than writing strings into a float column through .loc.
df['Rate'] = (
  df['Rate']
  .astype(object)
  .mask(rate_numeric > 5, 'Good')
  .mask(rate_numeric.isin(temp), 'Bad')
)

In [21]:
# Preview the first 5 rows again; Rate now holds the Good/Bad labels.
df.head(5)

Unnamed: 0,Title,Content,Rate
0,"A crime against humanity, it's like a form of ...",This film is so bad that it would be preferabl...,Bad
2,4.0 what the hell,how can this film get a higher rating than par...,Bad
3,don't even think about it...,"What shall I say, seeing the trailer was bad e...",Bad
4,It's not that bad,I barely understand why this film was harshly ...,Bad
6,What a goofy film!,"Clumsy comedy with cheesy ""special effects"" (i...",Bad


* Remove duplicate rows.

In [22]:
# Keep the first occurrence of each fully identical row.
df = df.drop_duplicates()

* Check whether the data is skewed.

In [23]:
# Count the rows per Rate label to compare the class sizes.
df.Rate.value_counts()

Good    2931
Bad     2902
Name: Rate, dtype: int64

* Save dataframe to excel file.

In [24]:
# Write the labelled reviews to an Excel file without the index column.
# The target folder is created first so the write does not fail when './data'
# does not exist yet in the working directory.
output_path = './data/data_rates_cate.xlsx'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_excel(output_path,index=False)