In [1]:
# Download the ebiswas/imdb-review-dataset archive using the Kaggle command-line tool.
# NOTE(review): presumably needs Kaggle API credentials configured on this machine — confirm.
!kaggle datasets download -d ebiswas/imdb-review-dataset

Downloading imdb-review-dataset.zip to /home/jupyter
 99%|██████████████████████████████████████▋| 2.67G/2.69G [00:21<00:00, 134MB/s]
100%|███████████████████████████████████████| 2.69G/2.69G [00:21<00:00, 132MB/s]


In [2]:
import zipfile

# Location of the dataset archive and the folder its contents are unpacked into.
path_to_zip_file = '/home/jupyter/imdb-review-dataset.zip'
directory_to_extract_to = '/home/jupyter/dataset'

# Unpack every member of the archive; the context manager closes it afterwards.
with zipfile.ZipFile(file=path_to_zip_file, mode='r') as zip_ref:
    zip_ref.extractall(path=directory_to_extract_to)

In [4]:
# Import dependencies
import json # reading in source file
import re # regular expressions
import time
import pandas as pd

In [5]:
# Load data into dataframe
# Reads each JSON part file, accumulates its review records in `reviews_json`
# (in file order), reports elapsed time, and finally builds `reviews_df`.
dataDir = "/home/jupyter/dataset/"
files_to_load = ["part-01.json", "part-02.json", "part-03.json", "part-04.json", "part-05.json", "part-06.json"]
reviews_json = list()

startTime = time.time()
for current_file in files_to_load:
    print(f"Starting load of {current_file}...")
    # dataDir already ends with '/', so the joined path contains a doubled
    # separator; POSIX treats it the same as a single one.
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(f"{dataDir}/{current_file}", mode='r', encoding='utf-8') as file:
        new_reviews = json.load(file)
        # Bulk-append the parsed records instead of appending one at a time.
        reviews_json.extend(new_reviews)
    print(f"Finished load of {current_file} at {time.time() - startTime:.2f} total seconds elapsed")
print(f"Loading complete after {time.time() - startTime:.2f} seconds, {len(reviews_json):,} items in reviews_json")

reviews_total_count = len(reviews_json)
print(f"Total reviews in the working data: {reviews_total_count:,}")

# Convert list of dicts to pandas dataframe
reviews_df = pd.DataFrame(reviews_json)

Starting load of part-01.json...
Finished load of part-01.json at 16.87 total seconds elapsed
Starting load of part-02.json...
Finished load of part-02.json at 36.07 total seconds elapsed
Starting load of part-03.json...
Finished load of part-03.json at 57.05 total seconds elapsed
Starting load of part-04.json...
Finished load of part-04.json at 76.63 total seconds elapsed
Starting load of part-05.json...
Finished load of part-05.json at 101.34 total seconds elapsed
Starting load of part-06.json...
Finished load of part-06.json at 109.69 total seconds elapsed
Loading complete after 109.69 seconds, 5,571,499 items in reviews_json
Total reviews in the working data: 5,571,499


In [6]:
# Number of entries in the 'movie' column (one per review row)
reviews_df['movie'].shape[0]

5571499

In [7]:
# Number of distinct values in the 'movie' column
len({title for title in reviews_df['movie']})

453528

In [8]:
# Drop unneeded columns & keep only movies that have 100 or more reviews

In [9]:
# Discard the columns this analysis does not need.
# Rebinding the name (rather than inplace=True) together with errors='ignore'
# makes the cell safe to re-run: a second execution no longer raises KeyError
# because the columns have already been removed.
reviews_df = reviews_df.drop(
    columns=['review_id', 'reviewer', 'review_date', 'spoiler_tag', 'helpful'],
    errors='ignore',
)
reviews_df

Unnamed: 0,movie,rating,review_summary,review_detail
0,After Life (2019– ),9,Very Strong Season 2,"I enjoyed the first season, but I must say I t..."
1,The Valhalla Murders (2019– ),6,Icelandic detectives?,I know Iceland is a small country and police d...
2,Special OPS (2020– ),7,Nothing special,"Except K K , no other actor looks comfortable ..."
3,#BlackAF (2020– ),8,Good but,I'm guessing that as a 62 year old white woman...
4,The Droving (2020),2,An honest review,Here's the truth. There's not much to this mov...
...,...,...,...,...
5571494,Sami swoi (1967),10,Funniest Polish Movie Ever,"Actually, the first three or four times I saw ..."
5571495,Sami swoi (1967),3,Horrible,This movie glorifies the worst stereotypes of ...
5571496,Le Samouraï (1967),9,Cult Movies 38,"38. LE SAMOURAI (French-action, 1967) Jef (Ala..."
5571497,Le Samouraï (1967),,"Some people seem to like this a lot, but why?","I found ""Le Samourai"" (**) to be more about st..."


In [10]:
# Count the reviews for each movie title and list the most-reviewed titles first
movie_counts_df = (
    reviews_df
    .groupby('movie')
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
movie_counts_df

Unnamed: 0,movie,counts
33463,Avengers: Endgame (2019),8771
385366,The Shawshank Redemption (1994),8236
96165,Dil Bechara (2020),7764
130559,Game of Thrones: The Iron Throne (2019) Season...,7428
62286,Captain Marvel (2019),7158
...,...,...
206176,"Local: Abandoned (2018) Season 1, Episode 1",1
206177,"Local: Enter Blue, Exit Red (2018) Season 1, E...",1
206178,"Local: The People Eater (2018) Season 1, Episo...",1
206183,Location Production Footage: The Last Temptati...,1


In [11]:
# Keep only the movies that have at least 100 reviews
movie_counts_df = movie_counts_df.loc[movie_counts_df['counts'].ge(100)]
movie_counts_df

Unnamed: 0,movie,counts
33463,Avengers: Endgame (2019),8771
385366,The Shawshank Redemption (1994),8236
96165,Dil Bechara (2020),7764
130559,Game of Thrones: The Iron Throne (2019) Season...,7428
62286,Captain Marvel (2019),7158
...,...,...
356399,The Glimmer Man (1996),100
145619,Harley Davidson and the Marlboro Man (1991),100
130519,"Game of Thrones: Hardhome (2015) Season 5, Epi...",100
235594,My Cousin Rachel (2017),100


In [13]:
# Titles still present in movie_counts_df
remaining_movies = movie_counts_df['movie']
# Keep only the reviews for those titles and renumber the rows from 0
reviews_df = (
    reviews_df
    .loc[reviews_df['movie'].isin(remaining_movies)]
    .reset_index(drop=True)
)
reviews_df

Unnamed: 0,movie,rating,review_summary,review_detail
0,After Life (2019– ),9,Very Strong Season 2,"I enjoyed the first season, but I must say I t..."
1,The Valhalla Murders (2019– ),6,Icelandic detectives?,I know Iceland is a small country and police d...
2,Special OPS (2020– ),7,Nothing special,"Except K K , no other actor looks comfortable ..."
3,#BlackAF (2020– ),8,Good but,I'm guessing that as a 62 year old white woman...
4,All About Eve (1950),10,Amazing,Having seen this film for the first time today...
...,...,...,...,...
3094170,Quatermass and the Pit (1967),,weird,I won't go over the plot. I would have liked a...
3094171,Quatermass and the Pit (1967),8,A great Hammer film,A lot of nonsense is written about the signifi...
3094172,Le Samouraï (1967),9,Cult Movies 38,"38. LE SAMOURAI (French-action, 1967) Jef (Ala..."
3094173,Le Samouraï (1967),,"Some people seem to like this a lot, but why?","I found ""Le Samourai"" (**) to be more about st..."
