In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the gzipped movie-info table (tab-delimited) and preview the first rows
movie_info_path = 'rt.movie_info.tsv.gz'
df = pd.read_csv(movie_info_path, sep='\t')
df.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,


In [3]:
# The reviews file is tab-delimited as well, and it has to be read with the latin-1 encoding.
# The preview shows that a single movie id carries several reviews.
reviews_path = 'rt.reviews.tsv.gz'
df2 = pd.read_csv(reviews_path, sep='\t', encoding='latin-1')
df2.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"


In [4]:
# Column dtypes and non-null counts for the movie table.
# Most box_office values are missing: only 340 of the 1560 rows have one.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            1560 non-null   int64 
 1   synopsis      1498 non-null   object
 2   rating        1557 non-null   object
 3   genre         1552 non-null   object
 4   director      1361 non-null   object
 5   writer        1111 non-null   object
 6   theater_date  1201 non-null   object
 7   dvd_date      1201 non-null   object
 8   currency      340 non-null    object
 9   box_office    340 non-null    object
 10  runtime       1530 non-null   object
 11  studio        494 non-null    object
dtypes: int64(1), object(11)
memory usage: 146.4+ KB


In [5]:
# Same overview for the reviews table: column dtypes and non-null counts
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54432 entries, 0 to 54431
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          54432 non-null  int64 
 1   review      48869 non-null  object
 2   rating      40915 non-null  object
 3   fresh       54432 non-null  object
 4   critic      51710 non-null  object
 5   top_critic  54432 non-null  int64 
 6   publisher   54123 non-null  object
 7   date        54432 non-null  object
dtypes: int64(2), object(6)
memory usage: 3.3+ MB


In [6]:
# Count how many distinct movies actually have reviews in this table
df2['id'].nunique()

1135

In [7]:
# Attach every review to its movie's details by joining the two tables on the shared "id" column
df_merge = df.merge(df2, on="id")
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54432 entries, 0 to 54431
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            54432 non-null  int64 
 1   synopsis      54300 non-null  object
 2   rating_x      54346 non-null  object
 3   genre         54345 non-null  object
 4   director      48992 non-null  object
 5   writer        45206 non-null  object
 6   theater_date  53206 non-null  object
 7   dvd_date      53206 non-null  object
 8   currency      33310 non-null  object
 9   box_office    33310 non-null  object
 10  runtime       53594 non-null  object
 11  studio        40125 non-null  object
 12  review        48869 non-null  object
 13  rating_y      40915 non-null  object
 14  fresh         54432 non-null  object
 15  critic        51710 non-null  object
 16  top_critic    54432 non-null  int64 
 17  publisher     54123 non-null  object
 18  date          54432 non-null  object
dtypes: i

In [8]:
# Rows without a box_office figure would interfere with the correlation outputs, so they are removed.
# Filling the gaps with the column's median or mode instead would skew the data completely.
has_box_office = df_merge["box_office"].notna()
df_merge = df_merge.loc[has_box_office]

In [9]:
# Next, the "fresh" column has to become something we can analyse numerically.
# Start by listing the distinct values it contains.
pd.unique(df_merge["fresh"])

array(['fresh', 'rotten'], dtype=object)

In [10]:
# Rotten means a review score below 60%, fresh means 60% or above.
# Although it is a small margin, this is what we are going to define as something disliked vs liked.
# Every rotten review is scored 0 and every fresh review 100, stored in a new column,
# so that the mean per movie gives an average review percentage.
# The scores are real integers rather than the strings '0'/'100': with string choices,
# np.select's implicit integer default (0) has no common dtype on recent NumPy releases,
# and numeric values can be averaged without an extra parsing step.
conditions = [(df_merge["fresh"] == 'rotten'), (df_merge["fresh"] == 'fresh')]
values = [0, 100]

# default=0 keeps the previous fallback for any label that matches neither condition
df_merge['review_score'] = np.select(conditions, values, default=0)
df_merge.head()

Unnamed: 0,id,synopsis,rating_x,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio,review,rating_y,fresh,critic,top_critic,publisher,date,review_score
0,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000,108 minutes,Entertainment One,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018",100
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000,108 minutes,Entertainment One,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018",0
2,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000,108 minutes,Entertainment One,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018",100
3,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000,108 minutes,Entertainment One,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017",100
4,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000,108 minutes,Entertainment One,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017",100


In [11]:
# Re-check the merged frame after dropping rows without box_office and adding review_score
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33310 entries, 0 to 54317
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            33310 non-null  int64 
 1   synopsis      33310 non-null  object
 2   rating_x      33310 non-null  object
 3   genre         33310 non-null  object
 4   director      29368 non-null  object
 5   writer        27634 non-null  object
 6   theater_date  33199 non-null  object
 7   dvd_date      33199 non-null  object
 8   currency      33310 non-null  object
 9   box_office    33310 non-null  object
 10  runtime       33128 non-null  object
 11  studio        30967 non-null  object
 12  review        32157 non-null  object
 13  rating_y      25538 non-null  object
 14  fresh         33310 non-null  object
 15  critic        32085 non-null  object
 16  top_critic    33310 non-null  int64 
 17  publisher     33104 non-null  object
 18  date          33310 non-null  object
 19  revi

In [12]:
# The next step averages the scores, so make sure review_score is stored as integers
review_score_as_int = df_merge["review_score"].astype(int)
df_merge["review_score"] = review_score_as_int
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33310 entries, 0 to 54317
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            33310 non-null  int64 
 1   synopsis      33310 non-null  object
 2   rating_x      33310 non-null  object
 3   genre         33310 non-null  object
 4   director      29368 non-null  object
 5   writer        27634 non-null  object
 6   theater_date  33199 non-null  object
 7   dvd_date      33199 non-null  object
 8   currency      33310 non-null  object
 9   box_office    33310 non-null  object
 10  runtime       33128 non-null  object
 11  studio        30967 non-null  object
 12  review        32157 non-null  object
 13  rating_y      25538 non-null  object
 14  fresh         33310 non-null  object
 15  critic        32085 non-null  object
 16  top_critic    33310 non-null  int64 
 17  publisher     33104 non-null  object
 18  date          33310 non-null  object
 19  revi

In [13]:
# Averaging the 0/100 scores per movie id gives each movie's average score percentage
df_merge["review_score"].groupby(df_merge["id"]).mean()

id
3       63.190184
10      46.296296
13      64.210526
14      14.285714
23      91.416309
          ...    
1976    96.153846
1980    81.764706
1981    75.000000
1986    97.752809
1996    67.132867
Name: review_score, Length: 299, dtype: float64

In [14]:
# Build the analysis frame: one row per movie with only the columns the analysis needs,
# plus the mean review score stored as "review_percent".
# NOTE(review): groupby skips rows whose key columns are missing, which presumably explains
# why 299 ids were averaged above but only 296 rows end up here (theater_date has nulls) -
# confirm that excluding those movies is intended.
movie_keys = ["id", "genre", "box_office", "theater_date"]
df_final = (
    df_merge.groupby(movie_keys)["review_score"]
    .mean()
    .rename("review_percent")
    .reset_index()
)
df_final.head()

Unnamed: 0,id,genre,box_office,theater_date,review_percent
0,3,Drama|Science Fiction and Fantasy,600000,"Aug 17, 2012",63.190184
1,10,Comedy,41032915,"Jan 11, 2002",46.296296
2,13,Drama,224114,"Apr 27, 2006",64.210526
3,14,Drama,134904,"Jun 30, 2010",14.285714
4,23,Drama,99165609,"Dec 20, 2013",91.416309


In [15]:
# Check the dtypes of the per-movie frame before converting them
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 296 entries, 0 to 295
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              296 non-null    int64  
 1   genre           296 non-null    object 
 2   box_office      296 non-null    object 
 3   theater_date    296 non-null    object 
 4   review_percent  296 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 11.7+ KB


In [16]:
# The overview above shows theater_date stored as plain strings (object dtype),
# so parse it into real datetimes here; review_percent is cast to int in the following step.
parsed_theater_dates = pd.to_datetime(df_final['theater_date'])
df_final['theater_date'] = parsed_theater_dates
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 296 entries, 0 to 295
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              296 non-null    int64         
 1   genre           296 non-null    object        
 2   box_office      296 non-null    object        
 3   theater_date    296 non-null    datetime64[ns]
 4   review_percent  296 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 11.7+ KB


In [17]:
# Store review_percent as whole percentages. Round to the nearest integer first:
# a bare astype(int) truncates, so e.g. 97.75 would be stored as 97 instead of 98.
df_final["review_percent"] = df_final["review_percent"].round().astype(int)

In [18]:
# Preview the frame after the dtype conversions
df_final.head()

Unnamed: 0,id,genre,box_office,theater_date,review_percent
0,3,Drama|Science Fiction and Fantasy,600000,2012-08-17,63
1,10,Comedy,41032915,2002-01-11,46
2,13,Drama,224114,2006-04-27,64
3,14,Drama,134904,2010-06-30,14
4,23,Drama,99165609,2013-12-20,91


In [19]:
# The final clean-up step is to separate the genre strings into their individual values.
# Because several genres are joined by '|' in one string, every combination counts as its own
# unique value, so there are far more unique values than there are actual genres.
df_final["genre"].describe()

count       296
unique       88
top       Drama
freq         40
Name: genre, dtype: object

In [20]:
# Preview the split: each genre string becomes a list of its individual genres
df_final["genre"].str.split('|')


0                   [Drama, Science Fiction and Fantasy]
1                                               [Comedy]
2                                                [Drama]
3                                                [Drama]
4                                                [Drama]
                             ...                        
291    [Action and Adventure, Art House and Internati...
292    [Action and Adventure, Science Fiction and Fan...
293                                      [Comedy, Drama]
294         [Art House and International, Comedy, Drama]
295    [Action and Adventure, Horror, Mystery and Sus...
Name: genre, Length: 296, dtype: object

In [None]:
# Split the '|'-joined genre string into one column per genre: genre_1, genre_2, ...
# The number of columns follows the data instead of being fixed at five, because assigning
# the split result to a hard-coded list of five names raises as soon as some movie has more
# genres than that (or no movie has that many).
genre_parts = df_final["genre"].str.split(pat = '|', expand = True)

# Always provide at least genre_1..genre_5 (padded with missing values) so code that
# expects those five columns keeps working even when the data has fewer genres per movie.
genre_parts = genre_parts.reindex(columns=range(max(5, genre_parts.shape[1])))

genre_columns = [f"genre_{position}" for position in range(1, genre_parts.shape[1] + 1)]
df_final[genre_columns] = genre_parts
df_final.head()

In [None]:
# The combined genre string is no longer needed once it has been split into separate columns
del df_final["genre"]
df_final.head()

In [None]:
# Summary statistics for the cleaned per-movie frame.
# NOTE(review): box_office was still object dtype in the df_final.info() output earlier, so
# unless it is converted to a number it will presumably be left out of this summary - confirm.
df_final.describe()