# Import of Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('rt.movie_info.tsv.gz', sep='\t')
df.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            1560 non-null   int64 
 1   synopsis      1498 non-null   object
 2   rating        1557 non-null   object
 3   genre         1552 non-null   object
 4   director      1361 non-null   object
 5   writer        1111 non-null   object
 6   theater_date  1201 non-null   object
 7   dvd_date      1201 non-null   object
 8   currency      340 non-null    object
 9   box_office    340 non-null    object
 10  runtime       1530 non-null   object
 11  studio        494 non-null    object
dtypes: int64(1), object(11)
memory usage: 146.4+ KB


In [4]:
df2 = pd.read_csv('rt.reviews.tsv.gz', sep='\t', encoding='latin-1')
df2.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"


In [5]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54432 entries, 0 to 54431
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          54432 non-null  int64 
 1   review      48869 non-null  object
 2   rating      40915 non-null  object
 3   fresh       54432 non-null  object
 4   critic      51710 non-null  object
 5   top_critic  54432 non-null  int64 
 6   publisher   54123 non-null  object
 7   date        54432 non-null  object
dtypes: int64(2), object(6)
memory usage: 3.3+ MB


# Cleaning Columns & Feature Engineering

I create a `release_year` feature to aggregate values by year in my analysis.

In [6]:
df["theater_date"] = pd.to_datetime(df["theater_date"])

In [7]:
df["release_year"] = df["theater_date"].dt.to_period('Y')
df.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio,release_year
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,1971-10-09,"Sep 25, 2001",,,104 minutes,,1971
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,2012-08-17,"Jan 1, 2013",$,600000.0,108 minutes,Entertainment One,2012
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,1996-09-13,"Apr 18, 2000",,,116 minutes,,1996
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,1994-12-09,"Aug 27, 1997",,,128 minutes,,1994
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,NaT,,,,200 minutes,,NaT


In [8]:
len(pd.unique(df2['id']))

1135

I create an averaged `review_score` for further analysis into possible correlation between it and `box_office` revenue.

In [9]:
df2["fresh"].unique()

array(['fresh', 'rotten'], dtype=object)

In [10]:
conditions = [(df2["fresh"] == 'rotten'), (df2["fresh"] == 'fresh')]
values = ['0', '100']

df2['review_score'] = np.select(conditions, values)
df2.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date,review_score
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018",100
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018",0
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018",100
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017",100
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017",100


In [11]:
df2["review_score"] = df2["review_score"].astype(int)

In [12]:
avg_review_df = df2.groupby("id")["review_score"].mean()
avg_review_df.to_frame()

Unnamed: 0_level_0,review_score
id,Unnamed: 1_level_1
3,63.190184
5,78.260870
6,56.140351
8,74.666667
10,46.296296
...,...
1996,67.132867
1997,35.714286
1998,100.000000
1999,58.695652


# Merge

In [13]:
df_final = pd.merge(df, avg_review_df, on=["id"])
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1135 entries, 0 to 1134
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            1135 non-null   int64         
 1   synopsis      1118 non-null   object        
 2   rating        1134 non-null   object        
 3   genre         1133 non-null   object        
 4   director      1014 non-null   object        
 5   writer        891 non-null    object        
 6   theater_date  996 non-null    datetime64[ns]
 7   dvd_date      996 non-null    object        
 8   currency      299 non-null    object        
 9   box_office    299 non-null    object        
 10  runtime       1123 non-null   object        
 11  studio        415 non-null    object        
 12  release_year  996 non-null    period[A-DEC] 
 13  review_score  1135 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(10), period[A-DEC](1)
memory usage: 1

In [14]:
df_final["review_score"] = df_final["review_score"].astype(int)

To enable analysis into the different genres, I split and explode these values into their own rows.

In [15]:
df_final["genre"] = df_final["genre"].str.split(pat = '|')
df_final = df_final.explode("genre")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            1560 non-null   int64         
 1   synopsis      1498 non-null   object        
 2   rating        1557 non-null   object        
 3   genre         1552 non-null   object        
 4   director      1361 non-null   object        
 5   writer        1111 non-null   object        
 6   theater_date  1201 non-null   datetime64[ns]
 7   dvd_date      1201 non-null   object        
 8   currency      340 non-null    object        
 9   box_office    340 non-null    object        
 10  runtime       1530 non-null   object        
 11  studio        494 non-null    object        
 12  release_year  1201 non-null   period[A-DEC] 
dtypes: datetime64[ns](1), int64(1), object(10), period[A-DEC](1)
memory usage: 158.6+ KB


# Removing Interefering Nulls

Filling the null `box_office` values would skew data too far, so they are removed.

In [16]:
df_final = df_final.dropna(subset=["box_office"])
df_final["box_office"] = df_final["box_office"].str.replace(',', '').astype(float)

In [17]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 637 entries, 0 to 1130
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            637 non-null    int64         
 1   synopsis      637 non-null    object        
 2   rating        637 non-null    object        
 3   genre         637 non-null    object        
 4   director      547 non-null    object        
 5   writer        498 non-null    object        
 6   theater_date  633 non-null    datetime64[ns]
 7   dvd_date      633 non-null    object        
 8   currency      637 non-null    object        
 9   box_office    637 non-null    float64       
 10  runtime       635 non-null    object        
 11  studio        562 non-null    object        
 12  release_year  633 non-null    period[A-DEC] 
 13  review_score  637 non-null    int32         
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1), object(9), period[A-DEC](1)
memory 

In [18]:
df_final.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio,release_year,review_score
0,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama,David Cronenberg,David Cronenberg|Don DeLillo,2012-08-17,"Jan 1, 2013",$,600000.0,108 minutes,Entertainment One,2012,63
0,3,"New York City, not-too-distant-future: Eric Pa...",R,Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,2012-08-17,"Jan 1, 2013",$,600000.0,108 minutes,Entertainment One,2012,63
4,10,Some cast and crew from NBC's highly acclaimed...,PG-13,Comedy,Jake Kasdan,Mike White,2002-01-11,"Jun 18, 2002",$,41032915.0,82 minutes,Paramount Pictures,2002,46
5,13,"Stewart Kane, an Irishman living in the Austra...",R,Drama,Ray Lawrence,Raymond Carver|Beatrix Christian,2006-04-27,"Oct 2, 2007",$,224114.0,123 minutes,Sony Pictures Classics,2006,64
6,14,"""Love Ranch"" is a bittersweet love story that ...",R,Drama,Taylor Hackford,Mark Jacobson,2010-06-30,"Nov 9, 2010",$,134904.0,117 minutes,,2010,14


In [19]:
df_final.to_csv('re_cleaned.csv')