In [2]:
import requests
from bs4 import BeautifulSoup as BS
from IPython.core.display import HTML
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## CLEANING IMDb_Movies 

#### Read in IMDb_Movies Table

In [7]:
movie_df = pd.read_csv("../Data/Old_Dirty_Data/IMDb_Movies.csv")

In [31]:
movie_df.head(10)

Unnamed: 0,Title_ID,Title,Director,Rating,Genre,Runtime,Release_Year,Award_Noms,Award_Wins,Score_ID
0,17761,The Shawshank Redemption,Frank Darabont,R,Drama,142,1994,43,21,7041
1,17762,The Godfather,Francis Ford Coppola,R,"Crime, Drama",175,1972,30,32,7042
2,17763,The Dark Knight,Christopher Nolan,PG-13,"Action, Crime, Drama",152,2008,163,160,7043
3,17764,The Godfather Part II,Francis Ford Coppola,R,"Crime, Drama",202,1974,21,17,7044
4,17765,12 Angry Men,Sidney Lumet,Approved,"Crime, Drama",96,1957,13,17,7045
5,17766,Schindler's List,Steven Spielberg,R,"Biography, Drama, History",195,1993,49,91,7046
6,17767,The Lord of the Rings: The Return of the King,Peter Jackson,PG-13,"Action, Adventure, Drama",201,2003,124,209,7047
7,17768,Pulp Fiction,Quentin Tarantino,R,"Crime, Drama",154,1994,75,70,7048
8,17769,The Lord of the Rings: The Fellowship of the Ring,Peter Jackson,PG-13,"Action, Adventure, Drama",178,2001,126,121,7049
9,177610,"The Good, the Bad and the Ugly",Sergio Leone,R,"Adventure, Western",178,1966,6,3,70410


##### Drop 'Unnamed: 0' which I believe is the index in the csv being imported in

In [13]:
movie_df.pop("Unnamed: 0")

0        0
1        1
2        2
3        3
4        4
      ... 
245    245
246    246
247    247
248    248
249    249
Name: Unnamed: 0, Length: 250, dtype: int64

In [15]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title_ID      250 non-null    int64 
 1   Title         250 non-null    object
 2   Director      250 non-null    object
 3   Rating        250 non-null    object
 4   Genre         250 non-null    object
 5   Runtime       250 non-null    int64 
 6   Release_Year  250 non-null    int64 
 7   Award_Noms    250 non-null    int64 
 8   Award_Wins    250 non-null    int64 
 9   Score_ID      250 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 19.7+ KB


#### Next, I need to clean up Ratings and the discrepencies there

Source: Motion Picture Association (MPA)
Link: https://www.motionpictures.org/who-we-are/#our-history
Summary: Founded in 1922, MPA started with the Hays Code system which which simply approved or disapproved based on whether the movies were deemed "moral" or "immoral". In 1968, the Hayes Code was replaced with the modern rating system we have today.
    
Source: IMDb
Link: https://help.imdb.com/article/contribution/titles/certificates/GU757M8ZJ9ZPXB39#
Summary: This website goes through each country and explains the ratings 

From the research that I have conducted, I have decided that this will be the standard to limit the discrepencies in the rating categories.<br>
<bold> DECISION: <bold> Passed and Approved will be combined to be "Approved". Furthermore, Unrated and Not Rated show very little difference in the research and typically does not follow any standard for why one is used over another; thus, these will be combined as "Not Rated"

In [22]:
movie_df['Rating'].value_counts()

R                        101
PG                        37
PG-13                     34
Not Rated                 24
G                         19
Passed                    16
Approved                  13
Not Available on IMDb      1
TV-PG                      1
Unrated                    1
X                          1
TV-MA                      1
GP                         1
Name: Rating, dtype: int64

In [34]:
movie_df['Rating'] = movie_df['Rating'].replace({'GP':'PG', 'Unrated': 'Not Rated', 'Passed':'Approved'})

In [35]:
movie_df['Rating'].value_counts()

R                        101
PG                        38
PG-13                     34
Approved                  29
Not Rated                 25
G                         19
Not Available on IMDb      1
TV-PG                      1
X                          1
TV-MA                      1
Name: Rating, dtype: int64

#### Genre has some misplaced commas between "-" 

In [41]:
movie_df['Genre'] = [x.replace("-, ","-") for x in movie_df['Genre']]

In [43]:
movie_df['Genre'].value_counts()

Drama                           19
Crime, Drama                    14
Animation, Adventure, Comedy    10
Biography, Drama, History        9
Crime, Drama, Mystery            9
                                ..
Action, Thriller                 1
Drama, Mystery, Sci-Fi           1
Comedy, Drama, Family            1
Comedy, Music, Romance           1
Comedy                           1
Name: Genre, Length: 104, dtype: int64

## CLEANING IMDb_Score

#### Read in IMDb_Score Table

In [11]:
score_df = pd.read_csv("../Data/Old_Dirty_Data/IMDb_Score.csv")

In [12]:
score_df.head()

Unnamed: 0.1,Unnamed: 0,Score_ID,IMDb_Score,Audience_Reviews,US_Users,Non_US_Users,Male_Reviews,Female_Reviews,Male_Score,Female_Score
0,0,7041,9.3,2666904,566032,1672983,1475251,294889,9.3,9.2
1,1,7042,9.2,1848055,417566,1161734,1072921,166441,9.2,8.9
2,2,7043,9.0,2639814,621027,1646769,1490958,267216,9.0,8.7
3,3,7044,9.0,1265636,282291,805848,759883,105690,9.0,8.7
4,4,7045,9.0,787589,187528,509017,453568,76860,9.0,8.9


Drop first column

In [16]:
score_df.pop("Unnamed: 0")

0        0
1        1
2        2
3        3
4        4
      ... 
245    245
246    246
247    247
248    248
249    249
Name: Unnamed: 0, Length: 250, dtype: int64

In [17]:
score_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Score_ID          250 non-null    int64  
 1   IMDb_Score        250 non-null    float64
 2   Audience_Reviews  250 non-null    int64  
 3   US_Users          250 non-null    int64  
 4   Non_US_Users      250 non-null    int64  
 5   Male_Reviews      250 non-null    int64  
 6   Female_Reviews    250 non-null    int64  
 7   Male_Score        250 non-null    float64
 8   Female_Score      250 non-null    float64
dtypes: float64(3), int64(6)
memory usage: 17.7 KB


## Cleaning RT Table

#### Read in Rotten_Tomatoes Table

In [52]:
rt_df = pd.read_csv('../Data/Old_Dirty_Data/RT.csv')

In [47]:
rt_df

Unnamed: 0.1,Unnamed: 0,Title_ID,Tomatometer,Audience_Score,Critic_Reviews,Audience_Reviews,US_Box_Office,Distributor,Original_Language
0,0,17761,91.0,98.0,82,"250,000+",$27.3M,Columbia Pictures,English
1,1,17762,97.0,98.0,149,"250,000+",$134.8M,Paramount Pictures,English
2,2,17763,94.0,94.0,345,"250,000+",$2.0M,Warner Bros.,English
3,3,17764,96.0,97.0,123,"250,000+",Not Available,Paramount Pictures,English
4,4,17765,100.0,97.0,58,"100,000+",Not Available,"Criterion Collection, ...",English
...,...,...,...,...,...,...,...,...,...
245,245,1776246,73.0,94.0,15,"5,000+",Not Available,New World Pictures,Russian
246,246,1776247,57.0,94.0,383,"50,000+ Verified",$355.6M,Walt Disney,English
247,247,1776248,76.0,89.0,232,"100,000+",$169.7M,Walt Disney,English
248,248,1776249,96.0,90.0,143,"100,000+",Not Available,Warner Bros. Pictures,English


In [50]:
rt_df.pop("Unnamed: 0")

0        0
1        1
2        2
3        3
4        4
      ... 
245    245
246    246
247    247
248    248
249    249
Name: Unnamed: 0, Length: 250, dtype: int64

In [51]:
rt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title_ID           250 non-null    int64  
 1   Tomatometer        224 non-null    float64
 2   Audience_Score     230 non-null    float64
 3   Critic_Reviews     250 non-null    int64  
 4   Audience_Reviews   250 non-null    object 
 5   US_Box_Office      250 non-null    object 
 6   Distributor        250 non-null    object 
 7   Original_Language  250 non-null    object 
dtypes: float64(2), int64(2), object(4)
memory usage: 15.8+ KB


#### clean audience reviews (ex. 250,000 + to 250K+)

In [54]:
rt_df['Audience_Reviews'].value_counts()

250,000+            97
100,000+            31
50,000+             28
25,000+             27
10,000+             18
0                   14
5,000+              11
Fewer than 50        9
250+                 3
1,000+               3
50,000+ Verified     2
25,000+ Verified     2
100+                 1
2,500+               1
10,000+ Verified     1
500+                 1
2,500+ Verified      1
Name: Audience_Reviews, dtype: int64

In [58]:
rt_df['Audience_Reviews'] = [x.replace(",000+","k+").replace("Verified","").replace(",500+",".5k+") for x in rt_df['Audience_Reviews']]

In [59]:
rt_df['Audience_Reviews'].value_counts()

250k+            97
100k+            31
50k+             28
25k+             27
10k+             18
0                14
5k+              11
Fewer than 50     9
250+              3
1k+               3
50k+              2
25k+              2
100+              1
2,500+            1
10k+              1
500+              1
2,500+            1
Name: Audience_Reviews, dtype: int64

#### Clean Distributor to remove spaces for movies with multiple distributors

### with cleaned up tables, I need to save them

#### Saving as CSV

In [None]:
movie_df.to_csv('../Data/IMDb_Movies.csv')

In [None]:
score_df.to_csv('../Data/IMDb_Score.csv')

In [None]:
rt_df.to_csv('../Data/Rotten_Tomatoes.csv')