Experiment - 7

In [1]:
# Data preprocessing using Pandas.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io

from google.colab import files

# Ask Colab for a file upload. The returned mapping is looked up by filename
# ('movie_metadata.csv') when the CSV is parsed in the next cell.
uploaded = files.upload()

Saving movie_metadata.csv to movie_metadata.csv


In [12]:
# The upload is held in memory under its filename; wrap it in a file-like
# buffer so pandas can parse it exactly as it would a CSV on disk.
csv_buffer = io.BytesIO(uploaded['movie_metadata.csv'])
data = pd.read_csv(csv_buffer)

# Plain-text dump of the frame (pandas elides the middle rows/columns with "...").
print(data)

      color      director_name  num_critic_for_reviews  duration  \
0     Color      James Cameron                   723.0     178.0   
1     Color     Gore Verbinski                   302.0     169.0   
2     Color         Sam Mendes                   602.0     148.0   
3     Color  Christopher Nolan                   813.0     164.0   
4       NaN        Doug Walker                     NaN       NaN   
...     ...                ...                     ...       ...   
5038  Color        Scott Smith                     1.0      87.0   
5039  Color                NaN                    43.0      43.0   
5040  Color   Benjamin Roberds                    13.0      76.0   
5041  Color        Daniel Hsia                    14.0     100.0   
5042  Color           Jon Gunn                    43.0      90.0   

      director_facebook_likes  actor_3_facebook_likes      actor_2_name  \
0                         0.0                   855.0  Joel David Moore   
1                       563.0    

In [13]:
# Preview the first five rows of the raw frame, before the cleaning cell below runs.
data.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [14]:
##### Clean the movie data #####
# Every pandas call below returns a NEW frame instead of editing in place, so the
# steps are chained and the result is bound back to `data` once. (Calling
# data.dropna(...) on its own line and ignoring the return value removes nothing.)
data = (
    data
    ##### Fill missing values #####
    # Missing countries become empty strings.
    # With numerical data like the duration of the movie, a calculation like taking
    # the mean duration can help us even the dataset out.
    .assign(
        country=lambda frame: frame['country'].fillna(""),
        duration=lambda frame: frame['duration'].fillna(frame['duration'].mean()),
    )
    ##### Remove incomplete rows #####
    # Here we put a limitation on how many non-null values need to be in a row in
    # order to keep it. So, the data needs to have at least 5 non-null values.
    .dropna(thresh=5)
    ##### Deal with error-prone columns #####
    # Drop the columns that are all NA values.
    .dropna(axis=1, how='all')
    ##### Normalize data types / change casing #####
    # 'duration' was already mean-filled above, so it can be cast straight to an
    # integer (the fractional part of the filled mean is truncated).
    # All movie titles are changed to uppercase.
    .assign(
        duration=lambda frame: frame['duration'].astype(int),
        movie_title=lambda frame: frame['movie_title'].str.upper(),
    )
    ##### Rename columns #####
    # Rename 'title_year' to 'release_date' and 'movie_facebook_likes' to simply
    # 'facebook_likes'.
    .rename(columns={'title_year': 'release_date', 'movie_facebook_likes': 'facebook_likes'})
)

# Strictest variant: only rows with no NA value in any column. It is kept as a
# separate frame because applying it to `data` would also throw away rows that
# are merely missing a few fields.
data_complete = data.dropna()

# Output the updated dataframe
print(data.head())

   color      director_name  num_critic_for_reviews  duration  \
0  Color      James Cameron                   723.0       178   
1  Color     Gore Verbinski                   302.0       169   
2  Color         Sam Mendes                   602.0       148   
3  Color  Christopher Nolan                   813.0       164   
4    NaN        Doug Walker                     NaN       107   

   director_facebook_likes  actor_3_facebook_likes      actor_2_name  \
0                      0.0                   855.0  Joel David Moore   
1                    563.0                  1000.0     Orlando Bloom   
2                      0.0                   161.0      Rory Kinnear   
3                  22000.0                 23000.0    Christian Bale   
4                    131.0                     NaN        Rob Walker   

   actor_1_facebook_likes        gross                           genres  ...  \
0                  1000.0  760505847.0  Action|Adventure|Fantasy|Sci-Fi  ...   
1               

In [15]:
# Preview the cleaned frame: 'duration' is now an integer column and the renamed
# columns ('release_date', 'facebook_likes') replace the original names.
data.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,release_date,actor_2_facebook_likes,imdb_score,aspect_ratio,facebook_likes
0,Color,James Cameron,723.0,178,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,107,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
