# Data Understanding

In [5]:
# Importing libraries pandas and numpy
import pandas as pd
import numpy as np

In [2]:
# Importing seaborn and matplotlib 
import seaborn as sns

import matplotlib 
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib.pyplot import figure

In [3]:
# Setting the size of the plots/Figures
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12, 8)

In [4]:
# Read the movies dataset from disk, then preview its first five records.

MOVIES_CSV = "movies.csv"
Movies = pd.read_csv(MOVIES_CSV)
Movies.head(5)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


In [6]:
# Checking the dimensions of the dataset as (rows, columns)
Movies.shape

# 7668 records (rows) and 15 columns

(7668, 15)

In [7]:
# Listing the column labels of the dataset
Movies.columns

Index(['name', 'rating', 'genre', 'year', 'released', 'score', 'votes',
       'director', 'writer', 'star', 'country', 'budget', 'gross', 'company',
       'runtime'],
      dtype='object')

In [8]:
# Summarising every column: non-null count and dtype (this also shows which columns have missing values)

Movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7668 entries, 0 to 7667
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      7668 non-null   object 
 1   rating    7591 non-null   object 
 2   genre     7668 non-null   object 
 3   year      7668 non-null   int64  
 4   released  7666 non-null   object 
 5   score     7665 non-null   float64
 6   votes     7665 non-null   float64
 7   director  7668 non-null   object 
 8   writer    7665 non-null   object 
 9   star      7667 non-null   object 
 10  country   7665 non-null   object 
 11  budget    5497 non-null   float64
 12  gross     7479 non-null   float64
 13  company   7651 non-null   object 
 14  runtime   7664 non-null   float64
dtypes: float64(5), int64(1), object(9)
memory usage: 898.7+ KB


# Data Cleaning

### Data Validation


In [84]:
# Convert the monetary columns (budget, gross) from float64 to int64.
#
# When the notebook is run top to bottom both columns still contain NaN here
# (Movies.info() reports 5497 non-null budget and 7479 non-null gross values out
# of 7668 rows), and astype('int64') raises on NaN. The gaps are therefore closed
# first with the same linear interpolation the Completeness section uses, so the
# cast works no matter which of the two cells runs first.
# Row 0 has both values, so the default forward interpolation leaves no NaN behind.
# astype('int64') drops the fractional part of any interpolated value.
MONEY_COLUMNS = ['budget', 'gross']

for money_col in MONEY_COLUMNS:
    Movies[money_col] = (
        Movies[money_col]
        .interpolate(method='linear')
        .astype('int64')
    )

# Last expression of the cell, so the resulting dtypes are actually displayed.
Movies.dtypes

In [None]:
# Changing datatype for released to datetime.
# The raw values look like "June 13, 1980 (United States)", so the trailing
# "(country)" part is stripped before parsing. Missing values and values that
# cannot be parsed become NaT instead of raising.
# (The previous version assigned the `.astype` method itself, without calling it,
# which replaced the column contents with a method object.)
# NOTE(review): the Completeness section fills missing `released` values with raw
# strings, which assumes the column is still text. Run that fill before this
# conversion, or fill with Timestamps instead.
Movies['released'] = pd.to_datetime(
    Movies['released'].astype('string').str.replace(r'\s*\(.*\)\s*$', '', regex=True),
    errors='coerce',
)

In [None]:
# Previewing the first five rows again after the Data Validation cells above
Movies.head()

### Completeness

In [79]:
# Filling in missing values, column by column.
#
# Every fill is assigned back to the DataFrame column. The chained form
# `Movies.col.fillna(..., inplace=True)` acts on a temporary Series and, with
# pandas Copy-on-Write, leaves `Movies` itself unchanged.
# (Movies.isna().mean() would give the missing fraction per column instead of
# the counts displayed at the end of this cell.)

# Rating: titles without a rating get an explicit label.
Movies['rating'] = Movies['rating'].fillna('Not Rated')

# Released: on doing research, the two missing release dates are from 2010.
# limit=1 makes each call fill only the first remaining gap, so the two values
# are applied in row order.
# NOTE(review): assumes `released` still holds the raw text; if it has already
# been converted to datetime, fill with Timestamps instead.
Movies['released'] = (
    Movies['released']
    .fillna('October 29, 2010 (United States)', limit=1)
    .fillna('February 12, 2010 (United States)', limit=1)
)

# Score: fill with the median score. The median must be *called*; passing the
# bound method `Movies.score.median` does not fill with a number.
Movies['score'] = Movies['score'].fillna(Movies['score'].median())

# Votes: fill with the mean of the observed vote counts.
Movies['votes'] = Movies['votes'].fillna(Movies['votes'].mean())

# Writer: the first two gaps are filled, in row order, with the researched names.
# Movies.info() shows three missing writers, so one gap would survive a fresh
# top-to-bottom run; it is given a neutral placeholder.
# NOTE(review): replace 'Unknown' with the real writer if it can be looked up.
Movies['writer'] = (
    Movies['writer']
    .fillna('Jane Garmey', limit=1)
    .fillna("Beth O' Leary", limit=1)
    .fillna('Unknown')
)

# Star: one value is missing.
Movies['star'] = Movies['star'].fillna('Ethel Merman')

# Country: all movies with a missing country are from the USA.
Movies['country'] = Movies['country'].fillna('United States')

# Gross and budget: linear interpolation between neighbouring rows
# (each column only needs to be interpolated once).
for money_col in ['gross', 'budget']:
    Movies[money_col] = Movies[money_col].interpolate(method='linear', axis=0)

# Company: carry the previous row's value forward. Runtime: take the next row's value.
# .ffill() / .bfill() replace the deprecated fillna(method=...) spelling.
Movies['company'] = Movies['company'].ffill()
Movies['runtime'] = Movies['runtime'].bfill()

# Last expression of the cell, so it is displayed: remaining missing values per column.
Movies.isnull().sum()


### Consistency


In [81]:
# Counting rows that are exact duplicates of an earlier row
Movies.duplicated().sum()

# The result is 0: there are no duplicated records

0

name        0
rating      0
genre       0
year        0
released    0
score       0
votes       0
director    0
writer      0
star        0
country     0
budget      0
gross       0
company     0
runtime     0
dtype: int64

0

In [69]:
# Distinct values found in the company column
Movies['company'].unique()

array(['Warner Bros.', 'Columbia Pictures', 'Lucasfilm', ...,
       'Dow Jazz Films', 'Embi Productions', 'PK 65 Films'], dtype=object)

In [66]:
# Rows whose country is missing.
# NOTE(review): the Completeness cell fills missing `country` values, so this
# only returns rows when it is run before that cell.
Movies[Movies['country'].isna()] 

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
5728,Saw: The Final Chapter,R,Crime,2010,"October 29, 2010 (United States)",5.6,93000.0,Kevin Greutert,Patrick Melton,Tobin Bell,,,,,
5730,The Wolfman,R,Drama,2010,"October 29, 2010 (United States)",5.8,104000.0,Joe Johnston,Andrew Kevin Walker,Benicio Del Toro,,,,,
7615,Clinton Road,Not Rated,Horror,2019,"June 14, 2019 (United States)",5.9,1900.0,Richard Grieco,Noel Ashman,Ace Young,,2500000.0,50400.0,Growing Tree Productions,77.0
