In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import ast
warnings.filterwarnings('ignore')
%matplotlib inline
sns.set()

## importing raw data

In [3]:
raw_data = pd.read_csv('movies.csv')

## Rename working data

In [4]:
data = raw_data.copy()
data.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,


## Checking data information

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MOVIES    9999 non-null   object 
 1   YEAR      9355 non-null   object 
 2   GENRE     9919 non-null   object 
 3   RATING    8179 non-null   float64
 4   ONE-LINE  9999 non-null   object 
 5   STARS     9999 non-null   object 
 6   VOTES     8179 non-null   object 
 7   RunTime   7041 non-null   float64
 8   Gross     460 non-null    object 
dtypes: float64(2), object(7)
memory usage: 703.2+ KB


In [6]:
data.describe()

Unnamed: 0,RATING,RunTime
count,8179.0,7041.0
mean,6.921176,68.688539
std,1.220232,47.258056
min,1.1,1.0
25%,6.2,36.0
50%,7.1,60.0
75%,7.8,95.0
max,9.9,853.0


## checking for null values

In [7]:
data.isnull().sum()

MOVIES         0
YEAR         644
GENRE         80
RATING      1820
ONE-LINE       0
STARS          0
VOTES       1820
RunTime     2958
Gross       9539
dtype: int64

# Cleaning data

In [8]:
data = data.drop_duplicates(subset=['MOVIES'])
data

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,
...,...,...,...,...,...,...,...,...,...
9935,The Imperfects,(2021– ),"\nAdventure, Drama, Fantasy",,\nAdd a Plot\n,\n \n Stars:\nMorgan Taylor Camp...,,,
9964,AlRawabi School for Girls,(2021),\nDrama,,\nAdd a Plot\n,\n Director:\nTima Shomali\n| \n Stars:\...,,,
9993,Totenfrau,(2022– ),"\nDrama, Thriller",,\nAdd a Plot\n,\n Director:\nNicolai Rohde\n| \n Stars:...,,,
9995,Arcane,(2021– ),"\nAnimation, Action, Adventure",,\nAdd a Plot\n,\n,,,


## Cleaning 'YEAR' column

In [9]:
data["YEAR"] = data["YEAR"].str.replace('[^a-zA-Z0-9]', '')
data["YEAR"]=data["YEAR"].apply(lambda x : str(x))
data["YEAR"]=data["YEAR"].apply(lambda x : (x[0:4] + ", " + x[4:8]) if len(x)>4 else x)
data.drop_duplicates()
data.dropna(subset = 'YEAR' )
data

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,2021,"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062,121.0,
1,Masters of the Universe: Revelation,2021,"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870,25.0,
2,The Walking Dead,"2010, 2022","\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805,44.0,
3,Rick and Morty,2013,"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849,23.0,
4,Army of Thieves,2021,"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,
...,...,...,...,...,...,...,...,...,...
9935,The Imperfects,2021,"\nAdventure, Drama, Fantasy",,\nAdd a Plot\n,\n \n Stars:\nMorgan Taylor Camp...,,,
9964,AlRawabi School for Girls,2021,\nDrama,,\nAdd a Plot\n,\n Director:\nTima Shomali\n| \n Stars:\...,,,
9993,Totenfrau,2022,"\nDrama, Thriller",,\nAdd a Plot\n,\n Director:\nNicolai Rohde\n| \n Stars:...,,,
9995,Arcane,2021,"\nAnimation, Action, Adventure",,\nAdd a Plot\n,\n,,,


## Cleaning '\n' from data

In [10]:
data['GENRE'] = data['GENRE'].str.strip('\n')
data['ONE-LINE'] = data['ONE-LINE'].str.strip('\n')
data

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,2021,"Action, Horror, Thriller",6.1,A woman with a mysterious illness is forced in...,\n Director:\nPeter Thorwarth\n| \n Star...,21062,121.0,
1,Masters of the Universe: Revelation,2021,"Animation, Action, Adventure",5.0,The war for Eternia begins again in what may b...,"\n \n Stars:\nChris Wood, \nSara...",17870,25.0,
2,The Walking Dead,"2010, 2022","Drama, Horror, Thriller",8.2,Sheriff Deputy Rick Grimes wakes up from a com...,"\n \n Stars:\nAndrew Lincoln, \n...",885805,44.0,
3,Rick and Morty,2013,"Animation, Adventure, Comedy",9.2,An animated series that follows the exploits o...,"\n \n Stars:\nJustin Roiland, \n...",414849,23.0,
4,Army of Thieves,2021,"Action, Crime, Horror",,"A prequel, set before the events of Army of th...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,
...,...,...,...,...,...,...,...,...,...
9935,The Imperfects,2021,"Adventure, Drama, Fantasy",,Add a Plot,\n \n Stars:\nMorgan Taylor Camp...,,,
9964,AlRawabi School for Girls,2021,Drama,,Add a Plot,\n Director:\nTima Shomali\n| \n Stars:\...,,,
9993,Totenfrau,2022,"Drama, Thriller",,Add a Plot,\n Director:\nNicolai Rohde\n| \n Stars:...,,,
9995,Arcane,2021,"Animation, Action, Adventure",,Add a Plot,\n,,,


In [11]:
data.isnull().sum()

MOVIES         0
YEAR           0
GENRE         71
RATING      1002
ONE-LINE       0
STARS          0
VOTES       1002
RunTime     1512
Gross       6365
dtype: int64

## Spliting stars column into directors and stars

In [12]:
data[['directors', 'stars']] = data['STARS'].str.split('Stars:', expand= True)
data.drop(columns='STARS', inplace=True)
data

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,VOTES,RunTime,Gross,directors,stars
0,Blood Red Sky,2021,"Action, Horror, Thriller",6.1,A woman with a mysterious illness is forced in...,21062,121.0,,\n Director:\nPeter Thorwarth\n| \n,"\nPeri Baumeister, \nCarl Anton Koch, \nAlexan..."
1,Masters of the Universe: Revelation,2021,"Animation, Action, Adventure",5.0,The war for Eternia begins again in what may b...,17870,25.0,,\n \n,"\nChris Wood, \nSarah Michelle Gellar, \nLena ..."
2,The Walking Dead,"2010, 2022","Drama, Horror, Thriller",8.2,Sheriff Deputy Rick Grimes wakes up from a com...,885805,44.0,,\n \n,"\nAndrew Lincoln, \nNorman Reedus, \nMelissa M..."
3,Rick and Morty,2013,"Animation, Adventure, Comedy",9.2,An animated series that follows the exploits o...,414849,23.0,,\n \n,"\nJustin Roiland, \nChris Parnell, \nSpencer G..."
4,Army of Thieves,2021,"Action, Crime, Horror",,"A prequel, set before the events of Army of th...",,,,\n Director:\nMatthias Schweighöfer\n| \n,"\nMatthias Schweighöfer, \nNathalie Emmanuel, ..."
...,...,...,...,...,...,...,...,...,...,...
9935,The Imperfects,2021,"Adventure, Drama, Fantasy",,Add a Plot,,,,\n \n,"\nMorgan Taylor Campbell, \nChris Cope, \nIñak..."
9964,AlRawabi School for Girls,2021,Drama,,Add a Plot,,,,\n Director:\nTima Shomali\n| \n,"\nSalsabiela A., \nJoanna Arida, \nYara Mustaf..."
9993,Totenfrau,2022,"Drama, Thriller",,Add a Plot,,,,\n Director:\nNicolai Rohde\n| \n,"\nFelix Klare, \nRomina Küper, \nAnna Maria Mü..."
9995,Arcane,2021,"Animation, Action, Adventure",,Add a Plot,,,,\n,


## Cleaning directors and star columns

In [13]:
data['directors'] = data['directors'].str.split(':', expand=True)
data.replace(r'\n', '', regex=True)
data["directors"] = data["directors"].str.replace('[^a-zA-Z0-9 ]','')
data["stars"] = data["stars"].str.replace('[^a-zA-Z0-9 ]','')
data

ValueError: Columns must be same length as key

## Movies with Gross data as mwg

In [None]:
mwg = data[data['Gross'].notnull()].reset_index(drop=True)
mwg['Gross'] = mwg['Gross'].str.replace('[^0-9]', '')
mwg

## Droping null values

In [None]:
data = data.dropna(subset = ['VOTES','RATING','GENRE'])

## Change Gross data type to float

In [None]:
mwg['Gross'] = mwg['Gross'].astype('float')
mwg.info()

In [None]:
genres = data["GENRE"].value_counts().head(10).plot(kind='bar')
plt.ylabel('Count')
plt.legend(labels ='Count')
plt.show()

In [14]:
data

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,VOTES,RunTime,Gross,directors,stars
0,Blood Red Sky,2021,"Action, Horror, Thriller",6.1,A woman with a mysterious illness is forced in...,21062,121.0,,\n Director:\nPeter Thorwarth\n| \n,"\nPeri Baumeister, \nCarl Anton Koch, \nAlexan..."
1,Masters of the Universe: Revelation,2021,"Animation, Action, Adventure",5.0,The war for Eternia begins again in what may b...,17870,25.0,,\n \n,"\nChris Wood, \nSarah Michelle Gellar, \nLena ..."
2,The Walking Dead,"2010, 2022","Drama, Horror, Thriller",8.2,Sheriff Deputy Rick Grimes wakes up from a com...,885805,44.0,,\n \n,"\nAndrew Lincoln, \nNorman Reedus, \nMelissa M..."
3,Rick and Morty,2013,"Animation, Adventure, Comedy",9.2,An animated series that follows the exploits o...,414849,23.0,,\n \n,"\nJustin Roiland, \nChris Parnell, \nSpencer G..."
4,Army of Thieves,2021,"Action, Crime, Horror",,"A prequel, set before the events of Army of th...",,,,\n Director:\nMatthias Schweighöfer\n| \n,"\nMatthias Schweighöfer, \nNathalie Emmanuel, ..."
...,...,...,...,...,...,...,...,...,...,...
9935,The Imperfects,2021,"Adventure, Drama, Fantasy",,Add a Plot,,,,\n \n,"\nMorgan Taylor Campbell, \nChris Cope, \nIñak..."
9964,AlRawabi School for Girls,2021,Drama,,Add a Plot,,,,\n Director:\nTima Shomali\n| \n,"\nSalsabiela A., \nJoanna Arida, \nYara Mustaf..."
9993,Totenfrau,2022,"Drama, Thriller",,Add a Plot,,,,\n Director:\nNicolai Rohde\n| \n,"\nFelix Klare, \nRomina Küper, \nAnna Maria Mü..."
9995,Arcane,2021,"Animation, Action, Adventure",,Add a Plot,,,,\n,
