In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as pl

# Reviews Data

In [2]:
reviews = pd.read_csv('reviews.csv')
reviews

Unnamed: 0,3934,588.0,432.0,7.0999999,203461,46000
0,3405,285.0,267.0,6.4,149998,0
1,478,65.0,29.0,3.2,8465,491
2,74,83.0,25.0,7.6,7071,930
3,1254,1437.0,224.0,8.0,241030,13000
4,740,111.0,64.0,6.4,64742,0
...,...,...,...,...,...,...
4962,4801,2.0,6.0,7.0,75,121
4963,4264,514.0,488.0,7.0,181472,58000
4964,4356,85.0,119.0,6.2,29738,12000
4965,430,118.0,38.0,5.9,29591,0


In [3]:
reviews.columns = ['film_id','num_user','num_critic','imdb_score','num_votes','facebook_likes']

In [4]:
reviews.head()

Unnamed: 0,film_id,num_user,num_critic,imdb_score,num_votes,facebook_likes
0,3405,285.0,267.0,6.4,149998,0
1,478,65.0,29.0,3.2,8465,491
2,74,83.0,25.0,7.6,7071,930
3,1254,1437.0,224.0,8.0,241030,13000
4,740,111.0,64.0,6.4,64742,0


In [5]:
reviews.isna().sum()

film_id            0
num_user          19
num_critic        46
imdb_score         0
num_votes          0
facebook_likes     0
dtype: int64

In [6]:
reviews.dropna(subset=['num_user','num_critic'],axis=0,inplace=True)

In [7]:
reviews.isna().sum()

film_id           0
num_user          0
num_critic        0
imdb_score        0
num_votes         0
facebook_likes    0
dtype: int64

# Films Data

In [8]:
films = pd.read_csv('films_original.csv')
films.head()

Unnamed: 0,1,Intolerance: Love's Struggle Throughout the Ages,1916.0,USA,123.0,Unnamed: 5,Not Rated,Unnamed: 7,385907.0
0,2,Over the Hill to the Poorhouse,1920.0,USA,110.0,,,3000000.0,100000.0
1,3,The Big Parade,1925.0,USA,151.0,,Not Rated,,245000.0
2,4,Metropolis,1927.0,Germany,145.0,German,Not Rated,26435.0,6000000.0
3,5,Pandora's Box,1929.0,Germany,110.0,German,Not Rated,9950.0,
4,6,The Broadway Melody,1929.0,USA,100.0,English,Passed,2808000.0,379000.0


In [9]:
films.columns = ['id','title','release_year','country','duration','language','certification','gross','budget']

In [10]:
films.head()

Unnamed: 0,id,title,release_year,country,duration,language,certification,gross,budget
0,2,Over the Hill to the Poorhouse,1920.0,USA,110.0,,,3000000.0,100000.0
1,3,The Big Parade,1925.0,USA,151.0,,Not Rated,,245000.0
2,4,Metropolis,1927.0,Germany,145.0,German,Not Rated,26435.0,6000000.0
3,5,Pandora's Box,1929.0,Germany,110.0,German,Not Rated,9950.0,
4,6,The Broadway Melody,1929.0,USA,100.0,English,Passed,2808000.0,379000.0


In [11]:
films.loc[- 1] = ['1', "Intolerance: Love's Struggle Throughout the Ages",'1916','USA','123','NaN',
                 'Not Rated','NaN','385907'] # adding a row to the dataframe
films.index = films.index + 1  # shifting index
films.sort_index(inplace=True) 

In [12]:
films.head()

Unnamed: 0,id,title,release_year,country,duration,language,certification,gross,budget
0,1,Intolerance: Love's Struggle Throughout the Ages,1916.0,USA,123.0,,Not Rated,,385907.0
1,2,Over the Hill to the Poorhouse,1920.0,USA,110.0,,,3000000.0,100000.0
2,3,The Big Parade,1925.0,USA,151.0,,Not Rated,,245000.0
3,4,Metropolis,1927.0,Germany,145.0,German,Not Rated,26435.0,6000000.0
4,5,Pandora's Box,1929.0,Germany,110.0,German,Not Rated,9950.0,


In [15]:
films.to_csv('films_1.csv', index=False)

In [13]:
films.isna().sum()

id                 0
title              0
release_year      42
country            2
duration          13
language          12
certification    302
gross            809
budget           430
dtype: int64

In [14]:
films.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4968 entries, 0 to 4967
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             4968 non-null   object
 1   title          4968 non-null   object
 2   release_year   4926 non-null   object
 3   country        4966 non-null   object
 4   duration       4955 non-null   object
 5   language       4956 non-null   object
 6   certification  4666 non-null   object
 7   gross          4159 non-null   object
 8   budget         4538 non-null   object
dtypes: object(9)
memory usage: 388.1+ KB


In [15]:
films.dropna(subset=['release_year','country','duration','language','certification','gross','budget'],axis=0, inplace=True)

In [16]:
films.isna().sum()

id               0
title            0
release_year     0
country          0
duration         0
language         0
certification    0
gross            0
budget           0
dtype: int64

In [38]:
#dropping NaN Values before changing data types

In [18]:
films[['release_year','duration','budget']] = films[['release_year','duration','budget']].astype(int)

In [19]:
films.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3839 entries, 0 to 4925
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             3839 non-null   object
 1   title          3839 non-null   object
 2   release_year   3839 non-null   int64 
 3   country        3839 non-null   object
 4   duration       3839 non-null   int64 
 5   language       3839 non-null   object
 6   certification  3839 non-null   object
 7   gross          3839 non-null   object
 8   budget         3839 non-null   int64 
dtypes: int64(3), object(6)
memory usage: 299.9+ KB


# People Data

In [28]:
people = pd.read_csv('people_original.csv')
people

Unnamed: 0,1,50 Cent,1975-07-06,Unnamed: 3
0,2,A. Michael Baldwin,1963-04-04,
1,3,A. Raven Cruz,,
2,4,A.J. Buckley,1978-02-09,
3,5,A.J. DeLucia,,
4,6,A.J. Langer,1974-05-22,
...,...,...,...,...
8391,8393,Zohra Segal,1912-04-27,2014-07-10
8392,8394,Zooey Deschanel,1980-01-17,
8393,8395,Zoran Lisinac,,
8394,8396,Zubaida Sahar,,


In [29]:
people.columns = ['id','name','birthdate','deathdate']

In [30]:
people.head()

Unnamed: 0,id,name,birthdate,deathdate
0,2,A. Michael Baldwin,1963-04-04,
1,3,A. Raven Cruz,,
2,4,A.J. Buckley,1978-02-09,
3,5,A.J. DeLucia,,
4,6,A.J. Langer,1974-05-22,


In [31]:
people.loc[-1] = ['1','50 Cent','1975-07-06','']
people.index = people.index + 1
people.sort_index(inplace=True)

In [32]:
people.head()

Unnamed: 0,id,name,birthdate,deathdate
0,1,50 Cent,1975-07-06,
1,2,A. Michael Baldwin,1963-04-04,
2,3,A. Raven Cruz,,
3,4,A.J. Buckley,1978-02-09,
4,5,A.J. DeLucia,,


In [33]:
people.isna().sum()

id              0
name            0
birthdate    2245
deathdate    7609
dtype: int64

In [34]:
people.to_csv('people_1_copy.csv', index=False)

In [35]:
people.dropna(subset=['birthdate','deathdate'],inplace=True)

In [36]:
people.isna().sum()

id           0
name         0
birthdate    0
deathdate    0
dtype: int64

In [37]:
people.head()

Unnamed: 0,id,name,birthdate,deathdate
0,1,50 Cent,1975-07-06,
6,7,Aaliyah,1979-01-16,2001-08-25
60,61,Adolphe Menjou,1890-02-18,1963-10-29
62,63,Adrian Gonzalez,1938-05-08,1998-10-23
70,71,Adriana Caselotti,1916-05-16,1997-01-19


In [38]:
people.to_csv('people_cleaned.csv', index=False)

# Roles Data

In [30]:
roles = pd.read_csv('roles.csv')
roles

Unnamed: 0,1,1.1,1630,director
0,2,1,4843,actor
1,3,1,5050,actor
2,4,1,8175,actor
3,5,2,3000,director
4,6,2,4019,actor
...,...,...,...,...
19785,19787,4966,6623,actor
19786,19788,4967,3240,actor
19787,19789,4967,4524,actor
19788,19790,4967,7886,actor


In [31]:
roles.columns = ['id','film_id','person_id','role']

In [32]:
roles

Unnamed: 0,id,film_id,person_id,role
0,2,1,4843,actor
1,3,1,5050,actor
2,4,1,8175,actor
3,5,2,3000,director
4,6,2,4019,actor
...,...,...,...,...
19785,19787,4966,6623,actor
19786,19788,4967,3240,actor
19787,19789,4967,4524,actor
19788,19790,4967,7886,actor


# Deleting columns

- del df['column_name']

- The best way to do this in Pandas is to use drop:\
df = df.drop('column_name', axis=1)\
where 1 is the axis number (0 for rows and 1 for columns.)

- Or, the drop() method accepts index/columns keywords as an alternative to specifying the axis. So we can now just do:\
df = df.drop(columns=['column_nameA', 'column_nameB'])

- To delete the column without having to reassign df you can do:\
df.drop('column_name', axis=1, inplace=True)

- Finally, to drop by column number instead of by column label, try this to delete, e.g. the 1st, 2nd and 4th columns:\
df = df.drop(df.columns[[0, 1, 3]], axis=1)  # df.columns is zero-based pd.Index


- Also working with "text" syntax for the columns:\
df.drop(['column_nameA', 'column_nameB'], axis=1, inplace=True)

- Use:\
columns = ['Col1', 'Col2', ...]\
df.drop(columns, inplace=True, axis=1)

# Drop by index
Delete first, second and fourth columns:\
df.drop(df.columns[[0,1,3]], axis=1, inplace=True)\
Delete first column:

df.drop(df.columns[[0]], axis=1, inplace=True)\
There is an optional parameter inplace so that the original data can be modified without creating a copy.