In [69]:
# import pandas library
import pandas as pd

In [70]:
# load the semicolon-separated IMDB export into a DataFrame
df = pd.read_csv('IMDB.csv', sep=';')

Step 1 - Understanding the data

In [71]:
# shape of the loaded DataFrame
# returns a (number of rows, number of columns) tuple
df.shape

(101, 12)

In [72]:
# preview the first 5 rows of the dataset
df.head(n=5)

Unnamed: 0,IMBD title ID,Original title,Release year,Genre,Duration,Country,Content Rating,Director,Unnamed: 8,Income,Votes,Score
0,tt0111161,The Shawshank Redemption,1995-02-10,Drama,142.0,USA,R,Frank Darabont,,$ 28815245,2.278.845,9.3
1,tt0068646,The Godfather,09 21 1972,"Crime, Drama",175.0,USA,R,Francis Ford Coppola,,$ 246120974,1.572.674,9.2
2,tt0468569,The Dark Knight,23 -07-2008,"Action, Crime, Drama",152.0,US,PG-13,Christopher Nolan,,$ 1005455211,2.241.615,9.
3,tt0071562,The Godfather: Part II,1975-09-25,"Crime, Drama",220.0,USA,R,Francis Ford Coppola,,"$ 4o8,035,783",1.098.714,9.0
4,tt0110912,Pulp Fiction,1994-10-28,"Crime, Drama",,USA,R,Quentin Tarantino,,$ 222831817,1.780.147,"8,9f"


In [73]:
# list the column labels of the DataFrame
df.columns

Index(['IMBD title ID', 'Original title', 'Release year', 'Genre', 'Duration',
       'Country', 'Content Rating', 'Director', 'Unnamed: 8', 'Income',
       ' Votes ', 'Score'],
      dtype='object')

In [74]:
# list the data type of each column as inferred when the CSV was read
df.dtypes

IMBD title ID      object
Original title     object
Release year       object
Genre              object
Duration           object
Country            object
Content Rating     object
Director           object
Unnamed: 8        float64
Income             object
 Votes             object
Score              object
dtype: object

Step 2 - Transform the data

In [75]:
# show the DataFrame as loaded, before any of the cleaning steps in this section
df

Unnamed: 0,IMBD title ID,Original title,Release year,Genre,Duration,Country,Content Rating,Director,Unnamed: 8,Income,Votes,Score
0,tt0111161,The Shawshank Redemption,1995-02-10,Drama,142,USA,R,Frank Darabont,,$ 28815245,2.278.845,9.3
1,tt0068646,The Godfather,09 21 1972,"Crime, Drama",175,USA,R,Francis Ford Coppola,,$ 246120974,1.572.674,9.2
2,tt0468569,The Dark Knight,23 -07-2008,"Action, Crime, Drama",152,US,PG-13,Christopher Nolan,,$ 1005455211,2.241.615,9.
3,tt0071562,The Godfather: Part II,1975-09-25,"Crime, Drama",220,USA,R,Francis Ford Coppola,,"$ 4o8,035,783",1.098.714,9.0
4,tt0110912,Pulp Fiction,1994-10-28,"Crime, Drama",,USA,R,Quentin Tarantino,,$ 222831817,1.780.147,"8,9f"
...,...,...,...,...,...,...,...,...,...,...,...,...
96,tt0070735,The Sting,1974-03-21,"Comedy, Crime, Drama",129,USA,PG,George Roy Hill,,$ 156000000,236.285,7.5
97,tt0082096,Das Boot,1982-03-18,"Adventure, Drama, Thriller",149,West Germany,R,Wolfgang Petersen,,$ 11487676,226.427,7.5
98,tt0059578,Per qualche dollaro in più,1965-12-20,Western,132,Italy,,Sergio Leone,,$ 15000000,226.039,7.4
99,tt1832382,Jodaeiye Nader az Simin,2011-10-21,Drama,123,Iran,PG-13,Asghar Farhadi,,$ 22926076,214.165,7.4


In [76]:
# count the null (NaN) values in each column
df.isna().sum()

IMBD title ID       1
Original title      1
Release year        1
Genre               1
Duration            2
Country             1
Content Rating     24
Director            1
Unnamed: 8        101
Income              1
 Votes              1
Score               1
dtype: int64

In [77]:
# remove every column in which all values are null
df = df.dropna(axis='columns', how='all')
# list the column labels that remain after the drop
df.columns

Index(['IMBD title ID', 'Original title', 'Release year', 'Genre', 'Duration',
       'Country', 'Content Rating', 'Director', 'Income', ' Votes ', 'Score'],
      dtype='object')

In [78]:
# remove every row in which all values are null
df = df.dropna(axis='index', how='all')

In [79]:
# recount nulls per column after dropping the all-null column and all-null rows
df.isna().sum()

IMBD title ID      0
Original title     0
Release year       0
Genre              0
Duration           1
Country            0
Content Rating    23
Director           0
Income             0
 Votes             0
Score              0
dtype: int64

In [80]:
# select the rows whose 'Duration' is a true missing value (NaN)
# NOTE(review): isna() does not flag placeholder strings; the df.head(10) output
# further down shows 'Nan' and 'Inf' in this column, which need separate handling
null_duration_rows = df[df['Duration'].isna()]

# index label(s) of the rows where 'Duration' is null; left as the cell's last
# expression so the notebook displays it without print()
null_duration_rows.index

Index([14], dtype='int64')


In [81]:
# show the full record at index label 14, the row reported above as missing 'Duration'
df.loc[14]

IMBD title ID                           tt0133093
Original title                         The Matrix
Release year                           1999-05-07
Genre                              Action, Sci-Fi
Duration                                      NaN
Country                                       USA
Content Rating                                  R
Director          Lana Wachowski, Lilly Wachowski
Income                                $ 465718588
 Votes                                  1.632.315
Score                                       ++8.7
Name: 14, dtype: object

In [82]:
# manually insert the duration value (looked up on the IMDb website)
# NOTE(review): df.dtypes lists 'Duration' as object, so this writes an int into an
# object-dtype column — confirm any later cleaning of this column handles mixed types
df.loc[14,'Duration'] = 136
# re-display the row to confirm the value was written
df.loc[14]

IMBD title ID                           tt0133093
Original title                         The Matrix
Release year                           1999-05-07
Genre                              Action, Sci-Fi
Duration                                      136
Country                                       USA
Content Rating                                  R
Director          Lana Wachowski, Lilly Wachowski
Income                                $ 465718588
 Votes                                  1.632.315
Score                                       ++8.7
Name: 14, dtype: object

In [83]:
# confirm that 'Duration' no longer has null values after the manual fill
df.isna().sum()

IMBD title ID      0
Original title     0
Release year       0
Genre              0
Duration           0
Country            0
Content Rating    23
Director           0
Income             0
 Votes             0
Score              0
dtype: int64

In [84]:
# preview the first 10 rows after the null handling above
df.head(10)

Unnamed: 0,IMBD title ID,Original title,Release year,Genre,Duration,Country,Content Rating,Director,Income,Votes,Score
0,tt0111161,The Shawshank Redemption,1995-02-10,Drama,142,USA,R,Frank Darabont,$ 28815245,2.278.845,9.3
1,tt0068646,The Godfather,09 21 1972,"Crime, Drama",175,USA,R,Francis Ford Coppola,$ 246120974,1.572.674,9.2
2,tt0468569,The Dark Knight,23 -07-2008,"Action, Crime, Drama",152,US,PG-13,Christopher Nolan,$ 1005455211,2.241.615,9.
3,tt0071562,The Godfather: Part II,1975-09-25,"Crime, Drama",220,USA,R,Francis Ford Coppola,"$ 4o8,035,783",1.098.714,9.0
4,tt0110912,Pulp Fiction,1994-10-28,"Crime, Drama",,USA,R,Quentin Tarantino,$ 222831817,1.780.147,"8,9f"
5,tt0167260,The Lord of the Rings: The Return of the King,22 Feb 04,"Action, Adventure, Drama",201,New Zealand,PG-13,Peter Jackson,$ 1142271098,1.604.280,08.9
6,tt0108052,Schindler's List,1994-03-11,"Biography, Drama, History",Nan,USA,R,Steven Spielberg,$ 322287794,1.183.248,8.9
7,tt0050083,12 Angry Men,1957-09-04,"Crime, Drama",96,USA,Not Rated,Sidney Lumet,$ 576,668.473,8.9
8,tt1375666,Inception,2010-09-24,"Action, Adventure, Sci-Fi",148,USA,PG-13,Christopher Nolan,$ 869784991,2.002.816,8..8
9,tt0137523,Fight Club,10-29-99,Drama,Inf,UK,R,David Fincher,$ 101218804,1.807.440,8.8


In [85]:
# flag every row whose 'Original title' occurs more than once
# (keep=False marks all occurrences, not only the repeats)
duplicated_values = df['Original title'].duplicated(keep=False)

# display the rows flagged as having a repeated 'Original title'
df.loc[duplicated_values]

Unnamed: 0,IMBD title ID,Original title,Release year,Genre,Duration,Country,Content Rating,Director,Income,Votes,Score
