In [37]:
import ast

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder

# from pandas_profiling import ProfileReport

# First steps

In [38]:
# Load the IMDb title dataset from CSV (path is relative to the notebook's working directory).
imdb_df = pd.read_csv('data/imdb_df_1949-12-31_2022-05-01_new.csv')

In [39]:
# Column dtypes and non-null counts of the raw frame.
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390545 entries, 0 to 390544
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        390545 non-null  int64  
 1   actors            390545 non-null  object 
 2   director          385359 non-null  object 
 3   duration          306624 non-null  float64
 4   genre             364402 non-null  object 
 5   imdb_rating       272038 non-null  float64
 6   link              390545 non-null  object 
 7   metascore         17566 non-null   float64
 8   release_date      390405 non-null  float64
 9   synopsis          390536 non-null  object 
 10  title             390544 non-null  object 
 11  votes             272038 non-null  float64
 12  page_url          390545 non-null  object 
 13  page_url_cleaned  390545 non-null  object 
dtypes: float64(5), int64(1), object(8)
memory usage: 41.7+ MB


In [40]:
# Number of distinct values in `page_url_cleaned` (presumably the search-result pages the rows came from — TODO confirm).
imdb_df.page_url_cleaned.nunique()

868

In [41]:
# Keep a single row per film, using the title's `link` URL as the identity key,
# and discard the `Unnamed: 0` column (presumably an index saved by an earlier export).
imdb_df = (
    imdb_df
    .drop_duplicates(subset=['link'], ignore_index=True)
    .drop(columns=['Unnamed: 0'])
)

In [42]:
# Rows and columns left after de-duplication.
imdb_df.shape

(312521, 13)

In [43]:
# creating variables release_start, release_end and release_month
# `release_date` is a float. Values above 3000 are treated as two concatenated
# four-digit years (presumably start and end year of a multi-year title — TODO
# confirm); anything else is read as a single year.
release_dates = imdb_df.release_date
# The first four characters are the start year in both layouts, so no branch is
# needed. A missing value renders as 'nan', which float() turns back into NaN.
imdb_df['release_start'] = [float(str(value)[:4]) for value in release_dates]
# NaN > 3000 is False, so missing dates also take the single-year branch.
imdb_df['release_end'] = [
    float(str(value)[4:8]) if value > 3000 else float(str(value)[:4])
    for value in release_dates
]
# First two of the last five characters of the page URL
# (assumes the URL ends in `MM-DD` — TODO confirm).
imdb_df['release_month'] = [int(page[-5:][:2]) for page in imdb_df['page_url_cleaned']]

In [44]:
# Preview the first rows, including the new release_* columns.
imdb_df.head()

Unnamed: 0,actors,director,duration,genre,imdb_rating,link,metascore,release_date,synopsis,title,votes,page_url,page_url_cleaned,release_start,release_end,release_month
0,"['John Dall', 'Peggy Cummins', 'Berry Kroeger'...",Joseph H. Lewis,87.0,"Crime, Drama, Film-Noir",7.6,https://www.imdb.com/title/tt0042530/,74.0,1950.0,Two disturbed young people release their fasci...,Gun Crazy,13192.0,https://www.imdb.com/search/title/?release_dat...,https://www.imdb.com/search/title/?release_dat...,1950.0,1950.0,1
1,"['Raoul Walsh', 'Errol Flynn', 'Alexis Smith',...",Ray Enright,76.0,Western,6.1,https://www.imdb.com/title/tt0042744/,,1950.0,An Australian sheepman comes to Montana lookin...,Montana,1215.0,https://www.imdb.com/search/title/?release_dat...,https://www.imdb.com/search/title/?release_dat...,1950.0,1950.0,1
2,"['Randolph Scott', 'Dorothy Malone', 'Forrest ...",Gordon Douglas,81.0,Western,6.3,https://www.imdb.com/title/tt0042782/,,1950.0,A mysterious stranger crosses paths with an ou...,The Nevadan,919.0,https://www.imdb.com/search/title/?release_dat...,https://www.imdb.com/search/title/?release_dat...,1950.0,1950.0,1
3,"['Gene Tierney', 'Richard Conte', 'José Ferrer...",Otto Preminger,98.0,"Crime, Drama, Film-Noir",6.7,https://www.imdb.com/title/tt0042039/,,1950.0,A woman suffering from kleptomania is hypnotiz...,Whirlpool,4206.0,https://www.imdb.com/search/title/?release_dat...,https://www.imdb.com/search/title/?release_dat...,1950.0,1950.0,1
4,"['Robert Preston', 'Robert Sterling', 'Chill W...",George Templeton,83.0,Western,5.6,https://www.imdb.com/title/tt0043013/,,1950.0,Brother is pitted against brother in this tale...,The Sundowners,480.0,https://www.imdb.com/search/title/?release_dat...,https://www.imdb.com/search/title/?release_dat...,1950.0,1950.0,1


In [45]:
# creating a variable indicating whether the title is a series, i.e. whether
# its release start and end years differ (1 = series, 0 = single release year).
# NaN never compares equal to NaN, so a bare equality test would flag every
# title with a missing release date as a series; require both years to be known.
has_both_years = imdb_df.release_start.notna() & imdb_df.release_end.notna()
imdb_df['tv_series'] = np.where(
    has_both_years & (imdb_df.release_start != imdb_df.release_end), 1, 0
)

In [46]:
# One cleaning pass:
#   1. drop titles without an IMDb rating,
#   2. drop release_date, release_end and metascore,
#   3. keep only durations of at most 2000. The comparison is False for NaN,
#      so titles with a missing duration are removed here as well.
imdb_df = (
    imdb_df
    .dropna(subset=['imdb_rating'])
    .drop(columns=['release_date', 'release_end', 'metascore'])
    .loc[lambda frame: frame.duration <= 2000]
    .reset_index(drop=True)
)

# Encoding

Here we encode the categorical variables: actors and directors are label-encoded, and genres are multi-hot encoded.

## Actors

In [47]:
print('Unique elements', (imdb_df['actors'].nunique()))
# Calculating the number of top cast
# Each `actors` cell holds the string form of a Python list. ast.literal_eval
# parses literals only, so text from the CSV cannot execute code the way
# eval() would.
n_actors = [len(ast.literal_eval(cast)) for cast in imdb_df['actors']]
imdb_df['n_actors'] = n_actors
px.histogram(imdb_df, 'n_actors')

Unique elements 192785


In [48]:
# For some films the actors' tag was not correctly identified (actors instead of the directors), so the number of actors is huge.
# Keep only films listing between 1 and 4 actors: this drops films without any actors and films with more than 4.
print(imdb_df.shape)
imdb_df = imdb_df[imdb_df.n_actors.between(1, 4)].reset_index(drop=True)
print(imdb_df.shape)

(200360, 15)
(183967, 15)


In [49]:
# Distribution of cast size after the filter above.
px.histogram(imdb_df, 'n_actors')

In [50]:
# creating an empty dataframe which will be filled with encoded variables;
# the following cells add columns to it one group at a time
imdb_clean = pd.DataFrame()

In [51]:
# Turn each stringified cast list into a real Python list. ast.literal_eval
# parses literals only, so text from the CSV cannot execute code the way
# eval() would. Values that are already lists pass through unchanged, which
# keeps the cell re-runnable.
imdb_df['actors'] = [
    cast if isinstance(cast, list) else ast.literal_eval(cast)
    for cast in imdb_df['actors']
]
# One column per list position; lists shorter than four are padded with None.
imdb_clean[['actor1', 'actor2', 'actor3', 'actor4']] = pd.DataFrame(imdb_df['actors'].tolist())

In [52]:
# Replace the names in every actor column with integer codes.
# A separate LabelEncoder is fitted for each column, so a code is only
# meaningful within its own column: the same name may get different integers
# in actor1 and actor2.
for actor_column in ('actor1', 'actor2', 'actor3', 'actor4'):
    labelencoder = LabelEncoder()
    encoded_names = labelencoder.fit_transform(imdb_clean[actor_column])
    imdb_clean[actor_column] = encoded_names

In [53]:
# Summary statistics of the encoded actor columns.
imdb_clean.describe()

Unnamed: 0,actor1,actor2,actor3,actor4
count,183967.0,183967.0,183967.0,183967.0
mean,41289.290264,47702.309615,52101.901102,53850.86497
std,23631.984866,27346.127508,29810.515792,28670.786719
min,0.0,0.0,0.0,0.0
25%,20821.0,24124.5,26433.5,28738.5
50%,41121.0,47461.0,51950.0,56863.0
75%,62034.0,71513.5,78088.5,85033.5
max,82207.0,93092.0,99348.0,87466.0


## Director

In [54]:
# Learn one integer code per distinct director value, then store the codes in
# a new `director_enc` column (the original names stay in imdb_df).
labelencoder = LabelEncoder().fit(imdb_df['director'])
imdb_clean['director_enc'] = labelencoder.transform(imdb_df['director'])

In [55]:
# Preview the encoded actor and director columns.
imdb_clean.head()

Unnamed: 0,actor1,actor2,actor3,actor4,director_enc
0,37241,68568,10147,59050,33767
1,63173,22342,29027,25837,23232
2,25854,72711,46756,13293,50165
3,65473,74085,15914,12858,22254
4,32322,21601,43406,69447,6356


## Genre

In [56]:
# Split the comma-separated genre string into a list of labels per film;
# films without a genre get the single placeholder label 'Unknown'.
genres = [['Unknown'] if pd.isna(i) else i.split(', ') for i in imdb_df['genre']]
# Fit first and only then read `classes_`. Doing both in one assignment works
# only because Python evaluates the right-hand side before the subscript target.
mlb = MultiLabelBinarizer()
genre_flags = mlb.fit_transform(genres)
genre_columns = list(mlb.classes_)
# One 0/1 indicator column per genre label.
imdb_clean[genre_columns] = pd.DataFrame(genre_flags, columns=genre_columns)

In [57]:
# Columns after adding the genre indicator columns.
imdb_clean.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director_enc', 'Action',
       'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show',
       'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
       'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show',
       'Thriller', 'Unknown', 'War', 'Western'],
      dtype='object')

# Creating final df

In [58]:
# Normalise every column label in both frames to lower case.
for frame in (imdb_df, imdb_clean):
    frame.columns = frame.columns.str.lower()

In [59]:
# Carry the un-encoded descriptive columns over so that they sit next to the
# encoded features in imdb_clean.
passthrough_columns = [
    'link', 'genre', 'duration', 'imdb_rating', 'votes',
    'release_start', 'release_month', 'tv_series',
    'title', 'synopsis', 'director', 'actors',
]
imdb_clean[passthrough_columns] = imdb_df[passthrough_columns]

In [64]:
# Final column list of the encoded frame.
imdb_clean.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director_enc', 'action',
       'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'game-show',
       'history', 'horror', 'music', 'musical', 'mystery', 'news',
       'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
       'thriller', 'unknown', 'war', 'western', 'link', 'genre', 'duration',
       'imdb_rating', 'votes', 'release_start', 'release_month', 'tv_series',
       'title', 'synopsis', 'director', 'actors'],
      dtype='object')

In [60]:
# Missing values per column in the encoded frame.
imdb_clean.isna().sum()

actor1              0
actor2              0
actor3              0
actor4              0
director_enc        0
action              0
adult               0
adventure           0
animation           0
biography           0
comedy              0
crime               0
documentary         0
drama               0
family              0
fantasy             0
film-noir           0
game-show           0
history             0
horror              0
music               0
musical             0
mystery             0
news                0
reality-tv          0
romance             0
sci-fi              0
short               0
sport               0
talk-show           0
thriller            0
unknown             0
war                 0
western             0
link                0
genre            2652
duration            0
imdb_rating         0
votes               0
release_start       8
release_month       0
tv_series           0
title               1
synopsis            5
director            0
actors    

In [61]:
# Preview the first rows of the encoded frame.
imdb_clean.head()

Unnamed: 0,actor1,actor2,actor3,actor4,director_enc,action,adult,adventure,animation,biography,...,duration,imdb_rating,votes,release_start,release_month,tv_series,title,synopsis,director,actors
0,37241,68568,10147,59050,33767,0,0,0,0,0,...,87.0,7.6,13192.0,1950.0,1,0,Gun Crazy,Two disturbed young people release their fasci...,Joseph H. Lewis,"[John Dall, Peggy Cummins, Berry Kroeger, Morr..."
1,63173,22342,29027,25837,23232,0,0,0,0,0,...,81.0,6.3,919.0,1950.0,1,0,The Nevadan,A mysterious stranger crosses paths with an ou...,Gordon Douglas,"[Randolph Scott, Dorothy Malone, Forrest Tucke..."
2,25854,72711,46756,13293,50165,0,0,0,0,0,...,98.0,6.7,4206.0,1950.0,1,0,Whirlpool,A woman suffering from kleptomania is hypnotiz...,Otto Preminger,"[Gene Tierney, Richard Conte, José Ferrer, Cha..."
3,65473,74085,15914,12858,22254,0,0,0,0,0,...,83.0,5.6,480.0,1950.0,1,0,The Sundowners,Brother is pitted against brother in this tale...,George Templeton,"[Robert Preston, Robert Sterling, Chill Wills,..."
4,32322,21601,43406,69447,6356,0,0,0,0,0,...,84.0,6.8,2113.0,1950.0,1,0,The Blue Lamp,The daily routine of two London Policemen is i...,Basil Dearden,"[Jack Warner, Dirk Bogarde, Jimmy Hanley, Robe..."


In [62]:
# The helper cast-size column was only needed for filtering; leave it out of the export.
imdb_df = imdb_df.drop(columns=['n_actors'])

In [63]:
# Write both frames to CSV without the index: the cleaned, un-encoded data and
# the encoded feature table.
imdb_df.to_csv('data/imdb_not_encoded.csv', index=False)
imdb_clean.to_csv('data/imdb_encoded.csv', index=False)