In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

%matplotlib inline

In [2]:
#Import of IMDB data scraped in previous notebook
df = pd.read_pickle("./shows.pkl")

In [3]:
df.columns

Index(['link', 'imdb_rating', 'title', 's01_episodes', 'avg_runtime', 'genres',
       'rel_date', 'certification', 'origin', 'company', 'creators',
       'creators_a', 'stars', 'stars_a'],
      dtype='object')

### data cleaning, processing, and initial exploration

GENRES

In [4]:
genres_l = set(df.genres.sum())

In [9]:
def m(x):
    if i in x:
        return 1
    else:
        return 0
    
for i in genres_l:
    df[i] = df.genres.apply(m)

In [10]:
df[['imdb_rating'] + list(genres_l)].corr().iloc[:,0]

imdb_rating    1.000000
Thriller       0.034059
Music          0.014045
Comedy         0.022244
Biography      0.067982
Adventure     -0.001700
Horror        -0.100276
Documentary    0.027008
Western        0.036265
Sci-Fi        -0.105571
Romance        0.045513
Short         -0.053239
Mystery       -0.006094
Action         0.000980
Family        -0.048417
Game-Show     -0.053298
News           0.069006
Talk-Show     -0.009340
War            0.014827
Crime          0.038813
History        0.051376
Drama          0.033611
Reality-TV    -0.082501
Animation      0.038486
Musical       -0.054243
Sport          0.071994
Fantasy        0.004190
Name: imdb_rating, dtype: float64

In [11]:
df.rename(columns={"Talk-Show": "Talk_Show", "Sci-Fi": "Sci_Fi", "Game-Show": "Game_Show", "Reality-TV":"Reality_TV"}, inplace=True)

In [12]:
df['genres_n'] = (df.genres.apply(lambda x: len(x)))

RELEASE DATE

In [13]:
#Create a column for the release month of the TV show
df['rel_month']= pd.DatetimeIndex(df['rel_date']).month 

CERTIFICATION

In [14]:
#after exploration looks like over 457 of the shows have no certification info, still will keep that column to anlyse shows that have certification info
df.loc[df.certification.apply(len) > 7, 'certification'] = "0"
df.loc[df.certification == '', 'certification'] = "0"

#Unification of the certification categories
df.certification.replace({'TV-Y7': 'TV-PG', 'TV-G': 'TV-PG', 'TV-Y': 'TV-PG', 'PG': 'TV-PG', 'PG-13': 'TV-14'}, inplace=True)

df.certification.value_counts()

0        456
TV-MA    446
TV-14    314
TV-PG    139
Name: certification, dtype: int64

In [15]:
df['TV_PG'] = (df['certification'] == 'TV-PG').apply(int)
df['TV_14'] = (df['certification'] == 'TV-14').apply(int)
df['TV_MA'] = (df['certification'] == 'TV-MA').apply(int)

ORIGIN

In [16]:
#I simplyfy that column to boolean 'made_in_usa'
df.origin.value_counts()

United States             517
India                     158
United Kingdom            133
Japan                      85
South Korea                83
Turkey                     63
Spain                      28
Sweden                     20
Canada                     19
Italy                      17
Australia                  17
China                      17
France                     17
Norway                     15
Thailand                   14
Iran                       14
Germany                    13
Serbia                     12
Mexico                     12
Denmark                    11
Brazil                     10
Pakistan                    7
Egypt                       5
Taiwan                      5
Ireland                     5
South Africa                4
Russia                      4
Israel                      4
Belgium                     3
New Zealand                 3
Bangladesh                  3
Colombia                    3
Poland                      3
Netherland

In [17]:
df['made_in_usa'] = (df.origin == 'United States').apply(int)

In [18]:
df[['imdb_rating', 'made_in_usa']].corr()

Unnamed: 0,imdb_rating,made_in_usa
imdb_rating,1.0,-0.169572
made_in_usa,-0.169572,1.0


### PRODUCING COMPANY

In [19]:
from collections import Counter
comp = pd.DataFrame(Counter(df.company.sum()), index=[0]).T
comp[0].sort_values(ascending=False).head(50)

Netflix                                   68
Amazon Studios                            31
Studio Dragon                             23
Universal Television                      21
CBS Television Studios                    20
3 Arts Entertainment                      20
ABC Signature                             17
ALTBalaji                                 17
Warner Bros. Television                   16
Netflix Studios                           15
Applause Entertainment Ltd.               15
Lionsgate Television                      14
British Broadcasting Corporation (BBC)    14
BBC Studios                               14
20th Television                           11
JTBC Studios                              11
Home Box Office (HBO)                     10
Sony Pictures Television                  10
MX Player                                 10
20th Century Fox Television                9
Ay Yapim                                   9
Berlanti Productions                       8
Paramount 

In [20]:
#Create list of big companies based on the number of shows produced in the observed time period (minimum of 10 shows)
company_l = ['Netflix', 'Netflix Studios', 'Amazon Studios', 'Studio Dragon', 'Universal Television', 'CBS Television Studios', '3 Arts Entertainment', 'ABC Signature', 'ALTBalaji', 'Warner Bros. Television', ' Applause Entertainment Ltd.', 'Lionsgate Television', 'British Broadcasting Corporation (BBC)', 'BBC Studios', '20th Television', 'JTBC Studios', 'Home Box Office (HBO)', 'Sony Pictures Television', 'MX Player', '20th Century Fox Television', 'Paramount Television', 'Fox Entertainment', 'Netflix Animation', 'Warner Bros. Animation']

def c(x):
    for i in company_l:
        if i in x:
            return 1
        else:
            return 0
    

df['big_c'] = df.company.apply(c)

df[df.big_c==1]

Unnamed: 0,link,imdb_rating,title,s01_episodes,avg_runtime,genres,rel_date,certification,origin,company,...,Musical,Sport,Fantasy,genres_n,rel_month,TV_PG,TV_14,TV_MA,made_in_usa,big_c
2,https://www.imdb.com/title/tt8115560,6.6,Tidying Up with Marie Kondo,8,40,[Reality-TV],2019-01-01,TV-PG,United States,"[Netflix, The Jackal Group]",...,0,0,0,1,1,1,0,0,1,1
51,https://www.imdb.com/title/tt1312171,8.0,The Umbrella Academy,10,60,"[Action, Adventure, Comedy]",2019-02-15,TV-14,United States,"[Dark Horse Entertainment, Netflix, Universal ...",...,0,0,0,3,2,0,1,0,1,1
86,https://www.imdb.com/title/tt9561862,8.4,"Love, Death & Robots",18,15,"[Animation, Short, Action]",2019-03-15,TV-MA,United States,"[Blur Studio, Netflix Studios, Netflix]",...,0,0,0,3,3,0,0,1,1,1
114,https://www.imdb.com/title/tt9184994,7.9,How to Sell Drugs Online (Fast),6,30,"[Comedy, Crime, Drama]",2019-05-31,TV-MA,Germany,"[Bildundtonfabrik, Netflix]",...,0,0,0,3,5,0,0,1,0,1
123,https://www.imdb.com/title/tt7087260,7.4,Tales of the City,10,60,[Drama],2019-06-07,TV-MA,United States,"[Netflix, Sweetpants Productions, Working Titl...",...,0,0,0,1,6,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,https://www.imdb.com/title/tt14163838,6.4,Feels Like Ishq,6,32,"[Comedy, Drama, Romance]",2021-07-23,TV-MA,India,"[Awesomeness TV, Mutant Films, Netflix]",...,0,0,0,3,7,0,0,1,0,1
1187,https://www.imdb.com/title/tt11834150,7.2,The Chair,6,30,"[Comedy, Drama]",2021-08-20,TV-MA,United States,"[BLB Media, Netflix Studios, Netflix]",...,0,0,0,2,8,0,0,1,1,1
1201,https://www.imdb.com/title/tt5540990,6.4,On the Verge,12,35,[Comedy],2021-09-06,TV-14,France,"[The Film TV, Canal+, Netflix]",...,0,0,0,1,9,0,1,0,0,1
1233,https://www.imdb.com/title/tt13278100,7.0,Braqueurs,6,44,"[Action, Crime, Drama]",2021-09-24,TV-MA,France,"[Labyrinthe Films, Netflix]",...,0,0,0,3,9,0,0,1,0,1


In [21]:
df['company_n'] = (df.company.apply(lambda x: len(x)))

AWARDS

In [22]:
#Calculating the number of awards for each creator and star, and summing them to create a new feature representing the total awards of the show's cast and crew
df['creators_a_n']= df.creators_a.apply(sum) 
df['stars_a_n']= df.stars_a.apply(sum)
df['awards_n'] = df['creators_a_n'] + df['stars_a_n']

df.head()

Unnamed: 0,link,imdb_rating,title,s01_episodes,avg_runtime,genres,rel_date,certification,origin,company,...,rel_month,TV_PG,TV_14,TV_MA,made_in_usa,big_c,company_n,creators_a_n,stars_a_n,awards_n
0,https://www.imdb.com/title/tt7670568,5.8,The Masked Singer,10,60,"[Game-Show, Music, Reality-TV]",2019-01-02,TV-PG,United States,"[Smart Dog Media, Fox Alternative Entertainmen...",...,1,1,0,0,1,0,3,0,9,9
1,https://www.imdb.com/title/tt8001250,6.2,Siempre Bruja,11,40,"[Drama, Fantasy]",2019-01-01,TV-14,Colombia,[Caracol],...,1,0,1,0,0,0,1,0,0,0
2,https://www.imdb.com/title/tt8115560,6.6,Tidying Up with Marie Kondo,8,40,[Reality-TV],2019-01-01,TV-PG,United States,"[Netflix, The Jackal Group]",...,1,1,0,0,1,1,2,1,1,2
3,https://www.imdb.com/title/tt8324422,8.1,PEN15,10,30,[Comedy],2019-02-08,TV-MA,United States,"[Awesomeness TV, Odenkirk Provissiero Entertai...",...,2,0,0,1,1,0,3,6,7,13
4,https://www.imdb.com/title/tt8888322,6.6,Bloom,6,60,"[Drama, Mystery, Sci-Fi]",2019-01-01,0,Australia,[Playmaker Media],...,1,0,0,0,0,0,1,5,12,17


In [23]:
df.corr().iloc[:1]

Unnamed: 0,imdb_rating,s01_episodes,avg_runtime,Thriller,Music,Comedy,Biography,Adventure,Horror,Documentary,...,rel_month,TV_PG,TV_14,TV_MA,made_in_usa,big_c,company_n,creators_a_n,stars_a_n,awards_n
imdb_rating,1.0,0.036943,0.076961,0.034059,0.014045,0.022244,0.067982,-0.0017,-0.100276,0.027008,...,0.028357,-0.105924,-0.025684,-0.031788,-0.169572,-0.034374,-0.038182,0.001824,0.058789,0.050013


In [24]:
#It does not seem interesting to explore particular names as in explored time period only 12 artists manageged to work in more than 3 shows
(pd.Series((df.creators+df.stars).sum()).value_counts() > 3).sum() 

12

In [25]:
df.columns

Index(['link', 'imdb_rating', 'title', 's01_episodes', 'avg_runtime', 'genres',
       'rel_date', 'certification', 'origin', 'company', 'creators',
       'creators_a', 'stars', 'stars_a', 'Thriller', 'Music', 'Comedy',
       'Biography', 'Adventure', 'Horror', 'Documentary', 'Western', 'Sci_Fi',
       'Romance', 'Short', 'Mystery', 'Action', 'Family', 'Game_Show', 'News',
       'Talk_Show', 'War', 'Crime', 'History', 'Drama', 'Reality_TV',
       'Animation', 'Musical', 'Sport', 'Fantasy', 'genres_n', 'rel_month',
       'TV_PG', 'TV_14', 'TV_MA', 'made_in_usa', 'big_c', 'company_n',
       'creators_a_n', 'stars_a_n', 'awards_n'],
      dtype='object')

### pickling shows list to save progress and to follow processing in another notebook

In [27]:
#df.to_pickle("shows2.pkl")