In [3]:
import pandas as pd
import sqlalchemy as db

connection_str = f'mysql+pymysql://root:admin@172.17.0.2:3306/imdb'
engine = db.create_engine(connection_str)
conn = engine.connect()

import plotly.express as px
import plotly.io as pio

pio.renderers.default = 'notebook'

In [4]:
query = '''
    SELECT genres FROM title_basics
'''
df = pd.read_sql(query, conn)

In [5]:
df.head(10)

Unnamed: 0,genres
0,"Action,Adventure,Animation"
1,
2,"Adventure,Comedy,Crime"
3,"Action,Crime,Drama"
4,"Action,Crime,Drama"
5,"Comedy,Talk-Show"
6,Documentary
7,
8,"Comedy,Crime,Fantasy"
9,"Biography,Drama,History"


In [13]:
distinct_genres = []
for x in df['genres']:
    if x is not None:
        distinct_genres += x.split(',')
        
distinct_genres = set(distinct_genres)
print(len(distinct_genres))

27


In [14]:
distinct_genres

{'Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western'}

In [60]:
query = '''
    SELECT tb.tconst, tb.genres, tr.averageRating, tr.numVotes
    FROM title_basics tb
    LEFT JOIN title_ratings tr ON tb.tconst = tr.tconst
    WHERE tr.numVotes > 1500
'''
df = pd.read_sql(query, conn)

In [61]:
with open('../treated_datasets/most_successful_333.txt', 'r') as f:
    top_tconsts = f.read().split('\n')

In [62]:
df.head()

Unnamed: 0,tconst,genres,averageRating,numVotes
0,tt0160904,"Action,Crime,Drama",8.3,16555
1,tt0204993,"Action,Drama,Mystery",7.3,32959
2,tt0205700,"Comedy,Drama",7.6,5863
3,tt0206476,"Action,Fantasy,Sci-Fi",5.8,2152
4,tt0206511,"Comedy,Family",7.6,14725


In [64]:
genre_dict = dict()
for g in distinct_genres:
    genre_dict[g] = dict()
    
    genre_dict[g]['count'] = 0
    genre_dict[g]['sumNumVotes'] = 0
    genre_dict[g]['sumAvgRating'] = 0
    
def try_float(n):
    try:
        float(n)
        return True
    except:
        return False

for _, row in df.iterrows():
    #if row['tconst'] in top_tconsts and row['genres'] is not None:
    if row['genres'] is not None and try_float(row['numVotes']) and try_float(row['averageRating']):
        genres = row['genres'].split(',')
        for g in genres:
            genre_dict[g]['count'] += 1
            genre_dict[g]['sumNumVotes'] += row['numVotes']
            genre_dict[g]['sumAvgRating'] += row['averageRating']
        

In [65]:
plot_df = pd.DataFrame({
    'genre': list(distinct_genres),
    'count': [genre_dict[g]['count'] for g in distinct_genres],
    'sumNumVotes': [genre_dict[g]['sumNumVotes'] for g in distinct_genres],
    'sumAvgRating': [genre_dict[g]['sumAvgRating'] for g in distinct_genres],
})    
plot_df.head()

Unnamed: 0,genre,count,sumNumVotes,sumAvgRating
0,Sci-Fi,202,6774290,1401.5
1,Fantasy,336,9702418,2430.9
2,Talk-Show,82,574642,588.7
3,Comedy,1902,29380681,13881.6
4,Thriller,353,9245665,2585.9


In [66]:
plot_df.sort_values('count', ascending=False)

Unnamed: 0,genre,count,sumNumVotes,sumAvgRating
8,Drama,2458,66961195,18068.1
3,Comedy,1902,29380681,13881.6
21,Action,921,25834967,6707.0
9,Crime,905,27882121,6739.8
12,Animation,736,9572798,5394.4
15,Adventure,653,19129319,4709.0
7,Mystery,493,15539752,3626.0
13,Romance,491,8337167,3606.4
4,Thriller,353,9245665,2585.9
1,Fantasy,336,9702418,2430.9


In [67]:
plot_df['avgNumVotes'] = plot_df.apply(lambda x: x['sumNumVotes'] / x['count'] if x['count'] > 0 else 0, axis=1)
plot_df['avgAvgRating'] = plot_df.apply(lambda x: x['sumAvgRating'] / x['count'] if x['count'] > 0 else 0, axis=1)

plot_df.head()

Unnamed: 0,genre,count,sumNumVotes,sumAvgRating,avgNumVotes,avgAvgRating
0,Sci-Fi,202,6774290,1401.5,33536.089109,6.938119
1,Fantasy,336,9702418,2430.9,28876.244048,7.234821
2,Talk-Show,82,574642,588.7,7007.829268,7.179268
3,Comedy,1902,29380681,13881.6,15447.256046,7.298423
4,Thriller,353,9245665,2585.9,26191.685552,7.325496


In [68]:
plot_df = plot_df[plot_df['count'] > 0]

In [70]:
fig = px.scatter(
    plot_df, x='avgNumVotes', y='avgAvgRating', size='count',
    custom_data=['count', 'genre'],
    title='Gender grouping for > 1500 votes'
)
fig.update_traces(
    hovertemplate="<br>".join([
        "Count: %{customdata[0]}",
        "Genre: %{customdata[1]}",
    ])
)
fig.show()

top 333

In [71]:
genre_dict = dict()
for g in distinct_genres:
    genre_dict[g] = dict()
    
    genre_dict[g]['count'] = 0
    genre_dict[g]['sumNumVotes'] = 0
    genre_dict[g]['sumAvgRating'] = 0
    
def try_float(n):
    try:
        float(n)
        return True
    except:
        return False

for _, row in df.iterrows():
    if row['tconst'] in top_tconsts and row['genres'] is not None:
        genres = row['genres'].split(',')
        for g in genres:
            genre_dict[g]['count'] += 1
            genre_dict[g]['sumNumVotes'] += row['numVotes']
            genre_dict[g]['sumAvgRating'] += row['averageRating']
        

In [72]:
plot_df['avgNumVotes'] = plot_df.apply(lambda x: x['sumNumVotes'] / x['count'] if x['count'] > 0 else 0, axis=1)
plot_df['avgAvgRating'] = plot_df.apply(lambda x: x['sumAvgRating'] / x['count'] if x['count'] > 0 else 0, axis=1)

plot_df.head()

Unnamed: 0,genre,count,sumNumVotes,sumAvgRating,avgNumVotes,avgAvgRating
0,Sci-Fi,202,6774290,1401.5,33536.089109,6.938119
1,Fantasy,336,9702418,2430.9,28876.244048,7.234821
2,Talk-Show,82,574642,588.7,7007.829268,7.179268
3,Comedy,1902,29380681,13881.6,15447.256046,7.298423
4,Thriller,353,9245665,2585.9,26191.685552,7.325496


In [74]:
plot_df = plot_df[plot_df['count'] > 0]

fig = px.scatter(
    plot_df, x='avgNumVotes', y='avgAvgRating', size='count',
    custom_data=['count', 'genre'],
    title='Gender grouping for top 333 tv-series'
)
fig.update_traces(
    hovertemplate="<br>".join([
        "Count: %{customdata[0]}",
        "Genre: %{customdata[1]}",
    ])
)
fig.show()