In [3]:
import os
from itertools import combinations

import pandas as pd
import sqlalchemy as db

# Database URL for the IMDb MySQL instance. Set the IMDB_DB_URL environment
# variable to supply the URL (and its credentials) from outside the notebook;
# the literal below is only the fallback used when the variable is unset.
connection_str = os.environ.get(
    'IMDB_DB_URL',
    'mysql+pymysql://root:admin@172.17.0.2:3306/imdb',
)
engine = db.create_engine(connection_str)
# Open connection reused by the pd.read_sql call further down.
conn = engine.connect()

In [4]:
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import plotly

# Make 'notebook' the default Plotly renderer for figures shown in later cells.
pio.renderers.default = 'notebook'

In [5]:
# Load title, id, genres, average rating and vote count for every title that
# has at least 1900 votes.
# NOTE(review): the WHERE clause filters on tr.numVotes, so titles with no
# matching title_ratings row (NULL numVotes) are excluded even though the join
# is a LEFT JOIN; the result is the same as an inner join.
query = '''
    SELECT tb.primaryTitle, tb.tconst, tb.genres, tr.averageRating, tr.numVotes
    FROM title_basics tb
    LEFT JOIN title_ratings tr ON tb.tconst = tr.tconst
    WHERE tr.numVotes >= 1900
'''
df = pd.read_sql(query, conn)

In [6]:
# Read the list file in one go and split it on newlines, giving one entry per
# line (a trailing newline in the file yields a final empty-string entry).
with open('../treated_datasets/most_successful_10perc.txt', mode='r') as f:
    file_text = f.read()
top_tconsts = file_text.split('\n')

In [7]:
# Number of rows loaded from the database, before any filtering.
df.shape[0]

4076

In [8]:
# Remove every title whose tconst appears in top_tconsts and keep the rest.
# Series.isin builds the membership mask for the whole column at once, instead
# of walking the rows with iterrows and scanning the list for each one.
is_top = df['tconst'].isin(top_tconsts)
df = df[~is_top]
# Rows remaining after the top titles were removed.
len(df)

3669

In [9]:
# Count how often each unordered pair of genres appears on the same title.
# Keys have the form "GenreA,GenreB" with the two names in sorted order, so a
# pair always maps to the same key regardless of its order in the genres field.
edges_dict = dict()

for raw_genres in df['genres']:
    # A missing genres value is not a string and has no pairs to contribute.
    if not isinstance(raw_genres, str):
        continue

    genres = sorted(raw_genres.split(','))

    # combinations() yields nothing for fewer than two genres; for two or
    # three genres it yields the pairs in the order (0,1), (0,2), (1,2).
    for first, second in combinations(genres, 2):
        key = ','.join([first, second])
        edges_dict[key] = edges_dict.get(key, 0) + 1

# Number of distinct genre pairs found.
len(edges_dict)


198

In [10]:
# Turn each "Source,Target" key of edges_dict into one edge row with its count.
# Rows are collected in a list and the frame is built once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
edge_rows = []
for key, value in edges_dict.items():
    g1, g2 = key.split(',')
    edge_rows.append({'Source': g1, 'Target': g2, 'Weight': value})

# Passing columns explicitly keeps the three headers even if there are no rows.
edges_df = pd.DataFrame(edge_rows, columns=['Source', 'Target', 'Weight'])

len(edges_df)

198

In [11]:
# Preview the first five edges.
edges_df.head(n=5)

Unnamed: 0,Source,Target,Weight
0,Action,Crime,161
1,Action,Drama,299
2,Crime,Drama,540
3,Action,Mystery,14
4,Drama,Mystery,316


In [12]:
# Label the index 'Id' so it is written as the Id column of the CSV.
edges_df.index.name = 'Id'
edges_df.to_csv(path_or_buf='../treated_datasets/genres_conns_edges_bottom90.csv')

In [14]:
# Box plot of the edge weights (co-occurrence counts per genre pair).
px.box(data_frame=edges_df, y='Weight')

In [22]:
# Keep the 15 heaviest edges, ordered from highest to lowest weight.
edges_by_weight = edges_df.sort_values(by='Weight', ascending=False)
edges_df_cut = edges_by_weight.head(15)
# Label the index 'Id' so the original edge ids keep that header.
edges_df_cut.index.name = 'Id'
edges_df_cut

Unnamed: 0_level_0,Source,Target,Weight
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,Crime,Drama,540
5,Comedy,Drama,496
11,Action,Adventure,337
4,Drama,Mystery,316
24,Action,Animation,305
1,Action,Drama,299
23,Adventure,Animation,292
10,Animation,Comedy,266
25,Drama,Romance,260
30,Crime,Mystery,208


In [17]:
# Write the reduced edge list next to the full one.
edges_df_cut.to_csv(path_or_buf='../treated_datasets/genres_conns_edges_bottom90_cut.csv')

In [11]:
# Among titles tagged 'Crime', count how many are also tagged 'Mystery'
# (connected) and how many are not (not_connected).
connected = 0
not_connected = 0

for raw_genres in df['genres']:
    # A missing genres value is not a string and cannot contain 'Crime'.
    if not isinstance(raw_genres, str):
        continue

    genres = raw_genres.split(',')
    if 'Crime' not in genres:
        continue

    if 'Mystery' in genres:
        connected += 1
    else:
        not_connected += 1

print(connected)
print(not_connected)

208
491
