In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval

In [2]:
# Read the movie metadata table from the project-relative data folder.
data = pd.read_csv(filepath_or_buffer="data/movies_metadata.csv")

In [3]:
# Keep English-language films only and narrow the table to the columns used
# downstream. NOTE: `original_language` is dropped by this selection, so
# re-running this cell without re-running the load cell raises KeyError.
data = data.loc[
    data['original_language'] == 'en',
    ['budget', 'genres', 'id', 'imdb_id', 'original_title', "title", 'popularity',
     'release_date', 'revenue', 'runtime', 'vote_average', 'vote_count'],
]

# Drop films whose raw genre string is the literal empty list. Rows whose
# `genres` value is missing pass this test and become empty lists below.
# .copy() makes `data` an independent frame, so the column assignment that
# follows cannot trigger pandas' SettingWithCopyWarning.
data = data[data['genres'] != "[]"].copy()

# Each raw cell is the string form of a list of dicts; parse it and keep only
# the value stored under each dict's 'name' key.
data['genres'] = (
    data['genres']
    .fillna('[]')
    .apply(literal_eval)
    .apply(lambda parsed: [genre['name'] for genre in parsed] if isinstance(parsed, list) else [])
)

# Drop every row holding a literal 0 in any column (presumably placeholders
# for an unknown budget/revenue/runtime -- confirm against the data source).
# Testing row-wise with axis=1 gives the same mask as transposing the frame
# and reducing column-wise, without building the transposed copy.
data = data[(data != 0).all(axis=1)]
data

Unnamed: 0,budget,genres,id,imdb_id,original_title,title,popularity,release_date,revenue,runtime,vote_average,vote_count
0,30000000.0,"[Animation, Comedy, Family]",862.0,tt0114709,Toy Story,Toy Story,21.946943,30/10/1995,373554033.0,81.0,7.7,5415.0
1,65000000.0,"[Adventure, Fantasy, Family]",8844.0,tt0113497,Jumanji,Jumanji,17.015539,15/12/1995,262797249.0,104.0,6.9,2413.0
3,16000000.0,"[Comedy, Drama, Romance]",31357.0,tt0114885,Waiting to Exhale,Waiting to Exhale,3.859495,22/12/1995,81452156.0,127.0,6.1,34.0
5,60000000.0,"[Action, Crime, Drama, Thriller]",949.0,tt0113277,Heat,Heat,17.924927,15/12/1995,187436818.0,170.0,7.7,1886.0
8,35000000.0,"[Action, Adventure, Thriller]",9091.0,tt0114576,Sudden Death,Sudden Death,5.231580,22/12/1995,64350171.0,106.0,5.5,174.0
...,...,...,...,...,...,...,...,...,...,...,...,...
45014,60000000.0,"[Action, Western, Science Fiction, Fantasy, Ho...",353491.0,tt1648190,The Dark Tower,The Dark Tower,50.903593,03/08/2017,71000000.0,95.0,5.7,688.0
45139,50000000.0,"[Comedy, Family, Animation]",378236.0,tt4877122,The Emoji Movie,The Emoji Movie,33.694599,28/07/2017,66913939.0,86.0,5.8,327.0
45167,11000000.0,"[Action, Crime, Mystery, Thriller]",395834.0,tt5362988,Wind River,Wind River,40.796775,03/08/2017,184770205.0,111.0,7.4,181.0
45409,800000.0,"[Comedy, Drama]",62757.0,tt0933361,Dikari,Savages,0.903061,23/11/2006,1328612.0,100.0,5.8,6.0


In [4]:
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot, plot
# Switch Plotly into offline notebook mode before the iplot() call further down.
# NOTE(review): connected=True is understood to load plotly.js from the CDN
# instead of embedding it in the notebook -- confirm against the plotly docs.
init_notebook_mode(connected=True)

In [5]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

In [6]:
# Unscaled numeric features, kept under `smaller_df` for later display.
smaller_df = data[['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count']].copy()

# Fit a min-max scaler on those features and transform them in one pass.
scalar = MinMaxScaler()
scaled = scalar.fit_transform(smaller_df)

# Wrap the scaled array in a DataFrame that reuses the original row index and
# column names so it lines up with `data` and `smaller_df`.
scaled_df = pd.DataFrame(scaled, index=smaller_df.index, columns=smaller_df.columns)

scaled_df.head()

Unnamed: 0,budget,popularity,revenue,runtime,vote_average,vote_count
0,0.078947,0.040087,0.133988,0.176282,0.850746,0.384681
1,0.171053,0.031079,0.094261,0.25,0.731343,0.17138
3,0.042105,0.007049,0.029216,0.323718,0.61194,0.002345
5,0.157895,0.03274,0.067231,0.461538,0.850746,0.133935
8,0.092105,0.009556,0.023081,0.25641,0.522388,0.012292


In [7]:
def apply_kmeans(df, clusters):
    """Cluster the rows of ``df`` with k-means and return a labelled copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to cluster. It is left unmodified. Columns named
        ``cluster_label`` or ``cluster_string`` (for example, left over from
        an earlier call) are excluded from the fit and replaced in the result.
    clusters : int
        Number of clusters, passed to ``KMeans(n_clusters=...)``.

    Returns
    -------
    pandas.DataFrame
        The feature columns of ``df`` plus ``cluster_label`` (the label KMeans
        assigned to each row) and ``cluster_string`` (the same label formatted
        as ``"c<label>"``).
    """
    # Fit on feature columns only: label columns from a previous call would
    # otherwise be fed back into KMeans, and the text column makes fit() raise.
    features = df.drop(columns=['cluster_label', 'cluster_string'], errors='ignore')

    kmeans = KMeans(n_clusters=clusters, random_state=0)
    cluster_labels = kmeans.fit(features).labels_
    string_labels = ["c{}".format(i) for i in cluster_labels]

    # assign() builds a new frame, so the caller's DataFrame is never mutated.
    return features.assign(cluster_label=cluster_labels, cluster_string=string_labels)

In [8]:
def param_tune(df, max_clusters=30, selected_clusters=9, show_plot=False):
    """Score k-means over a range of cluster counts and return the chosen count.

    For every k from 1 up to ``max_clusters`` a
    ``KMeans(n_clusters=k, random_state=0)`` model is fitted on ``df`` and its
    ``score(df)`` is recorded; these are the data points of an elbow plot.
    The cluster count is NOT derived from the scores: the function returns
    ``selected_clusters`` (default 9, the value that was previously
    hard-coded).

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to fit each candidate model on.
    max_clusters : int, default 30
        Largest cluster count to score. The sweep is capped at ``len(df)``
        because KMeans cannot form more clusters than there are rows.
    selected_clusters : int, default 9
        Cluster count to return.
    show_plot : bool, default False
        When True, draw the score-vs-cluster-count curve with Plotly.

    Returns
    -------
    int
        ``selected_clusters``.
    """
    cluster_counts = list(range(1, min(max_clusters, len(df)) + 1))
    scores_df = pd.DataFrame({
        'clusters': cluster_counts,
        'score': [
            KMeans(n_clusters=cluster_num, random_state=0).fit(df).score(df)
            for cluster_num in cluster_counts
        ],
    })

    if show_plot:
        fig = go.Figure(go.Scatter(
            x=scores_df['clusters'],
            y=scores_df['score']
        ))
        fig.update_layout(
            xaxis_title='Cluster',
            yaxis_title='Score',
            title='Elbow Method Results',
            height=800,
            width=800
        )
        fig.show()

    return selected_clusters

In [9]:
# Cluster on the scaled feature columns only. After a first run `scaled_df`
# also carries the label columns; dropping them here keeps the cell
# re-runnable, since the text column would otherwise make KMeans.fit() raise.
cluster_features = scaled_df.drop(columns=['cluster_label', 'cluster_string'], errors='ignore')

clusters = param_tune(cluster_features)
scaled_df = apply_kmeans(cluster_features, clusters)

In [10]:
# Attach the cluster labels and the descriptive columns to the unscaled
# features, matching rows on the shared index. Columns added by a previous run
# are dropped first; joining them a second time would raise on the
# overlapping column names.
smaller_df = (
    smaller_df
    .drop(columns=['cluster_label', 'cluster_string', 'title', 'genres'], errors='ignore')
    .join(scaled_df[['cluster_label', 'cluster_string']])
    .join(data[['title', 'genres']])
)

smaller_df.head()

Unnamed: 0,budget,popularity,revenue,runtime,vote_average,vote_count,cluster_label,cluster_string,title,genres
0,30000000.0,21.946943,373554033.0,81.0,7.7,5415.0,5,c5,Toy Story,"[Animation, Comedy, Family]"
1,65000000.0,17.015539,262797249.0,104.0,6.9,2413.0,5,c5,Jumanji,"[Adventure, Fantasy, Family]"
3,16000000.0,3.859495,81452156.0,127.0,6.1,34.0,8,c8,Waiting to Exhale,"[Comedy, Drama, Romance]"
5,60000000.0,17.924927,187436818.0,170.0,7.7,1886.0,7,c7,Heat,"[Action, Crime, Drama, Thriller]"
8,35000000.0,5.23158,64350171.0,106.0,5.5,174.0,3,c3,Sudden Death,"[Action, Adventure, Thriller]"


In [11]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style as style

In [12]:
# Pairwise scatter plots of the six numeric features, one colour per cluster;
# hovering over a point shows the film's title and genres.
fig = px.scatter_matrix(
    data_frame=smaller_df,
    dimensions=['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count'],
    hover_data=['title', 'genres'],
    color='cluster_string',
)

# Fixed canvas size and figure title.
fig.update_layout(width=800, height=1000, title='Cluster Scatter Matrix')

iplot(fig)


iteritems is deprecated and will be removed in a future version. Use .items instead.


iteritems is deprecated and will be removed in a future version. Use .items instead.


iteritems is deprecated and will be removed in a future version. Use .items instead.


iteritems is deprecated and will be removed in a future version. Use .items instead.


iteritems is deprecated and will be removed in a future version. Use .items instead.


iteritems is deprecated and will be removed in a future version. Use .items instead.


iteritems is deprecated and will be removed in a future version. Use .items instead.


iteritems is deprecated and will be removed in a future version. Use .items instead.


iteritems is deprecated and will be removed in a future version. Use .items instead.

