In [92]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from collections import defaultdict
import random
import pydot
from io import StringIO
import pydotplus
import plotly.plotly as py
import plotly.graph_objs as go

In [93]:
df = pd.read_csv('movie_metadata.csv')

In [94]:
df.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [95]:
print(df.isnull().sum()) # columns with missing data

color                         19
director_name                104
num_critic_for_reviews        50
duration                      15
director_facebook_likes      104
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        884
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                153
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               303
budget                       492
title_year                   108
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 329
movie_facebook_likes           0
dtype: int64


In [96]:
def get_movie_class(row):
    if 8 <= row['imdb_score'] <= 10:
        row['Class'] = 'great'
    elif 7 <= row['imdb_score'] < 8:
        row['Class'] = 'good'
    elif 6 <= row['imdb_score'] < 7:
        row['Class'] = 'average'
    else:
        row['Class'] = 'bad'
    return row


In [97]:
def print_metrics(y_test, y_pred, threshold=0.5):
    print("Precision", metrics.precision_score(y_test, y_pred > threshold))
    print("Recall", metrics.recall_score(y_test, y_pred > threshold))
    print("F1", metrics.f1_score(y_test, y_pred > threshold))
    print("AUC", metrics.roc_auc_score(y_test, y_pred_lr))

In [98]:
def build_decision_tree(df):

    df = df.dropna()
    df = df.reset_index()
    df = df.apply(get_movie_class, axis=1)  # for each row
    df_before_split = df.copy()
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
    for train_index, test_index in split.split(df, df['Class']):
        train_set = df.loc[train_index]
        test_set = df.loc[test_index]
        Y_train = train_set.Class
    X_train = train_set[train_set.columns.drop('Class').drop('index')]
    Y_test = test_set.Class
    X_test = test_set[test_set.columns.drop('Class').drop('index')]
    
    print(X_train.columns)
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(X_train, Y_train)
    print('Accuracy', decision_tree.score(X_test, Y_test))
    # Draw graph
    dot_data = StringIO()
    export_graphviz(decision_tree, out_file=dot_data,
                    filled=True, rounded=True,
                    special_characters=True,impurity=False,
                    feature_names=train_set.columns.drop('Class').drop('index'))
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png("dtree.png")
    
    #print(feature_names)

In [99]:
def load_datas():
    df = pd.read_csv('movie_metadata.csv')
    df = df[['gross', 'imdb_score']].dropna()
    dataset = df.values.tolist()
    clusters = kMedoids(dataset, 5, np.inf, 0)

    for i in range(0, len(clusters.keys())):
        print("Cluster ", i, "= ", len(clusters.get(i)))

    build_decision_tree(df)

In [100]:
def kMedoids(data, k, prev_cost, count, clusters=None, medoids=None):

    cluster_sum = 0

    while True:

        if medoids is None or not medoids:
            medoids = random.sample(data, 5)
        else:
            random.shuffle(medoids)
            medoids.pop()
            medoids.pop()
            medoids.pop()
            medoids += random.sample(data, 3)

        clusters = defaultdict(list)

        for item in data:
            temp = []
            for i in range(0, len(medoids)):
                med = medoids[i]
                if med is None or not med:
                    break
                else:
                    temp.append(np.linalg.norm(
                        med[0]-item[0])+np.linalg.norm(med[1]-item[1]))
            min_index = np.argmin(temp)
            clusters[min_index].append(item)

        for i in range(0, len(medoids)):
            inter_cluster = clusters[i]
            for j in range(0, len(inter_cluster)):
                item_cluster = inter_cluster[j]
                medoid = medoids[i]
                cluster_sum += (np.linalg.norm(medoid[0]-item_cluster[0]) +
                                np.linalg.norm(medoid[1]-item_cluster[1]))

        if cluster_sum < prev_cost:
            prev_cost = cluster_sum
        else:
            break

        count += 1

    return clusters

In [101]:
def plot_graph(data):
    x = [1, 2, 3, 3, 45]
    y = [5, 6, 7, 8, 9]
    lb = 'cluster0'
    colors = np.random.rand(5)
    area = (30 * np.random.rand(5))**2  # 0 to 15 point radii
    plt.scatter(x, y, s=area, c=colors, alpha=0.5)
    # plt.scatter(x, y, label=lb, color='k', s=100)
    plt.xlabel('IMDb Scores')
    plt.ylabel('Gross')
    plt.title('K-medoid clusters')
    plt.legend()
    plt.show()

In [102]:
if __name__ == "__main__":
    load_datas()
    #plot_graph()

Cluster  0 =  1331
Cluster  1 =  599
Cluster  2 =  811
Cluster  3 =  1054
Cluster  4 =  364
Index(['gross', 'imdb_score'], dtype='object')
Accuracy 1.0
