In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MinMaxScaler

from sklearn.cluster import Birch
from sklearn.metrics import silhouette_score

import numpy as np
import random

import seaborn as sns
import matplotlib.pyplot as plt
import time

import warnings
warnings.filterwarnings("error")

from multiprocessing import Pool

In [2]:
# Two alternative feature tables for the movies; the file names suggest
# TF-IDF-based vs. embedding-based features (TODO confirm against the notebook
# that produced them). Both are later reduced to scaled numeric columns.
movies_TFIDF = pd.read_csv('3a movies_TFIDF.csv')
movies_embedded = pd.read_csv('3b movies_Embedded.csv')
# Lookup table; its 'Code' column is what get_country_codes() matches against.
country_codes = pd.read_csv('country_codes.csv')

In [3]:
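## One-off cleanup, kept for reference (presumably already applied to the CSV files, hence commented out): drop the remaining NaN values and overwrite the CSV files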
# movies_TFIDF.dropna(inplace = True)
# movies_TFIDF.to_csv('3a movies_TFIDF.csv', index=False)
# movies_embedded.dropna(inplace = True)
# movies_embedded.to_csv('3b movies_Embedded.csv', index=False)

In [4]:
def get_numbers(str):
    """Return the integer numbers from a string in an array format.

    The characters ``, { } [ ]`` are removed (not replaced by a space, so
    ``"1,000"`` is read as ``1000``), the remainder is split on whitespace and
    every token consisting solely of decimal digits is converted with ``int``.
    All other tokens are ignored.

    Parameters
    ----------
    str : str
        Text such as ``"[1, 2, 3]"``. The parameter name shadows the built-in
        ``str`` inside this function; it is kept for backward compatibility.

    Returns
    -------
    list of int
        The numbers in order of appearance; empty if none are found.
    """
    text = str
    for symbol in ',{}[]':
        text = text.replace(symbol, '')
    # isdecimal() rather than isdigit(): isdigit() is also true for characters
    # such as superscripts ('²') that int() cannot parse and would raise on.
    return [int(token) for token in text.split() if token.isdecimal()]

def get_country_codes(str):
    """Return the codes of the countries from a string.

    The characters ``, { } [ ] '`` are removed, the remainder is split on
    whitespace, and only tokens that are exactly equal to an entry of the
    module-level ``country_codes['Code']`` column are kept.

    Parameters
    ----------
    str : str
        Text containing country codes. The parameter name shadows the built-in
        ``str`` inside this function; it is kept for backward compatibility.

    Returns
    -------
    list of str
        Matching codes in order of appearance (duplicates are preserved).
    """
    text = str
    for symbol in ",{}[]'":
        text = text.replace(symbol, '')

    # Exact membership instead of Series.str.contains(): contains() treats the
    # token as a regular expression and matches substrings, so partial tokens
    # were accepted and tokens with regex metacharacters could raise (or warn,
    # which is fatal when warnings are turned into errors).
    known_codes = set(country_codes['Code'])
    return [token for token in text.split() if token in known_codes]

def prepare_dataframe(movies):
    """Return the min-max scaled numeric features of ``movies``.

    The identifier columns (``id``, ``title``) and the categorical columns
    (``adult``, ``genres``, ``original_language``, ``production_companies``,
    ``production_countries``) are removed; every remaining column is scaled
    with a default ``MinMaxScaler``. The input frame is not modified.

    Parameters
    ----------
    movies : pandas.DataFrame
        Frame that contains all of the columns listed above.

    Returns
    -------
    pandas.DataFrame
        Scaled values with default integer row and column labels (the original
        column names are not carried over).
    """
    identifier_cols = ['id', 'title']
    # Categorical values are not used, because of the algorithm
    categorical_cols = ['adult', 'genres', 'original_language',
                        'production_companies', 'production_countries']

    features = movies.drop(columns=identifier_cols)
    features = features.drop(columns=categorical_cols)

    # Default feature range of the scaler: each column ends up between 0 and 1
    scaled = MinMaxScaler().fit_transform(features)
    return pd.DataFrame(scaled)

def birch_optim(thres, branch, n_clusters=35):
    """Fit Birch on ``movies_TFIDF`` and return the silhouette score.

    Parameters
    ----------
    thres : float
        Passed to ``Birch`` as ``threshold``.
    branch : int
        Passed to ``Birch`` as ``branching_factor``.
    n_clusters : int, default 35
        Passed to ``Birch`` as ``n_clusters``.

    Returns
    -------
    float
        Silhouette score of the labels predicted for the module-level
        ``movies_TFIDF``, or ``-2`` if fitting or scoring raised. Silhouette
        scores are bounded below by -1, so ``-2`` cannot collide with a real
        score and callers can use it as an error sentinel.
    """
    try:
        model = Birch(threshold=thres, branching_factor=branch, n_clusters=n_clusters)
        model.fit(movies_TFIDF)
        return silhouette_score(movies_TFIDF, model.predict(movies_TFIDF))
    except Exception:
        # Deliberately broad: the notebook turns warnings into errors
        # (warnings.filterwarnings("error")), so any warning raised while
        # fitting or scoring also lands here. `Exception` instead of a bare
        # `except` lets KeyboardInterrupt / SystemExit propagate, so a long
        # search can still be interrupted.
        return -2

In [5]:
# Replace the raw tables with their scaled numeric feature matrices.
# NOTE: the names are rebound in place, so this cell cannot be run twice in a
# row: prepare_dataframe() drops 'id' and 'title', which its own output no
# longer contains. Re-run the loading cell first if this cell must be repeated.
movies_TFIDF = prepare_dataframe(movies_TFIDF)
movies_embedded = prepare_dataframe(movies_embedded)

The tests below use the silhouette score to compare clustering quality:

In [5]:
# 1. test: TF-IDF vs Embedded Layers and measure runtime
thres = 0.25
score_columns = ['TF-IDF', 'TF-IDF runtime',
                 'Embedded Layers', 'Embedded Layers runtime']

# Birch: only the fit is timed, not the silhouette computation
start1 = time.time()
birch_TFIDF = Birch(threshold=thres)
birch_TFIDF.fit(movies_TFIDF)
elapsed1 = time.time() - start1

start2 = time.time()
birch_embedded = Birch(threshold=thres)
birch_embedded.fit(movies_embedded)
elapsed2 = time.time() - start2

tfscore = silhouette_score(movies_TFIDF, birch_TFIDF.predict(movies_TFIDF))
emscore = silhouette_score(movies_embedded, birch_embedded.predict(movies_embedded))

# Build the one-row result table directly. Concatenating onto an empty
# DataFrame is deprecated in recent pandas versions and emits a FutureWarning,
# which this notebook's warnings.filterwarnings("error") turns into an exception.
scores = pd.DataFrame([[tfscore, elapsed1, emscore, elapsed2]],
                      columns=score_columns)
df1 = scores.copy()

In [6]:
# Renumber the rows from 0 and discard the old index
df1 = df1.reset_index(drop=True)

In [7]:
# Display the comparison table: silhouette score and Birch fit time (seconds) per feature set
df1

Unnamed: 0,TF-IDF,TF-IDF runtime,Embedded Layers,Embedded Layers runtime
0,0.314061,4.465124,0.054603,75.022683


In [7]:
# 2. test: Optimizing birch clustering parameters with hill-climbing method from different starting points
#
# Each iteration scores the current (threshold, branching factor) point and its
# four axis neighbours via birch_optim(), then moves to the best neighbour.
# A run stops when the current point is at least as good as every neighbour.
# birch_optim() returns -2 on failure; the same value marks neighbours that
# fall outside the search grid.
# NOTE(review): the starting points are drawn without seeding `random`, so the
# runs are not reproducible between executions.

# Initialize variables
silscore = np.zeros(5) # scores of [current, branch up, thres up, branch down, thres down]
e = 0.05 # epsilon
min_thres = 0.05 # lowest threshold the search is allowed to visit
branchlistlength = 60
branchlist = np.linspace(2, 500, branchlistlength).round().astype(int)

start = time.time()

for run in range(5):
    iteration = -1
    # Random point
    thres = random.randint(2, 10)/20 # 0.1, 0.15, 0.20 ... 0.45, 0.5
    # Track the position inside branchlist instead of looking it up again
    # with np.where() for every neighbour.
    branch_idx = random.randrange(branchlistlength)
    branch = branchlist[branch_idx]

    while True:
        iteration += 1

        # Rounding removes floating-point drift (e.g. 0.15 - 0.05 ->
        # 0.09999999999999999), which would otherwise make the lower-bound
        # check below reject the legitimate 0.05 neighbour.
        thres_up = round(thres + e, 10)
        thres_down = round(thres - e, 10)

        # Determining score
        silscore[0] = birch_optim(thres, branch)

        # Determining neighbours and their scores
        if branch_idx + 1 < branchlistlength:
            silscore[1] = birch_optim(thres, branchlist[branch_idx + 1])
        else:
            silscore[1] = -2

        silscore[2] = birch_optim(thres_up, branch)

        if branch_idx - 1 >= 0: # 0 index is the first element
            silscore[3] = birch_optim(thres, branchlist[branch_idx - 1])
        else:
            silscore[3] = -2

        if thres_down >= min_thres:
            silscore[4] = birch_optim(thres_down, branch)
        else:
            silscore[4] = -2

        print('Run: {}, Iteration: {}, Silhouette Score: {:0.4f}, Threshold: {:0.2f}, Branching factor: {}, Elapsed time: {:0.0f},'
          .format(run, iteration, silscore[0], thres, branch, time.time()-start))

        if silscore.max() == -2: # Error
            # Every evaluation failed: retry with a lower threshold, but never
            # below the minimum. Without this bound the loop would keep
            # decreasing the threshold forever.
            if thres_down < min_thres:
                print('Run: {}, no valid configuration found, stopping at threshold {:0.2f}'
                      .format(run, thres))
                break
            thres = thres_down
            continue

        # argmax returns the first maximum, so ties favour staying put
        maxindex = np.argmax(silscore)
        if maxindex == 0:
            break
        if maxindex == 1:
            branch_idx += 1
        elif maxindex == 2:
            thres = thres_up
        elif maxindex == 3:
            branch_idx -= 1
        elif maxindex == 4:
            thres = thres_down
        branch = branchlist[branch_idx]

    print()

Run: 0, Iteration: 0, Silhouette Score: 0.0243, Threshold: 0.20, Branching factor: 171, Elapsed time: 162,
Run: 0, Iteration: 1, Silhouette Score: 0.1138, Threshold: 0.25, Branching factor: 171, Elapsed time: 294,
Run: 0, Iteration: 2, Silhouette Score: 0.1205, Threshold: 0.30, Branching factor: 171, Elapsed time: 412,
Run: 0, Iteration: 3, Silhouette Score: 0.1351, Threshold: 0.35, Branching factor: 171, Elapsed time: 534,
Run: 0, Iteration: 4, Silhouette Score: 0.2021, Threshold: 0.40, Branching factor: 171, Elapsed time: 626,

Run: 1, Iteration: 0, Silhouette Score: 0.0780, Threshold: 0.20, Branching factor: 289, Elapsed time: 778,
Run: 1, Iteration: 1, Silhouette Score: 0.1034, Threshold: 0.20, Branching factor: 297, Elapsed time: 928,
Run: 1, Iteration: 2, Silhouette Score: 0.1248, Threshold: 0.25, Branching factor: 297, Elapsed time: 1051,
Run: 1, Iteration: 3, Silhouette Score: 0.1251, Threshold: 0.30, Branching factor: 297, Elapsed time: 1162,
Run: 1, Iteration: 4, Silhouette S