In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MinMaxScaler

from sklearn.cluster import Birch
from sklearn.metrics import silhouette_score

import numpy as np
import random

import seaborn as sns
import matplotlib.pyplot as plt
import time

import warnings
warnings.filterwarnings("error")

In [2]:
movies_TFIDF = pd.read_csv('3a movies_TFIDF.csv')
movies_GloVe = pd.read_csv('3b movies_GloVe.csv')
country_codes = pd.read_csv('country_codes.csv')

In [3]:
def get_numbers(str):
    """Return the integer numbers from a string in an array format."""
    arr = str.replace(',','').replace('{','').replace('}','').replace('[','').replace(']','').split()
    arr = [int(s) for s in arr if s.isdigit()]
    return arr

def get_country_codes(str):
    """Return the codes of the countries from a string."""
    arr = str.replace(',','').replace('{','').replace('}','').replace('[','').replace(']','').replace("'", "").split()
    arr = [s for s in arr if country_codes['Code'].str.contains(s).any()]
    return arr

def prepare_dataframe(movies):
    
    movies = movies.drop(['id', 'title'], axis=1)
    
    # Categorical values are not used, because of the algorithm
    numeric = movies.drop(['adult', 'genres', 'original_language', 'production_companies', 'production_countries'], axis=1)
    
    # Scale the numeric values to (0, 1) interval
    scaler = MinMaxScaler()
    numeric = scaler.fit_transform(numeric)
    
    movievectors = pd.DataFrame(numeric)
    
    return movievectors

def birch_optim(thres, branch):
    try:
    
        # Determining score

        birch_TFIDF = Birch(threshold=thres, branching_factor=branch, n_clusters=35)
        birch_TFIDF.fit(movies_TFIDF)

        end = time.time()

        tfscore = silhouette_score(movies_TFIDF, birch_TFIDF.predict(movies_TFIDF))

        return tfscore

    except:
        # Error
        return -2

In [4]:
movies_TFIDF = prepare_dataframe(movies_TFIDF)
movies_GloVe = prepare_dataframe(movies_GloVe)

Use silhouette score for test purposes:

In [5]:
# 1. test: TF-IDF vs GloVe model and measure runtime
thres = 0.25
df1 = pd.DataFrame(columns=['TF-IDF', 'TF-IDF runtime',
                                   'GloVe model', 'GloVe model runtime'])

# Birch
start1 = time.time()
birch_TFIDF = Birch(threshold=thres)
birch_TFIDF.fit(movies_TFIDF)
end1 = time.time()
    
start2 = time.time()
birch_GloVe = Birch(threshold=thres)
birch_GloVe.fit(movies_GloVe)
end2 = time.time()
    
tfscore = silhouette_score(movies_TFIDF, birch_TFIDF.predict(movies_TFIDF))
elapsed1 = end1-start1
    
emscore = silhouette_score(movies_GloVe, birch_GloVe.predict(movies_GloVe))
elapsed2 = end2-start2
    
scores = pd.DataFrame([[tfscore, elapsed1, emscore, elapsed2]],
                          columns=['TF-IDF', 'TF-IDF runtime',
                                   'GloVe model', 'GloVe model runtime'])
df1 = pd.concat([df1, scores], axis=0)

In [6]:
df1 = df1.reset_index().drop('index', axis=1)

In [7]:
df1

Unnamed: 0,TF-IDF,TF-IDF runtime,GloVe model,GloVe model runtime
0,0.050342,31.879574,0.124822,16.912234


In [8]:
# 2. test: Optimizing birch clustering parameters with hill-climbing method from different starting points

# Initialize variables
silscore = np.zeros(5)
e = 0.05 # epsilon
branchlistlength = 60
branchlist = np.linspace(2, 500, branchlistlength).round().astype(int)

start = time.time()

for run in range(5):
    iter = -1
    # Random point
    thres = random.randint(2, 10)/20 # 0.1, 0.15, 0.20 ... 0.45, 0.5
    branch = random.choice(branchlist)

    while True:
        iter = iter + 1

        # Determining score
        silscore[0] = birch_optim(thres, branch)

        # Determining neighbours and their scores
        if np.where(branchlist==branch)[0][0]+1 < branchlistlength:
            silscore[1] = birch_optim(thres, branchlist[np.where(branchlist==branch)[0][0]+1])
        else:
            silscore[1] = -2

        silscore[2] = birch_optim(thres+e, branch)

        if np.where(branchlist==branch)[0][0]-1 >= 0: # 0 index is the first element
            silscore[3] = birch_optim(thres, branchlist[np.where(branchlist==branch)[0][0]-1])
        else:
            silscore[3] = -2

        if thres-e >= 0.05:
            silscore[4] = birch_optim(thres-e, branch)
        else:
            silscore[4] = -2

        print('Run: {}, Iteration: {}, Silhouette Score: {:0.4f}, Threshold: {:0.2f}, Branching factor: {}, Elapsed time: {:0.0f},'
          .format(run, iter, silscore[0], thres, branch, time.time()-start))

        if silscore.max() == -2: # Error
            thres = thres-e
            continue

        maxindex = np.argmax(silscore)
        if maxindex == 0:
            break
        if maxindex == 1:
            branch = branchlist[np.where(branchlist==branch)[0][0]+1]
        elif maxindex == 2:
            thres = thres+e
        elif maxindex == 3:
            branch = branchlist[np.where(branchlist==branch)[0][0]-1]
        elif maxindex == 4:
            thres = thres-e
            
    print()

Run: 0, Iteration: 0, Silhouette Score: 0.0288, Threshold: 0.25, Branching factor: 103, Elapsed time: 308,
Run: 0, Iteration: 1, Silhouette Score: 0.0320, Threshold: 0.20, Branching factor: 103, Elapsed time: 760,
Run: 0, Iteration: 2, Silhouette Score: 0.0340, Threshold: 0.15, Branching factor: 103, Elapsed time: 1370,

Run: 1, Iteration: 0, Silhouette Score: 0.0386, Threshold: 0.40, Branching factor: 154, Elapsed time: 1498,
Run: 1, Iteration: 1, Silhouette Score: 0.0586, Threshold: 0.45, Branching factor: 154, Elapsed time: 1615,
Run: 1, Iteration: 2, Silhouette Score: 0.0754, Threshold: 0.50, Branching factor: 154, Elapsed time: 1707,

Run: 2, Iteration: 0, Silhouette Score: 0.0480, Threshold: 0.35, Branching factor: 399, Elapsed time: 1854,

Run: 3, Iteration: 0, Silhouette Score: 0.0489, Threshold: 0.35, Branching factor: 466, Elapsed time: 2002,

Run: 4, Iteration: 0, Silhouette Score: 0.0370, Threshold: 0.40, Branching factor: 500, Elapsed time: 2104,
Run: 4, Iteration: 1, Silh