In [None]:
import lib._util.visualplot as vp

# Pre-processing
from lib._class.DFDuplicateRemoval import DFDuplicateRemoval

# Feature selection
from lib._class.DFVarianceThreshold import DFVarianceThreshold
from lib._class.DFVIFThreshold import DFVIFThreshold

# Feature encoding
from lib._class.DFLabelEncoder import DFLabelEncoder
from lib._class.DFOneHotEncoder import DFOneHotEncoder

# Feature scaling
from lib._class.DFStandardScaler import DFStandardScaler
from lib._class.DFMinMaxScaler import DFMinMaxScaler

# Feature extraction
from lib._class.DFIvis import DFIvis

# Clustering
from lib._class.DFKMeans import DFKMeans
from lib._class.DFKMedoids import DFKMedoids
from lib._class.DFGaussianMixture import DFGaussianMixture
from lib._class.DFAgglomerative import DFAgglomerative
from lib._class.DFDBSCAN import DFDBSCAN

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np

# Scikit-learn
from sklearn.pipeline import Pipeline

# Plotly
import plotly.express as px

# Constant Variable

In [None]:
# Directory that holds the input data files (movies.csv is read from here)
SOURCE_PATH_DATA = 'resources/data/'
# Directory handed to the plotting helpers as `out_path` for this notebook's figures
OUT_PATH_GRAPH   = 'resources/output/graph/movie/'

# Phase 1 - Data Loading
- Reference: https://www.kaggle.com/danielgrijalvas/movies#movies.csv
- budget: the budget of a movie. Some movies don't have this, so it appears as 0
- company: the production company
- country: country of origin
- director: the director
- genre: main genre of the movie.
- gross: revenue of the movie
- name: name of the movie
- rating: rating of the movie (R, PG, etc.)
- released: release date (YYYY-MM-DD)
- runtime: duration of the movie
- score: IMDb user rating
- votes: number of user votes
- star: main actor/actress
- writer: writer of the movie
- year: year of release

In [None]:
# Load the movie dataset. The file is decoded as latin-1, and `released` is passed to
# `parse_dates` so that the `.dt` accessor can be used on it during feature engineering.
data_df = pd.read_csv(f'{SOURCE_PATH_DATA}movies.csv', sep=',', encoding='latin-1',
                      parse_dates=['released'])

# (rows, columns) of the raw data
data_df.shape

In [None]:
data_df.head()

In [None]:
vp.faststat(data_df)

In [None]:
# Standardize letter case: lower-case every object (text) column so that
# differently-cased spellings of the same value are counted together
for column_name, text_values in data_df.select_dtypes(include='object').items():
    data_df[column_name] = text_values.str.lower()

###### Histogram

In [None]:
vp.histogram(data_df,
             bin_algo='count',
             max_col=3,
             title='Phase 1 - Histogram',
             out_path=OUT_PATH_GRAPH,
             layout_kwargs={'height': 1500},
             str_length=5)

# Phase 2 - Data Preparation
- Handle high cardinality features

In [None]:
vp.faststat(data_df.select_dtypes(include='object'))

###### Company

In [None]:
vp.value_count(data_df, 'company')

In [None]:
# Keep companies that occur at least 100 times; every other company becomes 'other'
company_counts     = data_df['company'].value_counts()
frequent_companies = company_counts.index[company_counts >= 100]
data_df['company'] = data_df['company'].where(data_df['company'].isin(frequent_companies), 'other')
del company_counts, frequent_companies

In [None]:
vp.value_count(data_df, 'company')

###### Country

In [None]:
vp.value_count(data_df, 'country')

In [None]:
# Keep countries that occur at least 20 times; every other country becomes 'other'
country_counts     = data_df['country'].value_counts()
frequent_countries = country_counts.index[country_counts >= 20]
data_df['country'] = data_df['country'].where(data_df['country'].isin(frequent_countries), 'other')
del country_counts, frequent_countries

In [None]:
vp.value_count(data_df, 'country')

###### Director

In [None]:
vp.value_count(data_df, 'director')

In [None]:
# Keep directors that occur at least 15 times; every other director becomes 'other'
director_counts     = data_df['director'].value_counts()
frequent_directors  = director_counts.index[director_counts >= 15]
data_df['director'] = data_df['director'].where(data_df['director'].isin(frequent_directors), 'other')
del director_counts, frequent_directors

In [None]:
vp.value_count(data_df, 'director')

###### Genre

In [None]:
vp.value_count(data_df, 'genre')

In [None]:
# Keep genres that occur at least 10 times; every other genre becomes 'other'
genre_counts     = data_df['genre'].value_counts()
frequent_genres  = genre_counts.index[genre_counts >= 10]
data_df['genre'] = data_df['genre'].where(data_df['genre'].isin(frequent_genres), 'other')
del genre_counts, frequent_genres

In [None]:
vp.value_count(data_df, 'genre')

###### Name

In [None]:
vp.value_count(data_df, 'name')

In [None]:
# Drop the movie name: almost every value is unique, so it carries no grouping
# information here and would be better served by text mining
del data_df['name']

###### Rating

In [None]:
vp.value_count(data_df, 'rating')

In [None]:
# Reference:
# - https://en.wikipedia.org/wiki/Motion_Picture_Association_of_America_film_rating_system
# - https://rating-system.fandom.com/wiki/Mexico_movie_rating_system
# - https://en.wikipedia.org/wiki/TV_Parental_Guidelines
#
# Merge ratings from the different rating systems into shared buckets.
# Each entry maps the merged label to the raw labels it absorbs; entries are applied in order.
rating_groups = {
    'nr/ur':       ['not rated', 'unrated'],
    'pg/tv-pg':    ['pg', 'tv-pg'],
    'pg-13/tv-14': ['pg-13', 'tv-14'],
    'nc-17/tv-ma': ['nc-17', 'tv-ma'],
    'r/b/b15':     ['r', 'b', 'b15'],
}
for merged_label, raw_labels in rating_groups.items():
    # Rows whose rating is not one of `raw_labels` (including missing ratings) are left untouched
    data_df['rating'] = data_df['rating'].where(~data_df['rating'].isin(raw_labels), merged_label)
del rating_groups, merged_label, raw_labels

In [None]:
vp.value_count(data_df, 'rating')

###### Star

In [None]:
vp.value_count(data_df, 'star')

In [None]:
# Keep stars that occur at least 25 times; every other star becomes 'other'
star_counts     = data_df['star'].value_counts()
frequent_stars  = star_counts.index[star_counts >= 25]
data_df['star'] = data_df['star'].where(data_df['star'].isin(frequent_stars), 'other')
del star_counts, frequent_stars

In [None]:
vp.value_count(data_df, 'star')

###### Writer

In [None]:
vp.value_count(data_df, 'writer')

In [None]:
# Keep writers that occur at least 10 times; every other writer becomes 'other'
writer_counts     = data_df['writer'].value_counts()
frequent_writers  = writer_counts.index[writer_counts >= 10]
data_df['writer'] = data_df['writer'].where(data_df['writer'].isin(frequent_writers), 'other')
del writer_counts, frequent_writers

In [None]:
vp.value_count(data_df, 'writer')

In [None]:
vp.faststat(data_df)

###### Histogram

In [None]:
vp.histogram(data_df,
             bin_algo='count',
             max_col=3,
             title='Phase 2 - Histogram',
             out_path=OUT_PATH_GRAPH,
             layout_kwargs={'height': 1500},
             str_length=5)

# Phase 3 - Data Preparation
- Feature engineering

In [None]:
# Feature engineering from date: split `released` into its calendar components
data_df['release_year']  = data_df['released'].dt.year
data_df['release_month'] = data_df['released'].dt.month
data_df['release_day']   = data_df['released'].dt.day
data_df['day_of_year']   = data_df['released'].dt.dayofyear
data_df['day_of_week']   = data_df['released'].dt.dayofweek
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in pandas 2.0.
# `dt.isocalendar().week` is its replacement (same ISO week number) but returns a nullable
# UInt32 column, so it is cast to the dtype of the other date parts to keep every derived
# feature a plain NumPy numeric column.
data_df['week_of_year']  = data_df['released'].dt.isocalendar().week.astype(data_df['release_year'].dtype)
data_df['quarter']       = data_df['released'].dt.quarter

# Remove the original date feature now that its components have been extracted
data_df.drop(columns=['released'], inplace=True)

In [None]:
vp.faststat(data_df)

###### Histogram

In [None]:
vp.histogram(data_df,
             bin_algo='count',
             max_col=3,
             title='Phase 3 - Histogram',
             out_path=OUT_PATH_GRAPH,
             layout_kwargs={'height': 2000},
             str_length=5)

# Phase 4 - Data Preparation
- Remove duplicates
- Feature selection

In [None]:
X = data_df.copy()

X.shape

In [None]:
# Remove duplicated data
duplicate_removal = DFDuplicateRemoval()
X = duplicate_removal.fit_transform(X)

X.shape

In [None]:
duplicate_removal.duplicate_df

In [None]:
# Remove low-variance features
label_encoder      = DFLabelEncoder(columns=X.select_dtypes(include='object').columns)
variance_threshold = DFVarianceThreshold(threshold=.01)

steps = [
    ('label_encoder', label_encoder),
    ('variance_threshold', variance_threshold),
]
variance_pipeline = Pipeline(steps)
variance_pipeline.fit(X)

# The pipeline is only fitted, never used to transform X: the rejected columns are dropped
# from X directly, which avoids an inverse_transform that would lose the original data types
rejected_mask         = ~variance_threshold.stat_df['support']
low_variance_features = variance_threshold.stat_df.loc[rejected_mask, 'feature']
X.drop(columns=low_variance_features, inplace=True)
del variance_pipeline, rejected_mask, low_variance_features

In [None]:
variance_threshold.stat_df[~variance_threshold.stat_df['support']]

In [None]:
# Split the remaining columns by dtype; both lists configure the encoder and scalers below
CATEGORICAL_FEATURES = X.select_dtypes(include='object').columns.tolist()
NUMERICAL_FEATURES   = X.select_dtypes(include='number').columns.tolist()

CATEGORICAL_FEATURES, NUMERICAL_FEATURES

In [None]:
# Remove high VIF feature
# Categorical columns are one-hot encoded and numerical columns are scaled before the
# VIF-based selection step; X is replaced by the output of the whole pipeline.
onehot_encoder  = DFOneHotEncoder(columns=CATEGORICAL_FEATURES, dtype='byte')
standard_scaler = DFStandardScaler(columns=NUMERICAL_FEATURES)
minmax_scaler   = DFMinMaxScaler(columns=NUMERICAL_FEATURES)
vif_threshold   = DFVIFThreshold(show_progress=True)

# NOTE(review): both scalers are applied to the same numerical columns, one after the other.
# If the min-max step simply rescales to a fixed range, the preceding standard scaling has no
# effect on the final values — confirm that both steps are intended.
steps = [
    ('onehot_encoder', onehot_encoder),
    ('standard_scaler', standard_scaler),
    ('minmax_scaler', minmax_scaler),
    ('vif_threshold', vif_threshold),
]
X = Pipeline(steps).fit_transform(X)

# (rows, columns) after encoding, scaling and VIF-based selection
X.shape

In [None]:
vp.faststat(X)

###### Bar

In [None]:
# Compute the VIF of the features that survived selection and plot it as a bar chart
vif_df = vif_threshold.calc_vif(X)
# Move the index into a regular column named 'feature' so it can be used as the x axis
vif_df.reset_index(inplace=True)
vif_df.rename(columns={'index': 'feature'}, inplace=True)
# Exclude the 'const' row so that only real features are plotted
vif_df = vif_df[vif_df['feature'] != 'const'].copy()

fig = px.bar(vif_df, x='feature', y='VIF')
vp.generate_plot(fig,
                 out_path=OUT_PATH_GRAPH,
                 out_filename='Phase 4 - Bar - VIF')
del vif_df

# Phase 5 - Data Preparation
- Feature extraction

In [None]:
ivis    = DFIvis(k=15, n_epochs_without_progress=20, model='maaten', verbose=2, epochs=100)
ivis_df = ivis.fit_transform(X)

ivis_df.shape

In [None]:
vp.faststat(ivis_df)

###### Scatter

In [None]:
vp.scatter(ivis_df,
           xy_tuples=[('ivis_0', 'ivis_1')],
           max_col=1,
           title='Phase 5 - Scatter',
           out_path=OUT_PATH_GRAPH)

# Phase 6 - Clustering
- K-Means

In [None]:
# Determine number of clusters
kmeans = DFKMeans(cluster_name='KMeans', n_clusters=15, random_state=0, n_jobs=-1,
                  eval_inertia=True, eval_silhouette=True, eval_chi=True, eval_dbi=True)
kmeans.fit(ivis_df)

###### Line

In [None]:
vp.line(kmeans.eval_df,
        xy_tuples=[('n_cluster', x) for x in ['inertia', 'silhouette', 'calinski_harabasz', 'davies_bouldin']],
        max_col=2,
        title='Phase 6 - N Cluster - K-Means',
        out_path=OUT_PATH_GRAPH)

In [None]:
# Determine number of clusters by scores: n_cluster at the highest silhouette,
# the highest Calinski-Harabasz and the lowest Davies-Bouldin score
tuple(
    kmeans.eval_df.loc[getattr(kmeans.eval_df[metric], best)()]['n_cluster']
    for metric, best in [('silhouette', 'idxmax'),
                         ('calinski_harabasz', 'idxmax'),
                         ('davies_bouldin', 'idxmin')]
)

In [None]:
# Clustering
kmeans     = DFKMeans(cluster_name='KMeans', n_clusters=2, random_state=0, n_jobs=-1)
cluster_df = kmeans.fit_predict(ivis_df)

vp.value_count(cluster_df, 'KMeans')

###### Scatter

In [None]:
# Sort by cluster label and cast it to str so the label is treated as a category when colouring
scatter_df = cluster_df.sort_values(by='KMeans').astype({'KMeans': str})

vp.scatter(
    scatter_df,
    xy_tuples=[('ivis_0', 'ivis_1')],
    color='KMeans',
    max_col=1,
    title='Phase 6 - Scatter - K-Means',
    out_path=OUT_PATH_GRAPH,
)

del scatter_df

###### Box

In [None]:
# Put the K-Means label next to the prepared movie features (concat aligns on the index)
# and cast the label to str so it is treated as a category.
# NOTE(review): data_df still contains the rows removed as duplicates in Phase 4 —
# confirm that cluster_df and data_df share the same index.
box_df = pd.concat([cluster_df[['KMeans']], data_df], axis=1).astype({'KMeans': str})

box_layout = {'showlegend': False, 'height': 1500}
vp.box(
    box_df,
    color='KMeans',
    max_col=2,
    title='Phase 6 - Box - K-Means',
    out_path=OUT_PATH_GRAPH,
    layout_kwargs=box_layout,
)

del box_df, box_layout

###### Distribution Matrix

In [None]:
# Distribution matrix of every feature against the K-Means label; heatmap colours are pinned to [0, 1]
vp.distmat(
    pd.concat([cluster_df[['KMeans']], data_df], axis=1),
    target='KMeans',
    title='Phase 6 - Distribution Matrix - K-Means',
    out_path=OUT_PATH_GRAPH,
    heatmap_kwargs={'colorscale': 'Dense', 'zmin': 0, 'zmax': 1},
)

# Phase 7 - Clustering
- Gaussian mixtures

In [None]:
# Determine number of clusters
gmm = DFGaussianMixture(cluster_name='GMM', n_components=15, random_state=0,
                        eval_aic=True, eval_bic=True, eval_silhouette=True, eval_chi=True, eval_dbi=True)
gmm.fit(ivis_df)

###### Line

In [None]:
vp.line(gmm.eval_df,
        xy_tuples=[('n_cluster', x) for x in ['akaike', 'bayesian', 'silhouette', 'calinski_harabasz', 'davies_bouldin']],
        max_col=3,
        title='Phase 7 - N Cluster - GMM',
        out_path=OUT_PATH_GRAPH)

In [None]:
# Determine number of clusters by scores: n_cluster at the lowest AIC / BIC, the highest
# silhouette / Calinski-Harabasz and the lowest Davies-Bouldin score
tuple(
    gmm.eval_df.loc[getattr(gmm.eval_df[metric], best)()]['n_cluster']
    for metric, best in [('akaike', 'idxmin'),
                         ('bayesian', 'idxmin'),
                         ('silhouette', 'idxmax'),
                         ('calinski_harabasz', 'idxmax'),
                         ('davies_bouldin', 'idxmin')]
)

In [None]:
# Clustering
gmm        = DFGaussianMixture(cluster_name='GMM', n_components=2, random_state=0)
cluster_df = gmm.fit_predict(ivis_df)

vp.value_count(cluster_df, 'GMM')

###### Scatter

In [None]:
# Sort by cluster label and cast it to str so the label is treated as a category when colouring
scatter_df = cluster_df.sort_values(by='GMM').astype({'GMM': str})

vp.scatter(
    scatter_df,
    xy_tuples=[('ivis_0', 'ivis_1')],
    color='GMM',
    max_col=1,
    title='Phase 7 - Scatter - GMM',
    out_path=OUT_PATH_GRAPH,
)

del scatter_df

###### Box

In [None]:
# Put the GMM label next to the prepared movie features (concat aligns on the index)
# and cast the label to str so it is treated as a category.
# NOTE(review): data_df still contains the rows removed as duplicates in Phase 4 —
# confirm that cluster_df and data_df share the same index.
box_df = pd.concat([cluster_df[['GMM']], data_df], axis=1).astype({'GMM': str})

box_layout = {'showlegend': False, 'height': 1500}
vp.box(
    box_df,
    color='GMM',
    max_col=2,
    title='Phase 7 - Box - GMM',
    out_path=OUT_PATH_GRAPH,
    layout_kwargs=box_layout,
)

del box_df, box_layout

###### Distribution Matrix

In [None]:
# Distribution matrix of every feature against the GMM label; heatmap colours are pinned to [0, 1]
vp.distmat(
    pd.concat([cluster_df[['GMM']], data_df], axis=1),
    target='GMM',
    title='Phase 7 - Distribution Matrix - GMM',
    out_path=OUT_PATH_GRAPH,
    heatmap_kwargs={'colorscale': 'Dense', 'zmin': 0, 'zmax': 1},
)

# Phase 8 - Clustering
- Agglomerative

In [None]:
# Reference: https://stackabuse.com/hierarchical-clustering-with-python-and-scikit-learn/
# Determine number of clusters
agglo = DFAgglomerative(cluster_name='Agglo', n_clusters=15, random_state=0,
                        eval_silhouette=True, eval_chi=True, eval_dbi=True)
agglo.fit(ivis_df)

###### Line

In [None]:
vp.line(agglo.eval_df,
        xy_tuples=[('n_cluster', x) for x in ['silhouette', 'calinski_harabasz', 'davies_bouldin']],
        max_col=2,
        title='Phase 8 - N Cluster - Agglomerative',
        out_path=OUT_PATH_GRAPH)

###### Dendrogram

In [None]:
vp.dendrogram(ivis_df,
              title='Phase 8 - Dendrogram - Agglomerative',
              out_path=OUT_PATH_GRAPH,
              layout_kwargs={
                  'width': 1350,
                  'height': 600
              })

In [None]:
# Determine number of clusters by scores: n_cluster at the highest silhouette,
# the highest Calinski-Harabasz and the lowest Davies-Bouldin score
tuple(
    agglo.eval_df.loc[getattr(agglo.eval_df[metric], best)()]['n_cluster']
    for metric, best in [('silhouette', 'idxmax'),
                         ('calinski_harabasz', 'idxmax'),
                         ('davies_bouldin', 'idxmin')]
)

In [None]:
# Clustering
agglo      = DFAgglomerative(cluster_name='Agglo', n_clusters=2, random_state=0)
cluster_df = agglo.fit_predict(ivis_df)

vp.value_count(cluster_df, 'Agglo')

###### Scatter

In [None]:
# Sort by cluster label and cast it to str so the label is treated as a category when colouring
scatter_df = cluster_df.sort_values(by='Agglo').astype({'Agglo': str})

vp.scatter(
    scatter_df,
    xy_tuples=[('ivis_0', 'ivis_1')],
    color='Agglo',
    max_col=1,
    title='Phase 8 - Scatter - Agglomerative',
    out_path=OUT_PATH_GRAPH,
)

del scatter_df

###### Box

In [None]:
# Put the agglomerative label next to the prepared movie features (concat aligns on the index)
# and cast the label to str so it is treated as a category.
# NOTE(review): data_df still contains the rows removed as duplicates in Phase 4 —
# confirm that cluster_df and data_df share the same index.
box_df = pd.concat([cluster_df[['Agglo']], data_df], axis=1).astype({'Agglo': str})

box_layout = {'showlegend': False, 'height': 1500}
vp.box(
    box_df,
    color='Agglo',
    max_col=2,
    title='Phase 8 - Box - Agglomerative',
    out_path=OUT_PATH_GRAPH,
    layout_kwargs=box_layout,
)

del box_df, box_layout

###### Distribution Matrix

In [None]:
# Distribution matrix of every feature against the agglomerative label; heatmap colours are pinned to [0, 1]
vp.distmat(
    pd.concat([cluster_df[['Agglo']], data_df], axis=1),
    target='Agglo',
    title='Phase 8 - Distribution Matrix - Agglomerative',
    out_path=OUT_PATH_GRAPH,
    heatmap_kwargs={'colorscale': 'Dense', 'zmin': 0, 'zmax': 1},
)

# Phase 9 - Clustering
- DBSCAN

In [None]:
# Determine hyperparameters with highest score
dbscan = DFDBSCAN(cluster_name='DBSCAN', random_state=0, n_jobs=-1,
                  eps_samples_tuples=[(round(x,5), 5) for x in np.arange(.1, 2, .1)],
                  eval_cluster=True, eval_silhouette=True, eval_chi=True, eval_dbi=True)
dbscan.fit(ivis_df)

###### Line

In [None]:
vp.line(dbscan.eval_df,
        xy_tuples=[('eps', x) for x in ['n_cluster', 'n_noise',
                                        'silhouette', 'silhouette_w/o_noise',
                                        'calinski_harabasz', 'calinski_harabasz_w/o_noise',
                                        'davies_bouldin', 'davies_bouldin_w/o_noise']],
        max_col=4,
        title='Phase 9 - EPS Evaluation - DBSCAN',
        out_path=OUT_PATH_GRAPH)

In [None]:
# Determine EPS by scores: eps at the highest silhouette / Calinski-Harabasz and the lowest
# Davies-Bouldin score, each evaluated both with and without noise points
tuple(
    dbscan.eval_df.loc[getattr(dbscan.eval_df[metric], best)()]['eps']
    for metric, best in [('silhouette', 'idxmax'),
                         ('silhouette_w/o_noise', 'idxmax'),
                         ('calinski_harabasz', 'idxmax'),
                         ('calinski_harabasz_w/o_noise', 'idxmax'),
                         ('davies_bouldin', 'idxmin'),
                         ('davies_bouldin_w/o_noise', 'idxmin')]
)

In [None]:
# Clustering
dbscan     = DFDBSCAN(cluster_name='DBSCAN', random_state=0, n_jobs=-1, eps=1.5, min_samples=5)
cluster_df = dbscan.fit_predict(ivis_df)

vp.value_count(cluster_df, 'DBSCAN')

###### Scatter

In [None]:
# Sort by cluster label and cast it to str so the label is treated as a category when colouring
scatter_df = cluster_df.sort_values(by='DBSCAN').astype({'DBSCAN': str})

vp.scatter(
    scatter_df,
    xy_tuples=[('ivis_0', 'ivis_1')],
    color='DBSCAN',
    max_col=1,
    title='Phase 9 - Scatter - DBSCAN',
    out_path=OUT_PATH_GRAPH,
)

del scatter_df

###### Box

In [None]:
# Put the DBSCAN label next to the prepared movie features (concat aligns on the index)
# and cast the label to str so it is treated as a category.
# NOTE(review): data_df still contains the rows removed as duplicates in Phase 4 —
# confirm that cluster_df and data_df share the same index.
box_df = pd.concat([cluster_df[['DBSCAN']], data_df], axis=1).astype({'DBSCAN': str})

box_layout = {'showlegend': False, 'height': 1500}
vp.box(
    box_df,
    color='DBSCAN',
    max_col=2,
    title='Phase 9 - Box - DBSCAN',
    out_path=OUT_PATH_GRAPH,
    layout_kwargs=box_layout,
)

del box_df, box_layout

###### Distribution Matrix

In [None]:
# Distribution matrix of every feature against the DBSCAN label; heatmap colours are pinned to [0, 1]
vp.distmat(
    pd.concat([cluster_df[['DBSCAN']], data_df], axis=1),
    target='DBSCAN',
    title='Phase 9 - Distribution Matrix - DBSCAN',
    out_path=OUT_PATH_GRAPH,
    heatmap_kwargs={'colorscale': 'Dense', 'zmin': 0, 'zmax': 1},
)

# Phase 10 - Clustering
- K-Medoids

In [None]:
# Determine number of clusters
kmedoids = DFKMedoids(cluster_name='KMedoids', n_clusters=15, random_state=0,
                      eval_inertia=True, eval_silhouette=True, eval_chi=True, eval_dbi=True)
kmedoids.fit(ivis_df)

###### Line

In [None]:
vp.line(kmedoids.eval_df,
        xy_tuples=[('n_cluster', x) for x in ['inertia', 'silhouette', 'calinski_harabasz', 'davies_bouldin']],
        max_col=2,
        title='Phase 10 - N Cluster - K-Medoids',
        out_path=OUT_PATH_GRAPH)

In [None]:
# Determine number of clusters by scores: n_cluster at the highest silhouette,
# the highest Calinski-Harabasz and the lowest Davies-Bouldin score
tuple(
    kmedoids.eval_df.loc[getattr(kmedoids.eval_df[metric], best)()]['n_cluster']
    for metric, best in [('silhouette', 'idxmax'),
                         ('calinski_harabasz', 'idxmax'),
                         ('davies_bouldin', 'idxmin')]
)

In [None]:
# Clustering
# Use the K-Medoids wrapper here: this cell previously instantiated DFKMeans, so the
# "K-Medoids" labels and plots of this phase were actually K-Means results.
# `n_jobs` is not passed, matching the DFKMedoids evaluation call earlier in this phase.
kmedoids   = DFKMedoids(cluster_name='KMedoids', n_clusters=6, random_state=0)
cluster_df = kmedoids.fit_predict(ivis_df)

# Number of movies assigned to each K-Medoids cluster
vp.value_count(cluster_df, 'KMedoids')

###### Scatter

In [None]:
# Sort by cluster label and cast it to str so the label is treated as a category when colouring
scatter_df = cluster_df.sort_values(by='KMedoids').astype({'KMedoids': str})

vp.scatter(
    scatter_df,
    xy_tuples=[('ivis_0', 'ivis_1')],
    color='KMedoids',
    max_col=1,
    title='Phase 10 - Scatter - K-Medoids',
    out_path=OUT_PATH_GRAPH,
)

del scatter_df

###### Box

In [None]:
# Put the K-Medoids label next to the prepared movie features (concat aligns on the index)
# and cast the label to str so it is treated as a category.
# NOTE(review): data_df still contains the rows removed as duplicates in Phase 4 —
# confirm that cluster_df and data_df share the same index.
box_df = pd.concat([cluster_df[['KMedoids']], data_df], axis=1).astype({'KMedoids': str})

box_layout = {'showlegend': False, 'height': 1500}
vp.box(
    box_df,
    color='KMedoids',
    max_col=2,
    title='Phase 10 - Box - K-Medoids',
    out_path=OUT_PATH_GRAPH,
    layout_kwargs=box_layout,
)

del box_df, box_layout

###### Distribution Matrix

In [None]:
# Distribution matrix of every feature against the K-Medoids label; heatmap colours are pinned to [0, 1]
vp.distmat(
    pd.concat([cluster_df[['KMedoids']], data_df], axis=1),
    target='KMedoids',
    title='Phase 10 - Distribution Matrix - K-Medoids',
    out_path=OUT_PATH_GRAPH,
    heatmap_kwargs={'colorscale': 'Dense', 'zmin': 0, 'zmax': 1},
)