In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [2]:
# Load the movie metadata table; the path is relative to the notebook's working directory.
movies = pd.read_csv('movies_metadata_withcomponents.csv')

In [12]:
# Optional one-off cleanup (currently disabled): drop every row that still contains a NaN and overwrite the CSV in place
# movies.dropna(inplace = True)
# movies.to_csv('movies_metadata_withcomponents.csv', index=False)

In [3]:
def get_numbers(str):
    """Return the integers that appear as stand-alone tokens in a string.

    Commas, braces and square brackets are removed, the remainder is split on
    whitespace, and every token consisting solely of decimal digits is
    converted with ``int``.  Quotes are *not* removed, so a number glued to a
    quote character (e.g. ``'54'``) is ignored.

    Example: ``get_numbers("[{'id': 16}, {'id': 35}]")`` returns ``[16, 35]``.

    Parameters
    ----------
    str : str
        Text to scan.  (The name shadows the built-in ``str``; it is kept so
        that existing callers keep working.)

    Returns
    -------
    list of int
        The numbers found, in order of appearance; duplicates are kept.
    """
    cleaned = str
    for char in ',{}[]':
        cleaned = cleaned.replace(char, '')
    # ``isdecimal`` rather than ``isdigit``: the latter also accepts characters
    # such as a superscript two, which ``int`` rejects with ValueError.
    return [int(token) for token in cleaned.split() if token.isdecimal()]

def get_country_codes(str, valid_codes=None):
    """Return the tokens of a string that are known country codes.

    Commas, braces, square brackets and single quotes are removed, the
    remainder is split on whitespace, and each token is kept only if it is
    *exactly* equal to one of the known codes.

    Parameters
    ----------
    str : str
        Text to scan.  (The name shadows the built-in ``str``; it is kept so
        that existing callers keep working.)
    valid_codes : iterable, optional
        The codes to accept.  When omitted, the ``'Code'`` column of the
        notebook-level ``country_codes`` frame is used.

    Returns
    -------
    list of str
        The matching tokens, in order of appearance; duplicates are kept.
    """
    if valid_codes is None:
        # Fall back to the global lookup table; missing entries (NaN) are dropped.
        # NOTE(review): assumes the 'Code' column uses the same code format as
        # the tokens in the input strings -- confirm against country_codes.csv.
        valid_codes = country_codes['Code'].dropna()
    # Exact set membership instead of ``Series.str.contains``: the latter
    # interprets every token as a regular expression and matches substrings,
    # so a fragment of a code was accepted, a token with an unbalanced '('
    # raised an error, and the whole table was scanned once per token.
    known = set(valid_codes)

    cleaned = str
    for char in ",{}[]'":
        cleaned = cleaned.replace(char, '')
    return [token for token in cleaned.split() if token in known]

In [4]:
# Lookup table of country codes; get_country_codes() reads it as a global.
country_codes = pd.read_csv('country_codes.csv')

# Parse the stringified columns into Python lists.
# Note: this overwrites the columns in place, so the cell is meant to be run
# once per freshly loaded `movies` frame.
movies['genres'] = movies['genres'].apply(get_numbers)
movies['production_companies'] = movies['production_companies'].apply(get_numbers)
movies['production_countries'] = movies['production_countries'].apply(get_country_codes)  # This takes time

In [5]:
# One-hot encode the list-valued columns (one indicator column per distinct value).
# NOTE(review): `mlb.classes` is the constructor argument (left at its default,
# None), not the fitted `mlb.classes_`. The frames therefore get positional
# integer column labels 0..n-1, which the renaming step below relies on.
# Confirm this is intended before switching to `classes_`.
mlb = MultiLabelBinarizer()
genres = mlb.fit_transform(movies['genres'])
genres = pd.DataFrame(genres, columns=mlb.classes)

mlb = MultiLabelBinarizer()
production_companies = mlb.fit_transform(movies['production_companies'])
production_companies = pd.DataFrame(production_companies, columns=mlb.classes)

mlb = MultiLabelBinarizer()
production_countries = mlb.fit_transform(movies['production_countries'])
production_countries = pd.DataFrame(production_countries, columns=mlb.classes)

# The language is a single value per movie, so plain dummies are enough.
original_language = pd.get_dummies(data=movies['original_language'])

In [6]:
# Replace the positional integer column labels with unique, descriptive names
# ("0. genre", "1. genre", ... / "0. country", "1. country", ...).
genres.rename(
    columns={i: f'{i}. genre' for i in range(genres.shape[1])},
    inplace=True,
)

production_countries.rename(
    columns={i: f'{i}. country' for i in range(production_countries.shape[1])},
    inplace=True,
)

In [7]:
# Assemble the feature matrix: the remaining columns of `movies` plus the
# indicator columns, aligned on the row index.
# production_companies is not used in this version
movievectors = (
    movies
    .drop(columns=['genres', 'production_companies', 'production_countries',
                   'original_language', 'id', 'title'])
    .join(genres)
    .join(production_countries)
    .join(original_language)
)

In [8]:
# K-means clustering of the movie feature vectors into 30 clusters.
# `random_state` is fixed so that Restart & Run All reproduces the same
# clusters; without it the centroid initialisation differs on every run.
# NOTE(review): the features are not scaled, so columns with large magnitudes
# will dominate the Euclidean distances -- confirm that this is intended.
# NOTE(review): rows containing NaN are dropped here only, so `kmeans.labels_`
# lines up with `movievectors.dropna()`, not necessarily with `movievectors`.
kmeans = KMeans(n_clusters=30, random_state=42)
kmeans.fit(movievectors.dropna())

KMeans(n_clusters=30)

In [9]:
# Inspect the fitted centroids (one row per cluster, one column per feature).
kmeans.cluster_centers_

array([[ 1.00000000e+00,  3.66121109e+07,  7.90743149e+00, ...,
         2.60208521e-18,  1.91570881e-02, -5.42101086e-20],
       [ 1.00000000e+00,  1.57432432e+08,  1.65881801e+01, ...,
        -1.08420217e-19,  1.73472348e-18, -3.38813179e-21],
       [ 1.00000000e+00,  1.88268797e+05,  1.94390182e+00, ...,
         8.04609635e-04,  9.05834718e-03,  2.59551495e-05],
       ...,
       [ 1.00000000e+00,  5.89583333e+07,  4.02543072e+01, ...,
         0.00000000e+00,  0.00000000e+00,  3.38813179e-21],
       [ 1.00000000e+00,  8.82459375e+07,  1.71402397e+01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.00000000e+00,  1.49986977e+07,  1.00625638e+01, ...,
         1.62630326e-18,  1.74563591e-02, -3.72694497e-20]])