In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd

# 0. Importing CSV

In [4]:
streaming_platforms = pd.read_csv("./Documents/STREAMING_PLATFORMS.csv")

# 1. Reviews cleaning

In [6]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

nlp = spacy.load('en')
parser = English()

In [31]:
import re

def spacy_tokenizer(sentence):

    tokens = parser(sentence)
    
    filtered_tokens = []
    for word in tokens:
        lemma = word.lemma_.lower().strip()
        
        if lemma not in STOP_WORDS and re.search('^[a-zA-Z]+$', lemma):
            filtered_tokens.append(lemma)

    return filtered_tokens

In [8]:
spacy_tokenizer(streaming_platforms["First Review"][1])[:15]

['sci',
 'fi',
 'fan',
 'know',
 'westworld',
 'base',
 'feature',
 'film',
 'write',
 'direct',
 'michael',
 'crichton',
 'premise',
 'basically',
 'future']

# 2. TD-IDF

In [25]:
tfidf_vectorizer = TfidfVectorizer(min_df=0.5, tokenizer=spacy_tokenizer)

tfidf_matrix = tfidf_vectorizer.fit_transform(streaming_platforms["First Review"])

tfidf_matrix.shape

(162, 4)

In [26]:
terms = tfidf_vectorizer.get_feature_names()

terms

['character', 'good', 'like', 'watch']

# 3. Clustering

# Kmeans

In [27]:
true_k = 5
model = KMeans(n_clusters=true_k, init='random', max_iter=100, n_init=1)
model.fit(tfidf_matrix)

KMeans(algorithm='auto', copy_x=True, init='random', max_iter=100, n_clusters=5,
       n_init=1, n_jobs=None, precompute_distances='auto', random_state=None,
       tol=0.0001, verbose=0)

In [28]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names()

In [29]:
order_centroids

array([[1, 2, 0, 3],
       [2, 3, 1, 0],
       [1, 3, 2, 0],
       [0, 3, 2, 1],
       [3, 1, 2, 0]])

In [30]:
for i in range(true_k):
    print("Cluster %d:" % i),
for ind in order_centroids[i, :10]:
    print(" %s" % terms[ind])

Cluster 0:
Cluster 1:
Cluster 2:
Cluster 3:
Cluster 4:
 watch
 good
 like
 character
