In [1]:
import json
import re

import gensim
import numpy as np
import pandas as pd
from gensim.models import Word2Vec,KeyedVectors
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [3]:
# Load the raw JSON export. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) so decoding does not depend on the platform locale.
with open('mixed.json', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
# Tabulate the "results" payload of the loaded JSON. The cells below rely on
# it providing the "d" and "t" columns.
df = pd.DataFrame(data['results'])

In [5]:
# Merge the two text fields into a single lower-cased column "d".
# Missing values are replaced with "" first: otherwise a NaN in either field
# makes the whole concatenation NaN, which later breaks tokenisation.
# NOTE: this overwrites "d" in place, so re-running the cell appends "t" a
# second time -- re-run from the DataFrame construction cell instead.
df["d"] = (df["d"].fillna("") + " " + df["t"].fillna("")).str.lower()

In [6]:
# Preview the merged, lower-cased text column.
df["d"]

0      the best of films and movie trailers. pick you...
1      duration:     3:11    posted:     3 days ago  ...
2      all new movie trailers are here! don't miss th...
3      duration:     8:33    posted:     26-feb-2012 ...
4      duration:     1:31:46    posted:     1 day ago...
                             ...                        
193    3 months free - offer terms & conditions: this...
194    we have over 15 years worth of information, ex...
195    music, film, tv and political news coverage. m...
196    get up to the minute news and reviews for all ...
197    explore the fundamentals of music via ableton'...
Name: d, Length: 198, dtype: object

In [7]:
# English stop words from NLTK's stopwords corpus; used by remove_stopwords
# below. (The corpus must be available locally, e.g. via nltk.download.)
en_sw = stopwords.words('english')

def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document, in order.
    stop_words : iterable of str, optional
        Words to drop. Defaults to the module-level ``en_sw`` list, which
        keeps the original single-argument call working unchanged.

    Returns
    -------
    list of str
        The surviving tokens, in their original order. Matching is exact
        (case-sensitive), so callers should lower-case beforehand.
    """
    if stop_words is None:
        stop_words = en_sw
    # A set gives O(1) membership tests instead of scanning the list per token.
    blocked = frozenset(stop_words)
    return [token for token in text if token not in blocked]

# Runs of two or more ASCII letters; compiled once instead of once per document.
_WORD_RE = re.compile(r"[a-zA-Z]{2,}")


def remove_pun(text):
    """Keep only alphabetic word fragments from a list of tokens.

    The tokens are joined with single spaces and every run of two or more
    ASCII letters is extracted. Besides punctuation this therefore also drops
    digits and single-letter fragments, and splits tokens at any non-letter
    character (e.g. ``"26-feb-2012"`` -> ``["feb"]``).

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        The extracted letter runs, in order of appearance.
    """
    return _WORD_RE.findall(' '.join(text))

In [8]:
# Run the cleaning stages in order on every document: tokenise the text,
# drop stop words, then keep only alphabetic fragments. Each stage's result
# is written back to "d" before the next stage runs.
for cleaning_step in (word_tokenize, remove_stopwords, remove_pun):
    df["d"] = df["d"].apply(cleaning_step)

In [9]:
# Keep a handle on the cleaned token lists and show the first few of them.
processed_data = df["d"]
print(processed_data.head())

0    [best, films, movie, trailers, pick, favorite,...
1    [duration, posted, days, ago, video, marvel, s...
2    [new, movie, trailers, miss, latest, movie, tr...
3    [duration, posted, feb, video, movie, movie, y...
4    [duration, posted, day, ago, video, tippu, hin...
Name: d, dtype: object


In [12]:
import gensim.downloader as api

In [13]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's
# downloader. NOTE(review): this is a large download on first use -- presumably
# cached locally by gensim afterwards; confirm before re-running on a new machine.
model = api.load('word2vec-google-news-300')



In [14]:
import pickle
# Persist the loaded vectors so later sessions can skip the download.
# The context manager guarantees the handle is closed even if dumping fails.
# NOTE(review): gensim also offers its own model.save(...) persistence, which
# may be preferable to a raw pickle for a model this large.
with open('model_pretrained', 'wb') as file:
    pickle.dump(model, file)

In [None]:
### Vectorise documents: average the word vectors of each document's tokens

In [15]:
def get_vectors(data,model):
    """Turn each tokenised document into a single averaged vector.

    Parameters
    ----------
    data : iterable of iterables of str
        One token sequence per document.
    model : mapping-like
        Must support ``token in model``, ``model[token]`` and expose
        ``vector_size``.

    Returns
    -------
    list of numpy.ndarray
        One vector per document, in input order: the element-wise mean of the
        vectors of all tokens known to ``model``, or a zero vector of length
        ``model.vector_size`` when no token of the document is known.
    """
    def _document_vector(tokens):
        # Fallback for documents without any in-vocabulary token.
        fallback = np.zeros(model.vector_size)
        known = [model[word] for word in tokens if word in model]
        if not known:
            return fallback
        return np.asarray(known).mean(axis=0)

    return [_document_vector(tokens) for tokens in data]

In [16]:
# One averaged embedding per cleaned document, in the order of processed_data.
vectorized_docs = get_vectors(processed_data, model=model)

In [17]:
# Spot-check the first three document vectors.
print(vectorized_docs[:3])

[array([ 0.0442627 ,  0.00627136, -0.00957031,  0.13442154, -0.03109741,
        0.00175209,  0.05312805, -0.02211304,  0.10891113, -0.01000671,
       -0.01262512, -0.13760376,  0.01807098,  0.00377674, -0.08588753,
        0.09996338,  0.07568817,  0.04654389, -0.01003265, -0.0955101 ,
        0.07974014,  0.07895432, -0.00938416,  0.09025421,  0.00799561,
       -0.02875519, -0.06091101,  0.10482559,  0.08783264, -0.09823914,
       -0.1409668 ,  0.03922733, -0.01544495, -0.00385132,  0.04980965,
       -0.05741272,  0.11453857,  0.02060547,  0.00414124,  0.05859985,
        0.11910401, -0.08938599,  0.08264466,  0.08771362, -0.03858338,
       -0.05410156,  0.03924561,  0.00602875,  0.03953476, -0.05562592,
       -0.06053619, -0.05065002, -0.01528091,  0.07638998, -0.03615723,
       -0.06508446,  0.00566101, -0.01993561, -0.0006218 , -0.00879402,
        0.0728241 ,  0.07418518, -0.11555481, -0.03594971, -0.08305969,
       -0.01700287, -0.13304444,  0.00856628,  0.10038223,  0.0

In [18]:
def train_model(k,X,random_state=42):
    """Fit K-Means with ``k`` clusters on ``X`` and report its silhouette score.

    Parameters
    ----------
    k : int
        Number of clusters.
    X : array-like
        Feature vectors to cluster, one row per sample.
    random_state : int or None, default 42
        Seed for the K-Means initialisation, so that repeated runs give the
        same scores. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    float
        The silhouette score of the fitted clustering (also printed, rounded
        to two decimals).
    """
    # n_init is pinned so results do not change with the library's default.
    km = KMeans(n_clusters=k, n_init=10, random_state=random_state).fit(X)
    print(f"For n_clusters = {k}")
    score = silhouette_score(X, km.labels_)
    print(f"Silhouette score: {score:0.2f}")
    return score

In [19]:
# Sweep the even cluster counts 2, 4, ..., 48 and print the silhouette score
# of each fit to compare candidate values of k.
for i in range(2,50,2):
    train_model(i,vectorized_docs)

For n_clusters = 2
Silhouette score: 0.13
For n_clusters = 4
Silhouette score: 0.14
For n_clusters = 6
Silhouette score: 0.12
For n_clusters = 8
Silhouette score: 0.11
For n_clusters = 10
Silhouette score: 0.11
For n_clusters = 12
Silhouette score: 0.12
For n_clusters = 14
Silhouette score: 0.09
For n_clusters = 16
Silhouette score: 0.11
For n_clusters = 18
Silhouette score: 0.09
For n_clusters = 20
Silhouette score: 0.10
For n_clusters = 22
Silhouette score: 0.12
For n_clusters = 24
Silhouette score: 0.10
For n_clusters = 26
Silhouette score: 0.10
For n_clusters = 28
Silhouette score: 0.12
For n_clusters = 30
Silhouette score: 0.10
For n_clusters = 32
Silhouette score: 0.11
For n_clusters = 34
Silhouette score: 0.11
For n_clusters = 36
Silhouette score: 0.11
For n_clusters = 38
Silhouette score: 0.11
For n_clusters = 40
Silhouette score: 0.11
For n_clusters = 42
Silhouette score: 0.12
For n_clusters = 44
Silhouette score: 0.12
For n_clusters = 46
Silhouette score: 0.12
For n_clusters 