In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the preprocessed article dataset (path is relative to the notebook's
# working directory) and preview the first rows as the cell's rich output.
csv_path = 'data/dataset1_preprocessing.csv'
dataset_1 = pd.read_csv(csv_path)
dataset_1.head()

Unnamed: 0,title,description,content,length,article
0,NTSB says Autopilot engaged in 2018 California...,The National Transportation Safety Board said ...,WASHINGTON (Reuters) - The National Transporta...,578,"['ntsb', 'says', 'autopilot', 'engaged', 'cali..."
1,Unemployment falls to post-crash low of 5.2%,Latest monthly figures reflect continued growt...,The States jobless rate fell to 5.2 per cent l...,387,"['unemployment', 'falls', 'post', 'crash', 'low']"
2,"Louise Kennedy AW2019: Long coats, sparkling t...",Autumn-winter collection features designer’s g...,Louise Kennedy is showing off her autumn-winte...,432,"['louise', 'kennedy', 'aw2019', 'long', 'coats..."
3,North Korean footballer Han joins Italian gian...,Han is the first North Korean player in the Se...,"Han Kwang Song, the first North Korean footbal...",446,"['north', 'korean', 'footballer', 'han', 'join..."
4,'This Tender Land' is an affecting story about...,"""This Tender Land"" by William Kent Krueger is ...","""This Tender Land: a Novel"" (Atria Books), by ...",500,"['tender', 'land', 'affecting', 'story', 'grow..."


<h3>Vectorize data with TF-IDF</h3>

In [3]:
# Build TF-IDF features from the `article` column.
#   max_df=0.95       -> drop terms that occur in more than 95% of documents
#   min_df=5          -> drop terms that occur in fewer than 5 documents
#   ngram_range=(1,2) -> use unigrams and bigrams
#   max_features      -> cap the vocabulary at the 15000 most frequent terms
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=5,
    ngram_range=(1, 2),
    max_features=15000,
)

# NOTE(review): `article` appears to hold stringified token lists (see the
# preview above); the vectorizer re-tokenizes those strings -- confirm intended.
X = tfidf_vectorizer.fit_transform(dataset_1.article.to_list())

# Densify the sparse TF-IDF matrix; downstream cells consume `model` as an array.
model = X.toarray()

# Report (number of documents, number of TF-IDF features).
n_docs, n_terms = model.shape
print(n_docs, n_terms)

8804 3588


In [4]:
from sklearn.decomposition import PCA

# Project the dense TF-IDF matrix onto its first two principal components.
# `fit` returns the estimator itself, so fitting in a separate step is
# equivalent to the chained form.
pca = PCA(n_components=2)
pca.fit(model)

# 2-D coordinates, one row per document.
datapoint = pca.transform(model)

<h3>Clustering</h3>

In [5]:
# Cluster the TF-IDF vectors into 4 groups with mini-batch k-means.
# A fixed `random_state` makes the cluster assignment reproducible across
# "Restart Kernel -> Run All"; without it every run can yield different labels.
kmeans_model = MiniBatchKMeans(
    n_clusters=4,
    init='k-means++',
    max_iter=100,
    random_state=42,
)

# NOTE(review): `fit` returns the estimator itself, so this rebinds `X` (the
# sparse TF-IDF matrix from the vectorization cell) to the fitted model. The
# binding is kept for compatibility with any later cell that relies on it.
X = kmeans_model.fit(model)

# Cluster index assigned to each document, as a plain Python list.
labels = kmeans_model.labels_.tolist()

In [6]:
# Assign each document to its nearest centroid using the model that was
# already fitted in the previous cell. Calling `fit_predict` here would re-train
# the model from a new initialization, which can change or permute the cluster
# ids relative to `labels` and overwrite `kmeans_model.labels_`.
kmeans_prediction = kmeans_model.predict(model)

<h3>Elbow method for finding the optimal k in k-means</h3>

In [7]:
def calculate_WSS(points, kmax, random_state=None):
    """Return the within-cluster sum of squared errors (WSS) for k = 1..kmax.

    For every k a MiniBatchKMeans model is fitted on `points`, each point is
    assigned to a cluster with `predict`, and the squared Euclidean distances
    between the points and their assigned centroids are summed.

    The distance is computed over *all* feature dimensions. (The previous
    implementation only used the first two coordinates, which under-counted
    the error for any input with more than two features; results for 2-D
    input are unchanged.)

    Parameters
    ----------
    points : array-like of shape (n_samples, n_features)
        Data to cluster; converted with `numpy.asarray`.
    kmax : int
        Largest number of clusters to evaluate. Values below 1 yield `[]`.
    random_state : int or None, optional
        Forwarded to MiniBatchKMeans so the curve can be made reproducible.
        The default (None) keeps the previous, unseeded behavior.

    Returns
    -------
    list of float
        `sse[k - 1]` is the WSS obtained with k clusters.
    """
    # Local import: numpy is not part of the notebook's visible import cell.
    import numpy as np

    pts = np.asarray(points)
    sse = []
    for k in range(1, kmax + 1):
        kmeans = MiniBatchKMeans(n_clusters=k, random_state=random_state).fit(pts)
        centroids = np.asarray(kmeans.cluster_centers_)
        pred_clusters = np.asarray(kmeans.predict(pts))

        # Accumulate cluster by cluster so the temporary difference array is
        # never larger than a single cluster's members.
        curr_sse = 0.0
        for cluster_id in range(len(centroids)):
            members = pts[pred_clusters == cluster_id]
            if members.size:
                curr_sse += float(((members - centroids[cluster_id]) ** 2).sum())

        sse.append(curr_sse)
    return sse