## Import the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv('headlines.csv', header=None)
df

In [None]:
print(df.iloc[1,:].values[0])

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

In [None]:
# English stopwords, kept in a set: clean_txt tests membership once per token,
# so constant-time lookups matter when the whole corpus is vectorized.
swords = set(stopwords.words('english'))
# Single lemmatizer instance shared by every lemmatize() call.
wnl = WordNetLemmatizer()

In [None]:
def lemmatize(word):
    """Return the WordNet lemma of a single word.

    The word is tagged on its own with ``pos_tag``; the first letter of the
    resulting tag selects the part of speech handed to the lemmatizer
    (N -> noun, V -> verb, R -> adverb, J -> adjective). Any other tag falls
    back to noun.
    """
    tag_name = pos_tag([word])[0][1]
    pos_by_initial = {'N': 'n', 'V': 'v', 'R': 'r', 'J': 'a'}
    wordnet_pos = pos_by_initial.get(tag_name[:1], 'n')
    return wnl.lemmatize(word, pos=wordnet_pos)

In [None]:
# Smoke-test lemmatize on a single word; the cell displays the returned lemma.
lemmatize('the')

In [None]:
def clean_txt(sent):
    """Turn a raw sentence into a list of cleaned, lemmatized tokens.

    Pipeline: tokenize -> keep only alphanumeric tokens -> lowercase and drop
    stopwords (``swords``) -> lemmatize each remaining token.
    """
    cleaned = []
    for raw_token in word_tokenize(sent):
        if not raw_token.isalnum():
            continue  # punctuation and other non-alphanumeric tokens
        lowered = raw_token.lower()
        if lowered in swords:
            continue  # stopword
        cleaned.append(lemmatize(lowered))
    return cleaned

In [None]:
# Example sentence containing punctuation, stopwords and mixed case.
sent = 'Hello friends! How are you? I like Python Programming.'

In [None]:
# Run the cleaning pipeline on the example; the cell displays the token list.
clean_txt(sent)

## TF*IDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Passing a callable as `analyzer` makes the vectorizer use clean_txt to turn
# each raw document into its list of features.
tf = TfidfVectorizer(analyzer=clean_txt)

In [None]:
# Learn vocabulary and IDF weights from column 0 of df and vectorize it.
X_new = tf.fit_transform(df[0])

In [None]:
# Display the resulting document-term matrix.
X_new

## Create the clusters

In [None]:
from sklearn.cluster import KMeans

In [None]:
km = KMeans(random_state=0)

In [None]:
km.fit_predict(X_new)

In [None]:
set(km.labels_)

In [None]:
km.inertia_   # Sum of squared errors 

## Elbow Method

In [None]:
sse = []
for k in range(1,16):
    km = KMeans(n_clusters=k,random_state=0)
    km.fit_predict(X_new)
    sse.append(km.inertia_)

In [None]:
plt.xlabel('Value of K')
plt.ylabel('SSE')
plt.grid()
plt.xticks(range(1, 16))
plt.plot(range(1,16),sse,marker='o',color='r')

In [None]:
km = KMeans(n_clusters=5, random_state=0, n_init='auto')

In [None]:
labels = km.fit_predict(X_new)

In [None]:
labels

## Separate the clusters

In [None]:
zero = df[labels == 0]
one = df[labels == 1]
two = df[labels == 2]
three = df[labels == 3]
four = df[labels == 4]

In [None]:
zero

In [None]:
# Print the number of rows that landed in each cluster.
for cluster_id, cluster_df in enumerate((zero, one, two, three, four)):
    print(f"Cluster-{cluster_id}:", len(cluster_df))

## Wordcloud

### First Cluster

In [None]:
from wordcloud import WordCloud

In [None]:
# Word cloud built from all headlines of cluster `one` joined into one string.
first_text = ' '.join(one[0])
wc = WordCloud().generate(first_text)
plt.figure(figsize=(16, 9))
plt.title('First Cluster')
plt.imshow(wc);

In [None]:
swords = stopwords.words('english')

In [None]:
from nltk import FreqDist
new = ' '.join(one[0])
clean = [x for x in word_tokenize(new) if x.lower() not in swords and x.isalpha()]
freq = FreqDist(clean)
freq.most_common(10)

### Second Cluster

In [None]:
wc = WordCloud().generate(' '.join(two[0]))
plt.figure(figsize=(16,9))
plt.title('First Cluster')
plt.imshow(wc);

In [None]:
new = ' '.join(two[0])
clean = [x for x in word_tokenize(new) if x.lower() not in swords and x.isalpha()]
freq = FreqDist(clean)
freq.most_common(10)

### Zeroth Cluster

In [None]:
wc = WordCloud().generate(' '.join(zero[0]))
plt.figure(figsize=(16,9))
plt.title('First Cluster')
plt.imshow(wc);

In [None]:
new = ' '.join(zero[0])
clean = [x for x in word_tokenize(new) if x.lower() not in swords and x.isalpha()]
freq = FreqDist(clean)
freq.most_common(10)

### Third Cluster

In [None]:
wc = WordCloud().generate(' '.join(three[0]))
plt.figure(figsize=(16,9))
plt.title('First Cluster')
plt.imshow(wc);

In [None]:
new = ' '.join(three[0])
clean = [x for x in word_tokenize(new) if x.lower() not in swords and x.isalpha()]
freq = FreqDist(clean)
freq.most_common(10)

### Fourth Cluster

In [None]:
wc = WordCloud().generate(' '.join(four[0]))
plt.figure(figsize=(16,9))
plt.title('First Cluster')
plt.imshow(wc);

In [None]:
new = ' '.join(four[0])
clean = [x for x in word_tokenize(new) if x.lower() not in swords and x.isalpha()]
freq = FreqDist(clean)
freq.most_common(10)