## Latent Semantic Analysis

### Importing Libraries

Imorting useful libraries for the LSA.

In [21]:
import gensim
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, MiniBatchKMeans
import pickle

### Loading Dataset

Loading our collected news headlines dataset

In [2]:
data = pd.read_csv('C:/Users/Muzammil/Desktop/TA_Assignment_1/TA_Data_Collection/sentences.csv')
data.shape

(3328, 2)

In [3]:
data.tail()

Unnamed: 0,sentence,Text
3323,"India’s forex reserves increase, stand at $562...",MUMBAI: India’s foreign exchange reserves rose...
3324,"Ford to cut 1,100 jobs in Spain","MADRID: U.S. auto maker Ford plans to slash 1,..."
3325,Sri Lankan shares snap 6-day rally as financia...,"Sri Lankan shares closed lower on Friday, afte..."
3326,"NY cocoa to fall to $2,692",SINGAPORE: New York May cocoa is expected to b...
3327,Banks drag FTSE 100 to 1-month low,London’s blue-chip FTSE 100 index fell on Frid...


Calculating TF-IDF Vectorizer on the data.

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words='english',token_pattern=r'(?u)\b[A-Za-z]+\b')
TF_IDF_Vectorizer = tfidf_vectorizer.fit_transform(data.sentence).toarray()

print(TF_IDF_Vectorizer)
print(tfidf_vectorizer.get_feature_names_out())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['aa' 'aafrinish' 'aaj' ... 'zte' 'zu' 'zuckerberg']


Now Applying truncated SVD on the TF-IDF Vector which is calculated above.

In [24]:
from sklearn.decomposition import TruncatedSVD

lsa = TruncatedSVD(n_components=300, n_iter=100)
corpus = lsa.fit_transform(TF_IDF_Vectorizer)
corpus.shape

(3328, 300)

Input random text to check out sentence similarities with the given text.

In [6]:
input_text = tfidf_vectorizer.transform(['pemra bans airing of imran khan’s speeches']).toarray()
input_corpus = lsa.transform(input_text)

In [7]:
np.array(input_corpus).shape

(1, 50)

### Cosine Similarity

Calculating Cosine similarities between the input text embedding and data sentence embeddings.

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

cos_similarities = cosine_similarity(np.array(input_corpus) , np.array(corpus))
print(cos_similarities)
print(cos_similarities.max())
print(cos_similarities[0].argmax())
print(cos_similarities[0].max())
s= cos_similarities[0].argsort()[-5:][::-1]
print(s)

[[ 1.          0.81338713  0.89931404 ...  0.02027968 -0.03407232
  -0.04502325]]
1.0000000000000002
0
1.0000000000000002
[  0 140 648 499 462]


Now printing top 5 results which are similar to the input text.

In [9]:
for i in s:
    print(data.sentence[i])

PEMRA bans airing of Imran Khan’s speeches
PEMRA bans airing of Imran Khan’s speeches
PEMRA bans airing of Imran Khan’s speeches: Gagged
Pemra bans Imran’s speeches, again
Imran Khan moves LHC against Pemra’s ban on broadcast of his speeches


## Clustering 

### K Mean Clustering

In [10]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(corpus)

# calculate the silhouette score to evaluate the quality of clustering
silhouette_avg = silhouette_score(corpus, kmeans.labels_)
print(f"Silhouette score: {silhouette_avg}")

Silhouette score: 0.16970633377069289


In [11]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(corpus)

# calculate the silhouette score to evaluate the quality of clustering
silhouette_avg = silhouette_score(corpus, kmeans.labels_)
print(f"Silhouette score: {silhouette_avg}")

Silhouette score: 0.5297378545765987


In [12]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(corpus)

# calculate the silhouette score to evaluate the quality of clustering
silhouette_avg = silhouette_score(corpus, kmeans.labels_)
print(f"Silhouette score: {silhouette_avg}")

Silhouette score: 0.18454114300211424


### MiniBatch K-Mean Clustering

In [13]:
mb_km = MiniBatchKMeans(n_clusters=5)
mb_km.fit(corpus)

# calculate the silhouette score to evaluate the quality of clustering
silhouette_avg = silhouette_score(corpus, mb_km.labels_)
print(f"Silhouette score: {silhouette_avg}")

Silhouette score: 0.15167897129292182


In [14]:
mb_km = MiniBatchKMeans(n_clusters=3)
mb_km.fit(corpus)

# calculate the silhouette score to evaluate the quality of clustering
silhouette_avg = silhouette_score(corpus, mb_km.labels_)
print(f"Silhouette score: {silhouette_avg}")

Silhouette score: 0.19250160636701083


In [15]:
mb_km = MiniBatchKMeans(n_clusters=4)
mb_km.fit(corpus)

# calculate the silhouette score to evaluate the quality of clustering
silhouette_avg = silhouette_score(corpus, mb_km.labels_)
print(f"Silhouette score: {silhouette_avg}")

Silhouette score: 0.16414340383859527


In [16]:
mb_km = MiniBatchKMeans(n_clusters=5)
mb_km.fit(corpus)

# calculate the silhouette score to evaluate the quality of clustering
silhouette_avg = silhouette_score(corpus, mb_km.labels_)
print(f"Silhouette score: {silhouette_avg}")

Silhouette score: 0.1936359678685607


### Agglomarative Clustering

In [17]:
aglo_cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
aglo_cluster.fit(corpus)

# calculate the silhouette score to evaluate the quality of clustering
silhouette_avg = silhouette_score(corpus, aglo_cluster.labels_)
print(f"Silhouette score: {silhouette_avg}")

Silhouette score: 0.5368323155738473


In [18]:
aglo_cluster = AgglomerativeClustering(n_clusters=4, affinity='euclidean', linkage='ward')
aglo_cluster.fit(corpus)

# calculate the silhouette score to evaluate the quality of clustering
silhouette_avg = silhouette_score(corpus, aglo_cluster.labels_)
print(f"Silhouette score: {silhouette_avg}")

Silhouette score: 0.5476116479984486


In [19]:
aglo_cluster = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')
aglo_cluster.fit(corpus)

# calculate the silhouette score to evaluate the quality of clustering
silhouette_avg = silhouette_score(corpus, aglo_cluster.labels_)
print(f"Silhouette score: {silhouette_avg}")

Silhouette score: 0.5412118520484813


In [20]:
aglo_cluster = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='ward')
aglo_cluster.fit(corpus)

# calculate the silhouette score to evaluate the quality of clustering
silhouette_avg = silhouette_score(corpus, aglo_cluster.labels_)
print(f"Silhouette score: {silhouette_avg}")

Silhouette score: 0.5228862592712163
