In [295]:
import pandas as pd
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [296]:
data = pd.read_excel('./data/data.xlsx')
data

Unnamed: 0,Case Number,Summary
0,1249923,An appeal by special leave against the judgme...
1,68170353,Drug and Magic Remedies (Objectionable Advert...
2,779491,Criminal Appeal No. 204 of the Calcutta High ...
3,591481,"Petition Nos. 81, 62, 63 & 3 of 1959 was file..."
4,1034384,"Criminal Appeals Nos. 36-D, 37-D and 52-D of ..."
...,...,...
9196,1057848,The appellant was working as Officiating Secr...
9197,550254,The appellants and their deceased father S. B...
9198,1073551,The disputed land belonged originally to one ...
9199,998337,Jaimal was the owner of the suit properties ....



### Data cleaning :

In [297]:
remove_str = " Take notes as you read a judgment using our Virtual Legal Assistant"

In [298]:
drop_indices = data[data['Summary'].apply(lambda x : remove_str in x)].index
len(drop_indices)

382

In [299]:
data.drop(index=drop_indices,inplace=True)

In [300]:
summaries = data['Summary'].tolist()
summaries

[' An appeal by special leave against the judgment and order of the Punjab High Court dated May 17, 1956 . The appellant was a Naib-Tehsildar at Ferozepur Jhirka in the district of Gurgaon . The allegation against the appellant was that during his tours in the several villages, his son who was a Director of the Starline Pictures Ltd., a film company of Delhi, accompanied him and at the time of mutations the appellant asked the parties whose mutation he was attesting to purchase shares in his son .',
 ' Drug and Magic Remedies (Objectionable Advertisement) Act (XXI of 1954) was passed on April 30, 1954, came into force on April 1, 1955 . Petitioners in Writ Petition No. 81 of 1959, the Hamdard Dawakhana (Wakf) and another, alleged they experienced difficulty in the matter of publicity for their products .',
 ' Criminal Appeal No. 204 of the Calcutta High Court . Appeal by special leave from the judgment and order dated June 23, 1959 . The case against Kangsari Haldar and Jogendra Nath G

In [301]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [314]:
extra_stop_words = ['appeal','petition','civil','appellant','judgment']

### Adding 'court' and 'act' in stopword does not consider tenant and rent in topwords
### Sometimes, for the given examples, the algorithm is assigning wrong clusters during inference. This is because the other cluster might have more weightage for the top keyword like 'court'. This is happening because the other cluster has more data points whereas the target cluster has very less as compared to the other one (Check the number of times top words is present in each of them)
### extra_stop_words = ['appeal','petition','civil','appellant','judgment'] and num_clusters=10
### If we add 'court' to stop words, it is not considering 'tenant' and 'rent' as top words

In [315]:
dler = nltk.downloader.Downloader()

dler._update_index()
dler.download('all')
# Define the preprocessing steps
stop_words = set(stopwords.words('english')+extra_stop_words)

lemmatizer = WordNetLemmatizer()

def preprocess_summary(summary):
    # Tokenize the summary
    words = word_tokenize(summary.lower())

    # Remove stop words and punctuation
    words = [w for w in words if w.isalpha() and w not in stop_words]

    # Lemmatize the words
    words = [lemmatizer.lemmatize(w) for w in words]

    # Join the words back into a string
    preprocessed_summary = ' '.join(words)
    return preprocessed_summary

# Apply the preprocessing steps to all the summaries
preprocessed_summaries = [preprocess_summary(summary) for summary in summaries]


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]    | 

[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package mte_teip5 to
[nltk_data]    |     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]    |   Package mte_teip5 is already up-to-date!
[nltk_data]    | Downloading package mwa_ppdb to
[nltk_data]    |     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]    |   Package mwa_ppdb is already up-to-date!
[nltk_data]    | Downloading package names to
[nltk_data]    |     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Downloading package nombank.1.0 to
[nltk_data]    |     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]    |   Package nombank.1.0 is already up-to-date!
[nltk_data]    | Downloading package nonbreaking_prefixes to
[nltk_data]    |     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]    |   Package nonbreaking_prefixes is already up-to-date!
[nltk_data]    | Downloading package nps_chat

[nltk_data]    |   Package treebank is already up-to-date!
[nltk_data]    | Downloading package twitter_samples to
[nltk_data]    |     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]    |   Package twitter_samples is already up-to-date!
[nltk_data]    | Downloading package udhr to
[nltk_data]    |     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]    |   Package udhr is already up-to-date!
[nltk_data]    | Downloading package udhr2 to
[nltk_data]    |     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]    |   Package udhr2 is already up-to-date!
[nltk_data]    | Downloading package unicode_samples to
[nltk_data]    |     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]    |   Package unicode_samples is already up-to-date!
[nltk_data]    | Downloading package universal_tagset to
[nltk_data]    |     C:\Users\nsimh\AppData\Roaming\nltk_data...
[nltk_data]    |   Package universal_tagset is already up-to-date!
[nltk_data]    | Downloading package universal

In [316]:
# preprocessed_summaries


Try with different max_features

Things we can play around with  - max_features for vectorizing, different vectorization methods and different grouping methods

Find the right k for K-means clustering

In [317]:
NO_OF_CLUSTERS = 10
all_stop_words= stopwords.words('english')+extra_stop_words

In [318]:
#Hyperparamater max_features caps the dimentsion
#for 3680 datasamples - the dim of each sample is 9961
#as no of samples increases for training, the dim of each also increases

### ************************************ START OF JUGAD

In [319]:
new_doc = "Mr.Anand commmited crime that led to the disqualification of him where he murdered him in broad day light for which he was convicted and sentenced"
new_doc = "He and his brother are having dispute over the land they own with the workmen and labours"
new_doc = " The impugned Acts infringe the fundamental rights under Art. (1) of the petitioners who are butchers, gut merchants, curers and cattle dealers to carry on their respective trades . They also contravene the religious practice of the Petitioners' community to sacrifice cows, cattle and sheep and goats  "

In [347]:
new_doc = "He was arrested on charges of corruption"
new_doc = "The industrial men harrased the labour so an act was passed"
new_doc = "He did not pay tax"
new_doc = "Landlord high tenant rent "
new_doc = "The landlord sued the tenant for not paying rent on time"


In [348]:
new_doc_preprocess = preprocess_summary(new_doc)
preprocessed_summaries.append(new_doc_preprocess)
new_doc_preprocess

'landlord sued tenant paying rent time'

In [349]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Vectorize the preprocessed summaries

vectorized_summaries = vectorizer.fit_transform(preprocessed_summaries)

kmeans = KMeans(n_clusters=NO_OF_CLUSTERS, random_state=42)

# Apply the K-Means algorithm to the vectorized summaries
cluster_labels = kmeans.fit_predict(vectorized_summaries)

# Evaluate the clustering results using the silhouette score
silhouette_avg = silhouette_score(vectorized_summaries, cluster_labels)
print(f'Silhouette score: {silhouette_avg:.2f} ')



Silhouette score: 0.01 


In [350]:
vectorized_summaries

<8820x17033 sparse matrix of type '<class 'numpy.float64'>'
	with 213199 stored elements in Compressed Sparse Row format>

In [351]:
print(cluster_labels[-1])
cluster_labels=cluster_labels[:-1]
preprocessed_summaries = preprocessed_summaries[:-1]

4


 ### **************************************** END OF JUGAD

In [325]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Vectorize the preprocessed summaries

vectorized_summaries = vectorizer.fit_transform(preprocessed_summaries)

kmeans = KMeans(n_clusters=NO_OF_CLUSTERS, random_state=42)

# Apply the K-Means algorithm to the vectorized summaries
cluster_labels = kmeans.fit_predict(vectorized_summaries)

# Evaluate the clustering results using the silhouette score
silhouette_avg = silhouette_score(vectorized_summaries, cluster_labels)
print(f'Silhouette score: {silhouette_avg:.2f} ')



Silhouette score: 0.01 


In [326]:
vectorized_summaries[0]

<1x17033 sparse matrix of type '<class 'numpy.float64'>'
	with 32 stored elements in Compressed Sparse Row format>

In [327]:
new_document='Mr.Anand commmited crime that led to the disqualification of him where he murdered him in broad day light'

In [328]:
new_doc_preprocess = preprocess_summary(new_document)
new_doc_preprocess

'commmited crime led disqualification murdered broad day light'

In [329]:
# vectorizer = TfidfVectorizer(stop_words="english")
new_document_vectorized = vectorizer.fit_transform([new_doc_preprocess])
new_document_vectorized

<1x8 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [330]:
# # vectorizer = TfidfVectorizer(stop_words="english")
# new_document_vectorized = vectorizer.fit_transform([new_doc_preprocess])

if new_document_vectorized.shape[1] < kmeans.cluster_centers_.shape[1]:
    new_document_vectorized = np.hstack((new_document_vectorized.toarray(), np.zeros((new_document_vectorized.shape[0], kmeans.cluster_centers_.shape[1]-new_document_vectorized.shape[1]))))

kmeans.predict(new_document_vectorized)

array([0])

In [346]:
data[data['cluster']==1]

Unnamed: 0,Case Number,Summary,cluster,PreProSum
2,779491,Criminal Appeal No. 204 of the Calcutta High ...,1,criminal calcutta high court special leave ord...
9,1092568,Civil Appeal No. 230 of 1959 of 1959 was Civi...,1,order dated october punjab high court writ cou...
27,38328,The High Court held that the rule was ultra v...,1,high court held rule ultra vires offending art...
28,1222631,The Bombay High Court granted a certificate f...,1,bombay high court granted certificate high cou...
38,491029,Civil Appeal No. 342 of 1959: Appeal by speci...,1,special leave decree dated calcutta high court...
...,...,...,...,...
9174,1486515,Civil Appeal No. 24 of 1967: Civil Appeal of ...,1,special leave calcutta high court answering fo...
9181,1548537,Civil Appeal No. 646 of 1967: Civil Appeal of...,1,overturned allahabad high court decision set a...
9182,1179086,An appeal was heard by the Bombay High Court ...,1,heard bombay high court first laxmidas raghuna...
9190,1600971,The State of Assam and Nagaland High Court in...,1,state assam nagaland high court rule rameshwar...


In [331]:
data['cluster']=cluster_labels
data['PreProSum']=preprocessed_summaries
for sum in data[data['cluster']==1]['Summary']:
    print(sum)
    print('-----------------')

 Criminal Appeal No. 204 of the Calcutta High Court . Appeal by special leave from the judgment and order dated June 23, 1959 . The case against Kangsari Haldar and Jogendra Nath Guria (hereinafter called the appellants) was heard by Mitter and Bhattacharya, JJ. ; but there was a difference of opinion between the learned judges .
-----------------
 Civil Appeal No. 230 of 1959 of 1959 was Civil Appeal from the judgment and order dated October 7, 1955, of the Punjab High Court, in Civil Writ Petition No. 322 of 1953 . The Judgment of the Court was delivered by Shardar Kapur Singh (who will hereinafter be referred to as the appellant) The appellant was admitted by the Secretary of the State to the State in Council to the Indian Civil Service .
-----------------
 The High Court held that the rule was ultra vires as offending Art.	 14 of the Â  Â  Â  Â  Â  Â  Â  Â Constitution . The parties filed an application for a certificate under Art.298(1) of the High Court . The application was reje

In [332]:

# # Fit the K-Means object on the vectorized summaries to cluster them
# cluster_labels = kmeans.fit_predict(vectorized_summaries)

# Get the indices of the documents belonging to each cluster
cluster_indices = [np.where(cluster_labels == i)[0] for i in range(NO_OF_CLUSTERS)]
# Loop over each cluster
for i, indices in enumerate(cluster_indices):
    # Get the documents belonging to the current cluster
    cluster_documents = [summaries[index] for index in indices]
    
    # Create a TF-IDF vectorizer and fit it on the documents belonging to the current cluster
    vectorizer = TfidfVectorizer(stop_words=all_stop_words)
    tfidf_matrix = vectorizer.fit_transform(cluster_documents)
    
    # Get the sum of the TF-IDF scores for each word across all the documents belonging to the current cluster
    word_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    sorted_word_indices = np.argsort(word_scores)[::-1]
    
    # Print the top 10 words for the current cluster
    print(f"\nTop 10 words for Cluster {i}:")
    feature_names = np.array(vectorizer.get_feature_names_out())
    for j in range(10):
        print(f"{feature_names[sorted_word_indices[j]]} ({word_scores[sorted_word_indices[j]]:.2f})")



Top 10 words for Cluster 0:
court (108.48)
high (93.72)
act (70.29)
respondent (54.72)
state (53.20)
rs (51.85)
case (51.63)
company (49.26)
order (49.16)
two (39.56)

Top 10 words for Cluster 1:
court (93.74)
high (79.92)
order (57.19)
special (49.05)
leave (47.29)
dated (44.10)
delivered (39.29)
punjab (33.47)
bombay (32.90)
act (30.51)

Top 10 words for Cluster 2:
appeals (31.47)
court (24.59)
high (21.37)
nos (21.13)
dated (15.41)
order (13.77)
delivered (12.71)
special (12.42)
leave (11.04)
tax (11.00)

Top 10 words for Cluster 3:
petitioner (24.45)
act (23.15)
order (22.95)
detention (22.40)
constitution (21.20)
india (20.43)
32 (18.44)
court (15.66)
art (15.51)
article (15.40)

Top 10 words for Cluster 4:
land (30.10)
act (22.52)
court (22.11)
high (20.06)
tenant (16.36)
lands (15.99)
respondent (14.43)
rent (12.88)
bombay (11.48)
rs (11.28)

Top 10 words for Cluster 5:
election (20.43)
court (13.82)
high (12.97)
respondent (10.58)
constituency (8.49)
declared (7.91)
assembly (

### ----------------------------------------------------------------------------

In [386]:
NO_OF_CLUSTERS=5

In [388]:
vectorised_data = [] #Vector Data Matrix
vectorizer = TfidfVectorizer()
maxL=0
for summary in preprocessed_summaries:
    new_document_vectorized = vectorizer.fit_transform([summary])
    new_document_vectorized = np.hstack((new_document_vectorized.toarray(), np.zeros((new_document_vectorized.shape[0], 52-new_document_vectorized.shape[1]))))
    vectorised_data.append(new_document_vectorized[0])


In [389]:
vectorised_data

[array([0.16222142, 0.16222142, 0.16222142, 0.16222142, 0.16222142,
        0.16222142, 0.16222142, 0.16222142, 0.16222142, 0.16222142,
        0.16222142, 0.16222142, 0.16222142, 0.16222142, 0.16222142,
        0.16222142, 0.16222142, 0.32444284, 0.16222142, 0.16222142,
        0.16222142, 0.16222142, 0.16222142, 0.16222142, 0.16222142,
        0.32444284, 0.16222142, 0.16222142, 0.16222142, 0.16222142,
        0.16222142, 0.16222142, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ]),
 array([0.19611614, 0.19611614, 0.19611614, 0.19611614, 0.39223227,
        0.19611614, 0.19611614, 0.19611614, 0.19611614, 0.19611614,
        0.19611614, 0.19611614, 0.19611614, 0.19611614, 0.19611614,
        0.19611614, 0.19611614, 0.19611614, 0.19611614, 0.19611614,
        0.1961

In [390]:
kmeans = KMeans(n_clusters=NO_OF_CLUSTERS, random_state=42)

# Apply the K-Means algorithm to the vectorized summaries
cluster_labels = kmeans.fit_predict(vectorised_data)
cluster_labels



array([3, 1, 4, ..., 3, 4, 3])

In [364]:
data['cluster']=cluster_labels
data['PreProSum']=preprocessed_summaries
for sum in data[data['cluster']==1]['PreProSum']:
    print(sum)
    print('-----------------')

drug magic remedy objectionable advertisement act xxi passed april came force april petitioner writ hamdard dawakhana wakf another alleged experienced difficulty matter publicity product
-----------------
no filed art constitution india petitioner alleged various action taken authority violated fundamental right act passed april came force april along rule made thereunder
-----------------
emden hereinafter called convicted indian penal code prevention corruption act case accepted bribe r sarat chandra shukla january prosecution defence led evidence support respective sentence
-----------------
take note read using virtual legal assistant get email alert whenever new match query query alert service try premium member service free one month
-----------------
anant ramachandra karve convicted murder laxmibai lagu lagu sentenced life prison administering poison caused death case confirmed bombay high court separate
-----------------
allahabad high court lucknow bench lucknow lucknow broug

In [371]:
new_document="high court reversed decree subordinate judge motihari dated march suit filed present appellant declaration title odd bighas ryotikasht land possession thereof defendant"


In [372]:
preprocess_summary_new_doc = preprocess_summary(new_document)
new_document_vectorized = vectorizer.fit_transform([preprocess_summary_new_doc])
new_document_vectorized = np.hstack((new_document_vectorized.toarray(), np.zeros((new_document_vectorized.shape[0], 52-new_document_vectorized.shape[1]))))

In [373]:
kmeans.predict(new_document_vectorized)

array([1])

In [351]:
new_document_vectorized.shape[1]

5000

### GET TOP words in each cluster

In [352]:

# # Fit the K-Means object on the vectorized summaries to cluster them
# cluster_labels = kmeans.fit_predict(vectorized_summaries)

# Get the indices of the documents belonging to each cluster
cluster_indices = [np.where(cluster_labels == i)[0] for i in range(NO_OF_CLUSTERS)]

# Loop over each cluster
for i, indices in enumerate(cluster_indices):
    # Get the documents belonging to the current cluster
    cluster_documents = [preprocessed_summaries[index] for index in indices]
    
    # Create a TF-IDF vectorizer and fit it on the documents belonging to the current cluster
    vectorizer = TfidfVectorizer(stop_words=all_stop_words)
    tfidf_matrix = vectorizer.fit_transform(cluster_documents)
    
    # Get the sum of the TF-IDF scores for each word across all the documents belonging to the current cluster
    word_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    sorted_word_indices = np.argsort(word_scores)[::-1]
    
    # Print the top 10 words for the current cluster
    print(f"\nTop 10 words for Cluster {i+1}:")
    feature_names = np.array(vectorizer.get_feature_names_out())
    for j in range(10):
        print(f"{feature_names[sorted_word_indices[j]]} ({word_scores[sorted_word_indices[j]]:.2f})")



Top 10 words for Cluster 1:
court (39.43)
high (33.97)
delivered (21.08)
dated (17.38)
order (16.92)
leave (15.60)
special (14.89)
act (14.21)
bombay (13.97)
case (12.66)

Top 10 words for Cluster 2:
court (47.49)
service (42.31)
alert (41.21)
query (41.14)
high (40.68)
act (27.71)
order (25.65)
one (24.84)
month (22.58)
assistant (22.37)

Top 10 words for Cluster 3:
court (42.75)
high (36.96)
order (24.07)
act (22.46)
dated (19.94)
special (18.65)
leave (18.49)
india (17.06)
respondent (17.05)
delivered (16.73)

Top 10 words for Cluster 4:
court (24.97)
high (21.35)
respondent (17.71)
act (16.92)
order (16.74)
singh (13.41)
dated (13.33)
two (13.07)
state (12.73)
bombay (11.78)

Top 10 words for Cluster 5:
court (40.00)
high (33.83)
order (24.40)
act (24.27)
respondent (21.47)
dated (18.80)
state (17.95)
bombay (16.97)
special (16.77)
year (16.64)


In [212]:
new_doc =" convicted code sentence court high section"

In [256]:
# # Vectorize the new document using the same vectorizer used for the original documents
# new_doc_vectorized = vectorizer.transform([new_doc])

# # Use the trained K-Means model to predict the cluster label for the new document
# predicted_cluster_label = kmeans.predict(new_doc_vectorized)

# # Assign the new document to the predicted cluster
# new_doc_cluster = cluster_documents[predicted_cluster_label[0]]
# new_doc_cluster.append(new_doc)

# Load the trained KMeans model and vectorizer
# with open('kmeans_model.pkl', 'rb') as f:
#     kmeans_model = pickle.load(f)
# with open('vectorizer.pkl', 'rb') as f:
#     vectorizer = pickle.load(f)

# Load the new document
new_document = new_doc

# Vectorize the new document
new_document_vectorized = vectorizer.transform([new_document])

# Ensure that the new document has the same number of features as the training data
if new_document_vectorized.shape[1] < kmeans.cluster_centers_.shape[1]:
    new_document_vectorized = np.hstack((new_document_vectorized.toarray(), np.zeros((new_document_vectorized.shape[0], kmeans.cluster_centers_.shape[1]-new_document_vectorized.shape[1]))))

# Predict the cluster label of the new document
cluster_label = kmeans.predict(new_document_vectorized)

print('The new document belongs to cluster:', cluster_label[0])


The new document belongs to cluster: 0


In [308]:
vectorized_summaries

<3680x9961 sparse matrix of type '<class 'numpy.float64'>'
	with 87257 stored elements in Compressed Sparse Row format>

In [309]:
for i in range(2,100):
# Define the K-Means clustering algorithm
    kmeans = KMeans(n_clusters=i, random_state=42)

    # Apply the K-Means algorithm to the vectorized summaries
    cluster_labels = kmeans.fit_predict(vectorised_data)

    # Evaluate the clustering results using the silhouette score
    silhouette_avg = silhouette_score(vectorised_data, cluster_labels)
    print(f'Silhouette score: {silhouette_avg:.2f} | no of clusters : {i}')




Silhouette score: 0.21 | no of clusters : 2




Silhouette score: 0.19 | no of clusters : 3




Silhouette score: 0.17 | no of clusters : 4




Silhouette score: 0.15 | no of clusters : 5




Silhouette score: 0.14 | no of clusters : 6




Silhouette score: 0.13 | no of clusters : 7




Silhouette score: 0.12 | no of clusters : 8




Silhouette score: 0.12 | no of clusters : 9




Silhouette score: 0.12 | no of clusters : 10




Silhouette score: 0.12 | no of clusters : 11




Silhouette score: 0.12 | no of clusters : 12




Silhouette score: 0.12 | no of clusters : 13




Silhouette score: 0.11 | no of clusters : 14




Silhouette score: 0.11 | no of clusters : 15




Silhouette score: 0.11 | no of clusters : 16




Silhouette score: 0.11 | no of clusters : 17




Silhouette score: 0.11 | no of clusters : 18




Silhouette score: 0.11 | no of clusters : 19




Silhouette score: 0.11 | no of clusters : 20




Silhouette score: 0.11 | no of clusters : 21




Silhouette score: 0.11 | no of clusters : 22




Silhouette score: 0.11 | no of clusters : 23




Silhouette score: 0.11 | no of clusters : 24




Silhouette score: 0.12 | no of clusters : 25




Silhouette score: 0.11 | no of clusters : 26




Silhouette score: 0.11 | no of clusters : 27




KeyboardInterrupt: 

In [200]:
for i in range(2,20):
# Define the K-Means clustering algorithm
    kmeans = KMeans(n_clusters=i, random_state=42)

    # Apply the K-Means algorithm to the vectorized summaries
    cluster_labels = kmeans.fit_predict(vectorized_summaries)

    # Evaluate the clustering results using the silhouette score
    silhouette_avg = silhouette_score(vectorized_summaries, cluster_labels)
    print(f'Silhouette score: {silhouette_avg:.2f} | no of clusters : {i}')




Silhouette score: 0.04 | no of clusters : 2




Silhouette score: 0.03 | no of clusters : 3




Silhouette score: 0.03 | no of clusters : 4




Silhouette score: 0.03 | no of clusters : 5




Silhouette score: 0.03 | no of clusters : 6




Silhouette score: 0.04 | no of clusters : 7




Silhouette score: 0.04 | no of clusters : 8




Silhouette score: 0.04 | no of clusters : 9




Silhouette score: 0.04 | no of clusters : 10




Silhouette score: 0.04 | no of clusters : 11




Silhouette score: 0.04 | no of clusters : 12




Silhouette score: 0.04 | no of clusters : 13




Silhouette score: 0.04 | no of clusters : 14




Silhouette score: 0.04 | no of clusters : 15




Silhouette score: 0.04 | no of clusters : 16




Silhouette score: 0.04 | no of clusters : 17




Silhouette score: 0.04 | no of clusters : 18




Silhouette score: 0.04 | no of clusters : 19


In [None]:
# Print the top key words for each cluster
n_top_keywords = 10
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer

In [None]:
data['cluster']=cluster_labels
data['PreProSum']=preprocessed_summaries

In [None]:
data

In [None]:
data[data['cluster']==4]['Summary'][:10]

In [None]:
for sum in data[data['cluster']==4]['PreProSum']:
    print(sum)
    print('-----------------')

In [None]:
 import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import string

# Define the vectorizer
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))

# Vectorize the documents
X = vectorizer.fit_transform(data)

# Cluster the documents
n_clusters = 5
km = KMeans(n_clusters=n_clusters, random_state=0)
km.fit(X)

# Get the cluster labels for each document
labels = km.labels_

# Get the text for each cluster
cluster_text = ['' for _ in range(n_clusters)]
for i, text in enumerate(data):
    cluster_text[labels[i]] += ' ' + text

# Clean the text and compute tf-idf scores
cleaned_text = [' '.join(word for word in text.lower().split() if word not in stopwords.words('english') and word not in string.punctuation) for text in cluster_text]
X_cluster = vectorizer.transform(cleaned_text)
tfidf_scores = np.asarray(X_cluster.mean(axis=0)).ravel()

# Print the top 10 keywords for each cluster
for i in range(n_clusters):
    start = i * vectorizer.get_feature_names()
    end = (i + 1) * vectorizer.get_feature_names()
    indices = tfidf_scores[start:end].argsort()[::-1][:10]
    keywords = [vectorizer.get_feature_names()[i] for i in indices]
    print(f'Cluster {i}: {keywords}')


In [None]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Define the number of clusters
num_clusters = 5

# Create a K-Means clustering object
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

# Fit the K-Means object on the vectorized summaries to cluster them
cluster_labels = kmeans.fit_predict(vectorized_summaries)

# Get the indices of the documents belonging to each cluster
cluster_indices = [np.where(cluster_labels == i)[0] for i in range(num_clusters)]

# Loop over each cluster
for i, indices in enumerate(cluster_indices):
    # Get the documents belonging to the current cluster
    cluster_documents = [summaries[index] for index in indices]
    
    # Create a TF-IDF vectorizer and fit it on the documents belonging to the current cluster
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(cluster_documents)
    
    # Get the sum of the TF-IDF scores for each word across all the documents belonging to the current cluster
    word_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    sorted_word_indices = np.argsort(word_scores)[::-1]
    
    # Print the top 10 words for the current cluster
    print(f"\nTop 10 words for Cluster {i+1}:")
    feature_names = np.array(vectorizer.get_feature_names())
    for j in range(10):
        print(f"{feature_names[sorted_word_indices[j]]} ({word_scores[sorted_word_indices[j]]:.2f})")


ISSUES : 
1) All getting into same cluster due to dim mismatch

## ----------------------------------- 

## -------------------------------------------------------------- WORD2VEC 

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize


In [None]:
documents = summaries
processed_docs = [word_tokenize(doc.lower()) for doc in documents]


In [None]:
model = Word2Vec(processed_docs, window=5, min_count=1, workers=4)


In [None]:
# Get the vector representation of a word
vector = model.wv['word']

# Get the vector representation of a document summary
doc_vector = model.infer_vector(word_tokenize('Document summary'))


In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# Create a list of tagged documents
tagged_docs = [TaggedDocument(words=word_tokenize(doc.lower()), tags=[i]) for i, doc in enumerate(summaries)]

# Train the Doc2Vec model
model = Doc2Vec(tagged_docs, vector_size=100, window=5, min_count=1, workers=4)

# Infer a vector representation for a new document
new_doc_vector = model.infer_vector(word_tokenize('Document summary'))

# Find similar documents to the new document
similar_docs = model.docvecs.most_similar([new_doc_vector])


In [None]:
similar_docs


In [None]:
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import numpy as np

In [None]:
def preprocess_text(text):
    # your preprocessing code here
    return text


In [None]:
# model = Word2Vec.load('path/to/word2vec/model')

In [None]:
def get_document_vector(document):
    # preprocess the document
    preprocessed_doc = preprocess_text(document)
    # split the preprocessed document into words
    words = preprocessed_doc.split()
    # remove words that are not in the Word2Vec model's vocabulary
    words = [word for word in words if word in model.wv.vocab]
    # calculate the average of all word vectors in the document
    if len(words) > 0:
        vector = np.mean(model[words], axis=0)
    else:
        vector = np.zeros((model.vector_size,))
    return vector


In [None]:
def perform_clustering(documents, num_clusters):
    # get the vector representation of each document
    document_vectors = [get_document_vector(doc) for doc in documents]
    # perform K-Means clustering
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(document_vectors)
    # return the cluster labels for each document
    return kmeans.labels_


In [None]:
documents = summaries
num_clusters = 2
cluster_labels = perform_clustering(documents, num_clusters)


In [None]:
from gensim.models import Word2Vec,Doc2Vec
from nltk.tokenize import word_tokenize


In [None]:
documents = summaries
processed_docs = [word_tokenize(doc.lower()) for doc in documents]


In [None]:
model = Word2Vec(processed_docs, vector_size=100, window=5, min_count=1, workers=4)


In [None]:
# Get the vector representation of a word
vector = model.wv['word']

# Get the vector representation of a document summary
doc_vector = model.infer_vector(word_tokenize('Document summary'))


In [None]:
# Find similar words to 'word'
similar_words = model.wv.most_similar('word')

# Find similar document summaries to 'Document summary 1'
similar_docs = model.docvecs.most_similar([model.infer_vector(word_tokenize('Document summary 1.'))])


In [None]:
!word2vec --version