In [39]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


In [25]:
# Read the corpus into a list of raw lines (trailing newlines are kept by
# readlines()). The encoding is passed explicitly so the text decodes the same
# way on every platform instead of falling back to the locale default.
# NOTE(review): assumes data.txt is UTF-8 (plain ASCII also decodes) — confirm.
with open('data.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

In [26]:
def pre_process(data):
    """Normalize raw text strings before vectorization.

    Every input string is lower-cased and has its newline characters removed.
    Then every character that is not a word character, whitespace, or a
    period is dropped, and finally a period that directly follows a run of
    word characters is removed (periods with no word before them survive).

    Args:
        data: Iterable of strings to clean.

    Returns:
        A new list with one cleaned string per input string, in input order.
    """
    def _normalize(text):
        # Case-fold and strip newlines first so the regexes see one flat line.
        text = text.lower().replace('\n', "")
        # Drop punctuation except '.', which the next substitution handles.
        text = re.sub(r'[^\w\s.]', '', text)
        # Remove a period only when it trails a word.
        return re.sub(r'\b(\w+)\.', r'\1', text)

    return [_normalize(sentence) for sentence in data]
#         print(sentence)

In [35]:
# Clean every raw line with pre_process(); the result keeps the input order,
# so index i here refers to the same sentence as lines[i].
pre_processed_data = pre_process(lines)
# pre_processed_data

In [31]:
# TF-IDF vectorizer created with no arguments, i.e. the library defaults.
vectorizer = TfidfVectorizer()

In [55]:
# Bare expression: shows the vectorizer's repr as the cell output.
vectorizer

In [36]:
# Fit the vectorizer on the cleaned sentences and transform them in one step;
# the result has one row per sentence and one column per vocabulary term.
tfidf_matrix = vectorizer.fit_transform(pre_processed_data)

In [70]:
# Bare expression: shows the matrix summary (shape, dtype, stored elements).
tfidf_matrix

<30x210 sparse matrix of type '<class 'numpy.float64'>'
	with 290 stored elements in Compressed Sparse Row format>

In [40]:
# Two-cluster k-means; random_state=0 fixes the random seed for repeatability.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases — consider pinning it; confirm against the installed version.
kmeans = KMeans(n_clusters=2, random_state=0)

In [41]:
# Cluster the TF-IDF document vectors; per-document assignments are read
# afterwards from kmeans.labels_.
kmeans.fit(tfidf_matrix)

In [45]:
# for i, sentence in enumerate(pre_processed_data):
#     print(f"Sentence: {sentence}")
#     print(f"Cluster: {kmeans.labels_[i]}")
#     print("-" * 20)





## All sentences in Cluster 0

In [49]:
# Walk each sentence together with its assigned label and report only the
# members of cluster 0.
for sentence, label in zip(pre_processed_data, kmeans.labels_):
    if label != 0:
        continue
    print(f"Sentence: {sentence}")
    print(f"Cluster: {label}")

Sentence: our team excels at developing cuttingedge webbased applications
Cluster: 0
Sentence: we consistently monitor our network infrastructure for potential vulnerabilities
Cluster: 0
Sentence: upgrading our database system significantly enhanced query performance
Cluster: 0
Sentence: our data scientists are experts in extracting actionable trends
Cluster: 0
Sentence: our company is committed to implementing sustainable practices across all operations
Cluster: 0
Sentence: reducing our carbon footprint is a top priority for our organization
Cluster: 0
Sentence: we utilize renewable energy sources to power our facilities
Cluster: 0
Sentence: our company aims to achieve zero waste through a comprehensive recycling program
Cluster: 0
Sentence: reducing water consumption is an important part of our environmental policy
Cluster: 0
Sentence: environmental awareness campaigns aim to educate and inspire action
Cluster: 0
Sentence: partnering with environmental organizations amplifies our sus

## All sentences in Cluster 1

In [54]:
# print('total sentences in cluster 2:')



# Walk each sentence together with its assigned label and report only the
# members of the cluster labelled 1.
for sentence, label in zip(pre_processed_data, kmeans.labels_):
    if label != 1:
        continue
    print(f"Sentence: {sentence}")
    print(f"Cluster: {label}")

Sentence: the software update rolled out smoothly enhancing user experience significantly
Cluster: 1
Sentence: disruptive technologies are revolutionizing the way businesses operate
Cluster: 1
Sentence: cloud computing services offer flexibility and scalability for data storage
Cluster: 1
Sentence: user interface design is key to delivering a seamless software experience
Cluster: 1
Sentence: big data analytics are providing insights that drive better business decisions
Cluster: 1
Sentence: we need to prioritize cybersecurity measures to safeguard sensitive data
Cluster: 1
Sentence: the new aipowered algorithm is remarkably efficient in processing complex data sets
Cluster: 1
Sentence: have you considered adopting a devsecops framework for secure software development
Cluster: 1
Sentence: continuous integration and deployment streamline our software release cycles
Cluster: 1
Sentence: machine learning models help us automate key prediction tasks
Cluster: 1
Sentence: investing in research

In [51]:
# Cluster label assigned to the sentence at index 1 (the second sentence).
kmeans.labels_[1]

1

In [78]:
def display_condensed_tfidf(tfidf_matrix, vectorizer):
    """Print, for each document, only the terms stored in its matrix row.

    Args:
        tfidf_matrix: Matrix exposing ``.shape`` and ``.getrow(i)``, where each
            returned row exposes ``.indices`` and ``.data`` (for example a
            SciPy CSR matrix).
        vectorizer: Object exposing ``get_feature_names_out()``, whose result
            maps a column index to its term.

    Returns:
        None. Output is written to stdout as a ``Document i:`` header, one
        ``  * term: score`` line per stored element (score to 3 decimals),
        and a blank line after each document.
    """
    # The vocabulary does not change while printing, so look it up once
    # rather than rebuilding it for every stored element.
    feature_names = vectorizer.get_feature_names_out()

    n_docs = tfidf_matrix.shape[0]
    for i in range(n_docs):
        row = tfidf_matrix.getrow(i)
        # Pair each stored column index with its score, in storage order.
        doc_terms = {
            feature_names[term_index]: score
            for term_index, score in zip(row.indices, row.data)
        }

        print(f"Document {i}:")
        for term, score in doc_terms.items():
            print(f"  * {term}: {score:.3f}")
        print()

# Print the stored (non-zero) TF-IDF terms of every document.
display_condensed_tfidf(tfidf_matrix, vectorizer)

Document 0:
  * significantly: 0.307
  * experience: 0.307
  * user: 0.307
  * enhancing: 0.344
  * smoothly: 0.344
  * out: 0.344
  * rolled: 0.344
  * update: 0.344
  * software: 0.260
  * the: 0.243

Document 1:
  * operate: 0.374
  * businesses: 0.374
  * way: 0.374
  * revolutionizing: 0.374
  * are: 0.304
  * technologies: 0.374
  * disruptive: 0.374
  * the: 0.264

Document 2:
  * storage: 0.345
  * data: 0.244
  * for: 0.230
  * scalability: 0.345
  * and: 0.230
  * flexibility: 0.345
  * offer: 0.345
  * services: 0.345
  * computing: 0.345
  * cloud: 0.345

Document 3:
  * seamless: 0.363
  * delivering: 0.363
  * to: 0.197
  * key: 0.323
  * is: 0.217
  * design: 0.363
  * interface: 0.363
  * experience: 0.323
  * user: 0.323
  * software: 0.274

Document 4:
  * decisions: 0.319
  * business: 0.319
  * better: 0.319
  * drive: 0.319
  * that: 0.260
  * insights: 0.319
  * providing: 0.319
  * analytics: 0.319
  * big: 0.319
  * data: 0.225
  * are: 0.260

Document 5:
  * ap

In [77]:
def display_full_tfidf(tfidf_matrix, vectorizer):
    """Print the score of every vocabulary term for every document.

    Unlike a condensed view, zero entries are printed too, so the output has
    one line per (document, term) pair.

    Args:
        tfidf_matrix: Two-dimensional matrix supporting ``.shape`` and
            ``matrix[row, col]`` element access.
        vectorizer: Object exposing ``get_feature_names_out()``, indexable by
            column number to obtain the term.

    Returns:
        None. Output is written to stdout as a ``Document i:`` header, one
        ``  * term: score`` line per column (score to 3 decimals), and a
        blank line after each document.
    """
    doc_count, term_count = tfidf_matrix.shape
    vocabulary = vectorizer.get_feature_names_out()

    for row in range(doc_count):
        print(f"Document {row}:")
        for col in range(term_count):
            print(f"  * {vocabulary[col]}: {tfidf_matrix[row, col]:.3f}")
        print()

# Print every vocabulary term's score for every document, zeros included.
# NOTE(review): this emits n_docs * n_terms lines of output — consider limiting it.
display_full_tfidf(tfidf_matrix, vectorizer) 


Document 0:
  * achieve: 0.000
  * across: 0.000
  * action: 0.000
  * actionable: 0.000
  * adopting: 0.000
  * agriculture: 0.000
  * ahead: 0.000
  * aim: 0.000
  * aims: 0.000
  * aipowered: 0.000
  * air: 0.000
  * algorithm: 0.000
  * all: 0.000
  * amplifies: 0.000
  * an: 0.000
  * analytics: 0.000
  * and: 0.000
  * applications: 0.000
  * are: 0.000
  * at: 0.000
  * automate: 0.000
  * awareness: 0.000
  * better: 0.000
  * big: 0.000
  * biodiversity: 0.000
  * business: 0.000
  * businesses: 0.000
  * campaigns: 0.000
  * can: 0.000
  * carbon: 0.000
  * change: 0.000
  * choices: 0.000
  * clean: 0.000
  * climate: 0.000
  * cloud: 0.000
  * combating: 0.000
  * committed: 0.000
  * communities: 0.000
  * company: 0.000
  * complex: 0.000
  * composting: 0.000
  * comprehensive: 0.000
  * computing: 0.000
  * considered: 0.000
  * consistently: 0.000
  * consumption: 0.000
  * continuous: 0.000
  * contribute: 0.000
  * create: 0.000
  * crucial: 0.000
  * curve: 0.000
  

  * through: 0.000
  * to: 0.000
  * top: 0.000
  * transportation: 0.000
  * trends: 0.000
  * update: 0.000
  * upgrading: 0.000
  * us: 0.000
  * user: 0.000
  * utilize: 0.000
  * vital: 0.000
  * vulnerabilities: 0.000
  * waste: 0.000
  * water: 0.000
  * way: 0.000
  * we: 0.000
  * webbased: 0.000
  * with: 0.000
  * you: 0.000
  * zero: 0.000

Document 22:
  * achieve: 0.000
  * across: 0.000
  * action: 0.370
  * actionable: 0.000
  * adopting: 0.000
  * agriculture: 0.000
  * ahead: 0.000
  * aim: 0.370
  * aims: 0.000
  * aipowered: 0.000
  * air: 0.000
  * algorithm: 0.000
  * all: 0.000
  * amplifies: 0.000
  * an: 0.000
  * analytics: 0.000
  * and: 0.246
  * applications: 0.000
  * are: 0.000
  * at: 0.000
  * automate: 0.000
  * awareness: 0.370
  * better: 0.000
  * big: 0.000
  * biodiversity: 0.000
  * business: 0.000
  * businesses: 0.000
  * campaigns: 0.370
  * can: 0.000
  * carbon: 0.000
  * change: 0.000
  * choices: 0.000
  * clean: 0.000
  * climate: 0.000
 