## GloVe

In [None]:
# Uncomment to install gensim into the kernel's environment if it is missing:
# %pip install gensim

In [None]:
import numpy as np
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score, f1_score

# Load pretrained word embeddings through gensim's downloader API.
# `model` is a notebook-wide global: the helper functions in this notebook
# rely on `word in model`, `model[word]` and `model.vector_size`, so any
# replacement (your own Word2Vec/GloVe vectors) must support those operations.
model = api.load('glove-wiki-gigaword-100')  # or 'word2vec-google-news-300'

def get_sentence_vector(sentence):
    """Embed a sentence as the average of its in-vocabulary word vectors.

    The sentence is lower-cased and split on whitespace. Tokens that are not
    found in the global ``model`` are ignored; tokens are looked up exactly as
    split, so attached punctuation is part of the token.

    Args:
        sentence: Text to embed.

    Returns:
        The element-wise mean of the known tokens' vectors, or a zero vector
        of length ``model.vector_size`` when no token is known.
    """
    known_vectors = []
    for token in sentence.lower().split():
        if token in model:
            known_vectors.append(model[token])

    if known_vectors:
        return np.mean(known_vectors, axis=0)

    # Nothing in the sentence is in the vocabulary: fall back to zeros.
    return np.zeros(model.vector_size)

## Segmenting similar tasks

In [None]:
# Short task descriptions to group by embedding similarity.
tasks = [
    "clean data",
    "build model",
    "train algorithm",
    "remove outliers",
    "deploy model",
    "visualize data",
    "tune hyperparameters",
    "generate report",
]

# One averaged word-vector embedding per task, in the same order as `tasks`.
task_vectors = list(map(get_sentence_vector, tasks))

# Partition the embeddings into three clusters with a fixed random_state.
kmeans = KMeans(n_clusters=3, random_state=0)
labels = kmeans.fit_predict(task_vectors)

# Report each task next to the cluster it was assigned to.
for task, label in zip(tasks, labels):
    print(f"Cluster {label}: {task}")

Cluster 1: clean data
Cluster 0: build model
Cluster 0: train algorithm
Cluster 1: remove outliers
Cluster 0: deploy model
Cluster 1: visualize data
Cluster 2: tune hyperparameters
Cluster 1: generate report


## Checking summary accuracy

In [None]:
def check_summary_accuracy(original_text, summary_text):
    """Score how well the words of a summary are covered by the original text.

    Both texts are lower-cased and split on whitespace; tokens missing from
    the global ``model`` are dropped. Tokens are looked up exactly as split,
    so a word with attached punctuation (e.g. ``"dog."``) only counts if that
    exact token is in the vocabulary.

    For every remaining summary word, the highest cosine similarity to any
    remaining original word is taken; the score is the mean of those maxima.

    Args:
        original_text: The source text.
        summary_text: The candidate summary of ``original_text``.

    Returns:
        The mean best-match cosine similarity, or ``0.0`` when either text
        contains no in-vocabulary words (there is nothing to compare).
    """
    orig_words = [word for word in original_text.lower().split() if word in model]
    summary_words = [word for word in summary_text.lower().split() if word in model]

    # cosine_similarity rejects empty inputs, so an empty or fully
    # out-of-vocabulary text would raise instead of yielding a score.
    if not orig_words or not summary_words:
        return 0.0

    orig_vecs = np.array([model[word] for word in orig_words])
    summary_vecs = np.array([model[word] for word in summary_words])

    # Rows are summary words, columns are original words.
    similarities = cosine_similarity(summary_vecs, orig_vecs)
    coverage = np.mean(np.max(similarities, axis=1))  # max sim for each summary word
    return coverage

# Example pair: a source sentence and a shorter paraphrase of it.
original = (
    "The quick brown fox jumps over the lazy dog and runs into the forest."
)
summary = "A brown fox jumps over a lazy dog."

# Score how closely the summary's words match words in the original.
accuracy_score = check_summary_accuracy(
    original_text=original,
    summary_text=summary,
)
print("Summary accuracy score (cosine-based):", accuracy_score)

Summary accuracy score (cosine-based): 0.93599373


## Text Clustering

In [None]:
# Six short sentences to cluster by embedding similarity.
documents = [
    "Dogs are wonderful pets",
    "Cats are independent animals",
    "Dogs love to play fetch",
    "Cats love to nap",
    "Football is a great sport",
    "Soccer is popular worldwide",
]

# One averaged word-vector embedding per document, in document order.
doc_vectors = list(map(get_sentence_vector, documents))

# Split the embeddings into two clusters with a fixed random_state.
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(doc_vectors)

# Show every document with its assigned cluster id.
for document, label in zip(documents, clusters):
    print(f"Cluster {label}: {document}")


Cluster 0: Dogs are wonderful pets
Cluster 0: Cats are independent animals
Cluster 0: Dogs love to play fetch
Cluster 0: Cats love to nap
Cluster 1: Football is a great sport
Cluster 1: Soccer is popular worldwide


## Semantic Search

In [None]:
# Candidate documents and the free-text query to rank them against.
corpus = [
    "How to train a neural network",
    "Ways to clean data for machine learning",
    "Data visualization techniques",
    "Best practices for model deployment",
]
query = "visualizing data"

# Embed the query and every document with the same sentence encoder.
query_vec = get_sentence_vector(query)
corpus_vecs = list(map(get_sentence_vector, corpus))

# A single query row is passed in, so only the first row of the result is kept.
sims = cosine_similarity([query_vec], corpus_vecs)[0]

# Pair each document with its score and rank from most to least similar.
results = sorted(
    zip(corpus, sims),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Semantic Search Results:")
for doc, score in results:
    print(f"{score:.3f}: {doc}")

Semantic Search Results:
0.829: Data visualization techniques
0.634: Ways to clean data for machine learning
0.515: How to train a neural network
0.438: Best practices for model deployment
