In [1]:
import re
from collections import Counter

import nltk
import numpy as np
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from tabulate import tabulate

# Fetch the NLTK stop-word corpus that TextPreprocessor reads through
# stopwords.words('english'). The call is the last expression of the cell, so
# its return value is what the cell displays.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Preprocessing transformer: lowercasing, stop-word removal and stemming
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Lowercase documents, drop English stop words and Porter-stem the rest.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    maps every input string to a space-joined string of stems.
    """

    # Word-like tokens with an optional apostrophe suffix (e.g. "don't").
    # Tokenizing with a pattern instead of ``str.split`` means punctuation
    # glued to a word ("the," / "weekends.") is no longer kept as part of the
    # token, so the stop-word test and the stemmer see the bare word.
    _TOKEN_PATTERN = re.compile(r"\w+(?:'\w+)?")

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for the transformer API."""
        return self

    def transform(self, X):
        """Return the cleaned version of every document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw documents.

        Returns
        -------
        numpy.ndarray
            One string per input document, in input order: the stems of its
            non-stop-word tokens joined by single spaces.
        """
        preprocessed_docs = []
        for doc in X:
            # Lowercase first: membership in ``self.stop_words`` is exact-match.
            tokens = self._TOKEN_PATTERN.findall(doc.lower())
            stems = [self.stemmer.stem(tok) for tok in tokens if tok not in self.stop_words]
            preprocessed_docs.append(" ".join(stems))
        # dtype=str keeps a string array even when X is empty.
        return np.array(preprocessed_docs, dtype=str)


In [5]:
# Define the dataset
dataset = ["I love playing football on the weekends",
           "I enjoy hiking and camping in the mountains",
           "I like to read books and watch movies",
           "I prefer playing video games over sports",
           "I love listening to music and going to concerts"]

# Preprocess the dataset
preprocessor = TextPreprocessor()
preprocessed_data = preprocessor.fit_transform(dataset)

# Vectorize the preprocessed documents with TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_data)

# Cluster with KMeans. random_state makes the assignment repeatable between
# runs; n_init is set explicitly because its default differs between
# scikit-learn releases.
k = 2  # Define number of clusters
km = KMeans(n_clusters=k, n_init=10, random_state=42)

# Fit the model and get the cluster index of each document in one step
y_pred = km.fit_predict(X)

# Display the documents and their predicted clusters
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Share of documents that fall in the largest cluster.
# NOTE(review): this value was previously printed as "Purity", but purity is
# (1/N) * sum over clusters of the count of the most common TRUE class inside
# each cluster, which needs ground-truth labels. This dataset has none, so the
# number is reported under the name of what it actually measures.
total_samples = len(y_pred)
cluster_sizes = Counter(y_pred)
largest_cluster_share = max(cluster_sizes.values()) / total_samples
print("Largest cluster share:", largest_cluster_share)


Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            0
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              1
I prefer playing video games over sports                           0
I love listening to music and going to concerts                    0
Purity: 0.8


In [7]:
# Exercise 2: cluster the same documents using Word2Vec document embeddings

In [9]:
# Preprocessing transformer: lowercasing, stop-word removal and stemming
# NOTE(review): TextPreprocessor is defined more than once in this notebook;
# a later definition silently replaces an earlier one, so keep them identical
# (or keep a single definition).
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Lowercase documents, drop English stop words and Porter-stem the rest.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    maps every input string to a space-joined string of stems.
    """

    # Word-like tokens with an optional apostrophe suffix (e.g. "don't").
    # Tokenizing with a pattern instead of ``str.split`` means punctuation
    # glued to a word ("the," / "weekends.") is no longer kept as part of the
    # token, so the stop-word test and the stemmer see the bare word.
    _TOKEN_PATTERN = re.compile(r"\w+(?:'\w+)?")

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for the transformer API."""
        return self

    def transform(self, X):
        """Return the cleaned version of every document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw documents.

        Returns
        -------
        numpy.ndarray
            One string per input document, in input order: the stems of its
            non-stop-word tokens joined by single spaces.
        """
        preprocessed_docs = []
        for doc in X:
            # Lowercase first: membership in ``self.stop_words`` is exact-match.
            tokens = self._TOKEN_PATTERN.findall(doc.lower())
            stems = [self.stemmer.stem(tok) for tok in tokens if tok not in self.stop_words]
            preprocessed_docs.append(" ".join(stems))
        # dtype=str keeps a string array even when X is empty.
        return np.array(preprocessed_docs, dtype=str)


In [1]:
# Preprocessing transformer: lowercasing, stop-word removal and stemming
# NOTE(review): TextPreprocessor is defined more than once in this notebook;
# a later definition silently replaces an earlier one, so keep them identical
# (or keep a single definition). This cell needs the notebook's import cell to
# have been run first (BaseEstimator, TransformerMixin, stopwords,
# PorterStemmer, re and np come from there).
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Lowercase documents, drop English stop words and Porter-stem the rest.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    maps every input string to a space-joined string of stems.
    """

    # Word-like tokens with an optional apostrophe suffix (e.g. "don't").
    # Tokenizing with a pattern instead of ``str.split`` means punctuation
    # glued to a word ("the," / "weekends.") is no longer kept as part of the
    # token, so the stop-word test and the stemmer see the bare word.
    _TOKEN_PATTERN = re.compile(r"\w+(?:'\w+)?")

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for the transformer API."""
        return self

    def transform(self, X):
        """Return the cleaned version of every document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw documents.

        Returns
        -------
        numpy.ndarray
            One string per input document, in input order: the stems of its
            non-stop-word tokens joined by single spaces.
        """
        preprocessed_docs = []
        for doc in X:
            # Lowercase first: membership in ``self.stop_words`` is exact-match.
            tokens = self._TOKEN_PATTERN.findall(doc.lower())
            stems = [self.stemmer.stem(tok) for tok in tokens if tok not in self.stop_words]
            preprocessed_docs.append(" ".join(stems))
        # dtype=str keeps a string array even when X is empty.
        return np.array(preprocessed_docs, dtype=str)


NameError: name 'BaseEstimator' is not defined

In [3]:
# Define the dataset
dataset = ["I love playing football on the weekends",
           "I enjoy hiking and camping in the mountains",
           "I like to read books and watch movies",
           "I prefer playing video games over sports",
           "I love listening to music and going to concerts"]

# Preprocess the dataset
preprocessor = TextPreprocessor()
preprocessed_data = preprocessor.fit_transform(dataset)

# Tokenize and train the Word2Vec model.
# seed + workers=1 make training repeatable within one interpreter session;
# per the gensim documentation, repeatability across interpreter launches
# additionally requires fixing PYTHONHASHSEED.
tokenized_dataset = [doc.split() for doc in preprocessed_data]
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100, window=5,
                          min_count=1, workers=1, seed=42)

# Create document embeddings: the mean of the word vectors of each document.
word_vectors = word2vec_model.wv
doc_embeddings = []
for tokens in tokenized_dataset:
    known_vectors = [word_vectors[token] for token in tokens if token in word_vectors]
    if known_vectors:
        doc_embeddings.append(np.mean(known_vectors, axis=0))
    else:
        # A document with no in-vocabulary token (e.g. only stop words) would
        # make np.mean return NaN and give X a ragged shape; use a zero vector.
        doc_embeddings.append(np.zeros(word_vectors.vector_size, dtype=np.float32))
X = np.array(doc_embeddings)

# Cluster with KMeans. random_state makes the assignment repeatable between
# runs; n_init is set explicitly because its default differs between
# scikit-learn releases.
k = 2  # Define number of clusters
km = KMeans(n_clusters=k, n_init=10, random_state=42)

# Fit the model and get the cluster index of each document in one step
y_pred = km.fit_predict(X)

# Display the documents and their predicted clusters
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Share of documents that fall in the largest cluster.
# NOTE(review): this value was previously printed as "Purity", but purity is
# (1/N) * sum over clusters of the count of the most common TRUE class inside
# each cluster, which needs ground-truth labels. This dataset has none, so the
# number is reported under the name of what it actually measures.
total_samples = len(y_pred)
cluster_sizes = Counter(y_pred)
largest_cluster_share = max(cluster_sizes.values()) / total_samples
print("Largest cluster share:", largest_cluster_share)


NameError: name 'TextPreprocessor' is not defined