<a href="https://colab.research.google.com/github/Himtut1998/Travel-review-ratings/blob/main/Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK resources used below. Recent NLTK releases load the
# 'punkt_tab' tables (not the legacy 'punkt' pickles) inside word_tokenize,
# so both are requested to keep the notebook runnable on a fresh kernel
# regardless of the installed NLTK version.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

# Read the document collection from the Excel workbook
input_file = 'combined_documents.xlsx'
combined_df = pd.read_excel(io=input_file)

# Shared preprocessing resources: a WordNet lemmatizer and the English stop-word set
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalize one raw document into a space-joined string of lemmas.

    The text is tokenized with ``nltk.word_tokenize``. Tokens that are not
    purely alphabetic, or whose lowercase form is in ``stop_words``, are
    dropped; the remaining tokens are lowercased and lemmatized.

    Parameters
    ----------
    text : str or None
        Raw document text. Missing values (``None``/NaN, e.g. an empty
        spreadsheet cell) are treated as an empty document; any other
        non-string value is converted with ``str()`` before tokenizing.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    if not isinstance(text, str):
        # word_tokenize only accepts strings; a NaN cell would otherwise
        # abort the whole DataFrame.apply() call with a TypeError.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    lemmas = []
    for token in nltk.word_tokenize(text):
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(lowered))
    return ' '.join(lemmas)

# Build the cleaned-text column by running every raw document through preprocess_text
combined_df['Processed_Documents'] = combined_df['Document'].map(preprocess_text)

# TF-IDF features over 1- to 3-grams: keep at most 1000 terms, ignoring terms
# that occur in fewer than 5 documents or in more than 70% of the documents
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=5,
    max_df=0.7,
    max_features=1000,
)
X = vectorizer.fit_transform(combined_df['Processed_Documents'])

# Optional dimensionality reduction before clustering (left disabled):
# pca = PCA(n_components=0.95)  # Adjust based on your dataset
# X_reduced = pca.fit_transform(X.toarray())

# Fit a 3-cluster K-means model on the TF-IDF matrix; fit() returns the
# fitted estimator, so construction and fitting are chained
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    n_init=10,
    max_iter=300,
    random_state=42,
).fit(X)

# Store each document's cluster label next to the original rows
combined_df['Cluster'] = kmeans.labels_

def assign_to_cluster(new_document):
    """Return the K-means cluster label predicted for a single raw document.

    The text goes through the same pipeline as the training documents:
    ``preprocess_text``, then the fitted ``vectorizer``, then
    ``kmeans.predict``.
    """
    cleaned = preprocess_text(new_document)
    features = vectorizer.transform([cleaned])
    labels = kmeans.predict(features)
    return labels[0]

# Mean silhouette coefficient of the K-means labelling over the TF-IDF features
silhouette_avg = silhouette_score(X, labels=kmeans.labels_)
print(f"Silhouette Score: {silhouette_avg}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Silhouette Score: 0.12619287568887538


In [2]:
# Demonstrate assign_to_cluster on a previously unseen document
new_document = (
    "The local football team clinched the championship after a thrilling "
    "penalty shootout, marking their third national title in five years. "
    "Fans celebrated their victory across the city, highlighting the team's "
    "remarkable journey from underdogs to champions."
)
predicted_cluster = assign_to_cluster(new_document)
print(f"Predicted cluster for the new document: {predicted_cluster}")


Predicted cluster for the new document: 1
