In [33]:
import pandas as pd
import re

from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
#import the dataset
df=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/bbc-news-data.csv", sep='\t')
df=df.drop('filename', axis=1)

In [35]:
df["data_to_clean"] = df["title"] + " " + df["content"] 
def preprocess(text):
  #Remove non-word characters and lowercase
  text = re.sub(r"\W+", " ", str(text)).lower()
  #Remove double whitespaces
  text = re.sub(r"\s+", " ", text).strip()
  #Tokenize the text
  tokens = word_tokenize(text)
  #Remove stop words
  tokens = [token for token in tokens if token not in stopwords.words("english")]
  #Stem the tokens using Porter stemmer
  stemmer = PorterStemmer()
  stemmed_tokens = [stemmer.stem(token) for token in tokens]
  #Join the stemmed tokens back into a string
  preprocessed_text = " ".join(stemmed_tokens)
  return preprocessed_text

In [36]:
test_examples = {}

#Select 10 rows from each category for the test set
for category in df['category'].unique():
    test_examples[category] = df[df['category'] == category].sample(n=10)
    
#Concatenate the test examples into a single DataFrame
test_df = pd.concat(test_examples.values())

#Remove the test examples from the original DataFrame to create the training set
train_df = df.drop(test_df.index)

In [37]:
#apply the preprocessing function to train_df and store it in a new column 
train_df['clean_text'] = train_df['data_to_clean'].apply(preprocess)

In [38]:
#use tfidf for feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(train_df['clean_text'])

In [43]:
#train the model on the features from tfidf
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, random_state=123).fit(features)



In [47]:
cluster_to_category = {
    0: 'sport',
    1: 'politics',
    2: 'entertainment',
    3: 'business',
    4: 'tech'
}
#'politics',
#'sport'
#'tech'
#'entertainment'
#'business'

In [72]:
user_input=input("Enter your document:" )
clean_input= preprocess(user_input)
clean_input= [clean_input] 
input_vector = vectorizer.transform(clean_input)

prediction = kmeans.predict(input_vector)

predicted_category = [cluster_to_category[label] for label in prediction]

print(predicted_category)

Enter your document:the film avatar just launched on cinemas
['entertainment']


In [55]:
# Preprocess new document
for_testing="donald trump accused the democrats of being too soft"

# Preprocess the test set
test_df['clean_text'] = test_df['data_to_clean'].apply(preprocess)

# Convert the preprocessed test set to feature vectors
test_vectors = vectorizer.transform(test_df['clean_text'])

# Predict the clusters for the test set
test_df['predicted_cluster_kmeans'] =  kmeans.predict(test_vectors)
test_df['predicted_cluster_kmeans'] = test_df['predicted_cluster_kmeans'].apply(lambda x: cluster_to_category[x])

# Print the test set with the predicted clusters
print(test_df[['category', 'predicted_cluster_kmeans']])

           category predicted_cluster_kmeans
2          business                 business
53         business                 business
344        business                 business
258        business                 business
154        business                 business
277        business                 business
385        business                 business
191        business                 business
369        business                 business
99         business                 business
826   entertainment            entertainment
640   entertainment            entertainment
774   entertainment            entertainment
832   entertainment            entertainment
854   entertainment                 business
836   entertainment                 business
760   entertainment            entertainment
568   entertainment            entertainment
893   entertainment            entertainment
803   entertainment            entertainment
903        politics                 politics
1056      

In [68]:
from sklearn.metrics import adjusted_rand_score, f1_score, rand_score

# Extract the true labels and predicted labels
true_labels = test_df['category']
predicted_labels = test_df['predicted_cluster_kmeans']

# Calculate the Rand Index
rand_score2 = rand_score(true_labels, predicted_labels)
print('Rand Index:', rand_score2)

# Calculate the F1 score
f1 = f1_score(true_labels, predicted_labels, average='weighted')
print('F1 score:', f1)

Rand Index: 0.8497959183673469
F1 score: 0.8031746031746033


In [46]:
# Get most frequent words in each cluster
top_words = {}
feature_names = vectorizer.get_feature_names_out()

for i, cluster in enumerate(kmeans.cluster_centers_):
    top_words[i] = [feature_names[j] for j in cluster.argsort()[-10:]][::-1]
print(top_words)

{0: ['game', 'play', 'win', 'england', 'player', 'match', 'said', 'team', 'cup', 'final'], 1: ['mr', 'elect', 'labour', 'parti', 'blair', 'tori', 'said', 'brown', 'tax', 'howard'], 2: ['film', 'award', 'best', 'star', 'music', 'oscar', 'band', 'nomin', 'actor', 'year'], 3: ['said', 'us', 'mr', 'year', 'govern', 'compani', 'bank', 'firm', 'economi', 'growth'], 4: ['mobil', 'use', 'phone', 'peopl', 'game', 'technolog', 'said', 'user', 'comput', 'servic']}
