In [1]:
# Import necessary libraries
import numpy as np  # NumPy for numerical operations
import pandas as pd  # Pandas for data manipulation and analysis
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF Vectorizer for text feature extraction
from sklearn.model_selection import train_test_split  # Splitting the data into training and testing sets
from sklearn.naive_bayes import MultinomialNB  # Multinomial Naive Bayes classifier
from sklearn.metrics import classification_report  # Reporting classification metrics
from nltk.corpus import reuters  # NLTK Reuters corpus for text data
from nltk.corpus import stopwords  # NLTK stopwords for filtering common words
from nltk.tokenize import word_tokenize  # NLTK word tokenizer for text tokenization
from nltk.stem import PorterStemmer  # NLTK Porter Stemmer for word stemming


In [2]:
# Download NLTK resources (if not already downloaded)
import nltk
nltk.download("reuters")  # Download the Reuters corpus from NLTK
nltk.download("stopwords")  # Download NLTK stopwords
nltk.download("punkt")  # Download NLTK Punkt tokenizer for word tokenization

# Load Reuters-21578 dataset
documents = reuters.fileids()  # Get the file IDs of documents in the Reuters corpus
categories = reuters.categories()  # Get the list of categories in the Reuters corpus

# Define a function to preprocess and tokenize the documents
def preprocess_and_tokenize(document):
    stop_words = set(stopwords.words("english"))  # Get English stopwords from NLTK
    stemmer = PorterStemmer()  # Create a Porter Stemmer instance from NLTK
    words = word_tokenize(document)  # Tokenize the document using NLTK word tokenizer
    words = [stemmer.stem(word.lower()) for word in words if word.isalnum()]  # Stem and convert to lowercase
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return " ".join(words)  # Join the processed words into a string

# Preprocess and tokenize the documents
data = []
target = []
for doc_id in documents:
    category = reuters.categories(doc_id)[0]  # Get the category of the document
    document = reuters.raw(doc_id)  # Get the raw text of the document
    processed_doc = preprocess_and_tokenize(document)  # Preprocess and tokenize the document
    data.append(processed_doc)
    target.append(category)

# Convert data and target into a DataFrame
df = pd.DataFrame({'data': data, 'target': target})

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['data'], df['target'], test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()  # Create a TF-IDF vectorizer instance
X_train_tfidf = vectorizer.fit_transform(X_train)  # Fit and transform the training data
X_test_tfidf = vectorizer.transform(X_test)  # Transform the test data using the fitted vectorizer

# Train a Multinomial Naive Bayes classifier
classifier = MultinomialNB()  # Create a Multinomial Naive Bayes classifier instance
classifier.fit(X_train_tfidf, y_train)  # Train the classifier using the TF-IDF transformed training data

# Predict categories on the test set
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the classifier
report = classification_report(y_test, y_pred)  # Generate a classification report
print(report)  # Print the classification report to the console


[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


                 precision    recall  f1-score   support

            acq       0.54      0.96      0.69       469
           alum       0.00      0.00      0.00         7
         barley       0.00      0.00      0.00         6
            bop       0.00      0.00      0.00        20
        carcass       0.00      0.00      0.00        15
     castor-oil       0.00      0.00      0.00         2
          cocoa       0.00      0.00      0.00        17
         coffee       0.00      0.00      0.00        25
         copper       0.00      0.00      0.00         9
           corn       1.00      0.21      0.34        48
         cotton       0.00      0.00      0.00         8
            cpi       0.00      0.00      0.00        19
            cpu       0.00      0.00      0.00         1
          crude       0.85      0.67      0.75        96
            dlr       1.00      0.18      0.31        33
           earn       0.72      0.98      0.83       780
           fuel       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
# Import necessary library
import numpy as np  # NumPy for numerical operations

# Define the proportions for splitting the data
train_ratio = 0.7  # Set the proportion for the training set
test_ratio = 0.15  # Set the proportion for the test set

# Shuffle the data
df_shuffled = df.sample(frac=1, random_state=42)  # Shuffle the DataFrame randomly with a fixed seed

# Split the data into three categories: training set, test set, and unused set
train_size = int(train_ratio * len(df_shuffled))  # Calculate the size of the training set
test_size = int(test_ratio * len(df_shuffled))  # Calculate the size of the test set
unused_size = len(df_shuffled) - train_size - test_size  # Calculate the size of the unused set

# Assign IDs and group names
df_shuffled['id'] = range(len(df_shuffled))  # Assign unique IDs to each row
df_shuffled['group'] = np.where(df_shuffled.index < train_size, 'training_set', np.where(df_shuffled.index < train_size + test_size, 'test_set', 'unused_set'))  # Assign group names based on index

# Separate the data into the three categories
training_set = df_shuffled[df_shuffled['group'] == 'training_set']  # Extract rows belonging to the training set
test_set = df_shuffled[df_shuffled['group'] == 'test_set']  # Extract rows belonging to the test set
unused_set = df_shuffled[df_shuffled['group'] == 'unused_set']  # Extract rows belonging to the unused set

# Now, you have three separate DataFrames: training_set, test_set, and unused_set
# Each DataFrame contains data, target, id, and group columns
print("Training Set:")
print(training_set)

print("\nTest Set:")
print(test_set)

print("\nUnused Set:")
print(unused_set)


Training Set:
                                                   data    target     id  \
4593  bayer world group 1986 profit billion mark bil...      earn      0   
3614  comput microfilm corp lt comi year net shr 23 ...      earn      2   
510   spain foreign reserv rise februari spain forei...  reserves      6   
7331  gerber lt grb buy gerber system lt gsti share ...       acq      7   
5458  santa anita realti lt sar quarterli dividend q...      earn      8   
...                                                 ...       ...    ...   
5734  dst system inc lt dst regular payout set qtli ...      earn  10783   
5191  ocean inc lt obci year net shr eight ct vs sev...      earn  10784   
5390  drexel offici ha stake epsilon data lt epsi se...       acq  10785   
860   pennsylvania real estat invest trust lt pei op...      earn  10786   
7270  circon corp lt ccon 4th qtr shr loss two ct vs...      earn  10787   

             group  
4593  training_set  
3614  training_set  
510   trai

In [4]:
pip install gensim




In [5]:
import gensim
from gensim import corpora
import nltk
nltk.download("reuters")

# Your data preprocessing code here


[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [8]:
# Tokenize your data (assuming 'data' is your preprocessed text data)
tokenized_data = [document.split() for document in df['data']]
# Split each preprocessed document into a list of tokens (words) and store them in 'tokenized_data'

# Create a dictionary and a corpus
dictionary = corpora.Dictionary(tokenized_data)
# Create a Gensim Dictionary, which maps each unique token to a unique integer ID
corpus = [dictionary.doc2bow(text) for text in tokenized_data]
# Create a Gensim Corpus, which represents each document as a bag-of-words using the token IDs from the dictionary


In [9]:
# Define a function to run LDA with different numbers of topics
def run_lda(num_topics):
    lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
    return lda_model

# Try different numbers of topics
num_topics_list = [5, 10, 15, 20]

for num_topics in num_topics_list:
    print(f"Number of Topics: {num_topics}")  # Print the number of topics being evaluated
    lda_model = run_lda(num_topics)  # Run LDA with the specified number of topics
    topics = lda_model.print_topics(num_words=10)  # Get the top 10 words for each topic
    for topic in topics:
        print(topic)  # Print the words associated with each topic
    print("\n")  # Print a newline for better readability


Number of Topics: 5
(0, '0.030*"tonn" + 0.025*"said" + 0.019*"mln" + 0.017*"export" + 0.012*"year" + 0.010*"wheat" + 0.010*"sugar" + 0.008*"agricultur" + 0.008*"product" + 0.008*"grain"')
(1, '0.104*"vs" + 0.077*"mln" + 0.062*"ct" + 0.048*"net" + 0.041*"loss" + 0.037*"shr" + 0.036*"dlr" + 0.024*"lt" + 0.020*"profit" + 0.019*"qtr"')
(2, '0.035*"pct" + 0.026*"said" + 0.026*"billion" + 0.024*"mln" + 0.022*"bank" + 0.020*"dlr" + 0.018*"year" + 0.013*"1986" + 0.011*"rate" + 0.008*"wa"')
(3, '0.033*"said" + 0.011*"would" + 0.011*"trade" + 0.010*"wa" + 0.009*"oil" + 0.008*"market" + 0.007*"price" + 0.007*"ha" + 0.006*"japan" + 0.005*"thi"')
(4, '0.042*"said" + 0.027*"lt" + 0.022*"compani" + 0.022*"share" + 0.018*"dlr" + 0.012*"inc" + 0.011*"corp" + 0.010*"ha" + 0.010*"pct" + 0.010*"mln"')


Number of Topics: 10
(0, '0.045*"said" + 0.030*"share" + 0.026*"lt" + 0.021*"compani" + 0.016*"stock" + 0.016*"offer" + 0.015*"inc" + 0.013*"pct" + 0.013*"ha" + 0.012*"dlr"')
(1, '0.066*"dlr" + 0.044*"bill

In [10]:
# Import necessary libraries
import gensim  # Gensim for topic modeling
from gensim import corpora  # Gensim's corpora module for creating a dictionary and a corpus
import nltk  # Natural Language Toolkit for text processing
import pandas as pd  # Pandas for data manipulation and analysis
import numpy as np  # NumPy for numerical operations

# Your data preprocessing code here (assumed to be present but not provided)

# Tokenize your data (assuming 'data' is your preprocessed text data)
tokenized_data = [document.split() for document in df['data']]
# Split each preprocessed document into a list of tokens (words) and store them in 'tokenized_data'

# Create a dictionary and a corpus
dictionary = corpora.Dictionary(tokenized_data)
# Create a Gensim Dictionary, which maps each unique token to a unique integer ID
corpus = [dictionary.doc2bow(text) for text in tokenized_data]
# Create a Gensim Corpus, which represents each document as a bag-of-words using the token IDs from the dictionary

# Define a function to run LDA with different numbers of topics
def run_lda(num_topics):
    lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
    # Run LDA with the specified number of topics, using the dictionary and corpus created earlier
    return lda_model

# Try different numbers of topics
num_topics_list = [5]

for num_topics in num_topics_list:
    print(f"Number of Topics: {num_topics}")  # Print the number of topics being evaluated
    lda_model = run_lda(num_topics)  # Run LDA with the specified number of topics

    # Assign topics to documents
    topics_for_documents = [lda_model[document] for document in corpus]
    # Get the topics and their probabilities for each document in the corpus

    # Categorize documents into sets (training, test, unused)
    training_set_topics = [topics_for_documents[i] for i in training_set['id']]
    # Extract topics for documents in the training set
    test_set_topics = [topics_for_documents[i] for i in test_set['id']]
    # Extract topics for documents in the test set
    unused_set_topics = [topics_for_documents[i] for i in unused_set['id']]
    # Extract topics for documents in the unused set

    # Print the categorization for the training set
    print("Training Set Topics:")
    for i, topic_probabilities in enumerate(training_set_topics):
        max_topic = max(topic_probabilities, key=lambda x: x[1])
        print(f"Document {i} - Topic {max_topic[0]} with Probability {max_topic[1]}")
        # Print the document ID, assigned topic, and the probability of that topic

    # Print the categorization for the test set
    print("Test Set Topics:")
    for i, topic_probabilities in enumerate(test_set_topics):
        max_topic = max(topic_probabilities, key=lambda x: x[1])
        print(f"Document {i} - Topic {max_topic[0]} with Probability {max_topic[1]}")
        # Print the document ID, assigned topic, and the probability of that topic

    # Print the categorization for the unused set
    print("Unused Set Topics:")
    for i, topic_probabilities in enumerate(unused_set_topics):
        max_topic = max(topic_probabilities, key=lambda x: x[1])
        print(f"Document {i} - Topic {max_topic[0]} with Probability {max_topic[1]}")
        # Print the document ID, assigned topic, and the probability of that topic

    print("\n")  # Print a newline for better readability


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Document 5792 - Topic 2 with Probability 0.9830993413925171
Document 5793 - Topic 0 with Probability 0.6044766306877136
Document 5794 - Topic 0 with Probability 0.9625720977783203
Document 5795 - Topic 0 with Probability 0.9837740063667297
Document 5796 - Topic 0 with Probability 0.984001100063324
Document 5797 - Topic 4 with Probability 0.9857198596000671
Document 5798 - Topic 4 with Probability 0.9850953817367554
Document 5799 - Topic 4 with Probability 0.5043938755989075
Document 5800 - Topic 4 with Probability 0.9792929887771606
Document 5801 - Topic 2 with Probability 0.997636079788208
Document 5802 - Topic 3 with Probability 0.7145045399665833
Document 5803 - Topic 4 with Probability 0.34684258699417114
Document 5804 - Topic 1 with Probability 0.9708993434906006
Document 5805 - Topic 1 with Probability 0.989136815071106
Document 5806 - Topic 4 with Probability 0.5665979385375977
Document 5807 - Topic 3 with Probabil

optimality number of categories  using topic modelling with categories and true cateriogized

In [15]:
# Import necessary library
from gensim.models import CoherenceModel

# Calculate coherence scores for different numbers of topics
coherence_scores = []  # Initialize an empty list to store coherence scores
for num_topics in num_topics_list:
    lda_model = run_lda(num_topics)  # Run LDA with the specified number of topics
    coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_data, dictionary=dictionary, coherence='c_v')
    # Create a CoherenceModel instance for the LDA model using 'c_v' coherence
    coherence_score = coherence_model_lda.get_coherence()  # Calculate the coherence score
    coherence_scores.append(coherence_score)  # Append the coherence score to the list
    print(f"Number of Topics: {num_topics}, Coherence Score: {coherence_score}")
    # Print the number of topics and its corresponding coherence score

# Find the optimal number of topics based on coherence score
optimal_num_topics = num_topics_list[np.argmax(coherence_scores)]  # Get the number of topics with the highest coherence score
print(f"Optimal Number of Topics: {optimal_num_topics}")
# Print the optimal number of topics based on the highest coherence score


Number of Topics: 5, Coherence Score: 0.5024227698091698
Optimal Number of Topics: 5


In [14]:
# Import necessary library
from gensim.models import CoherenceModel  # Import the CoherenceModel class from Gensim

import numpy as np  # Import NumPy for numerical operations


# Your code for calculating coherence scores

# Define the top-level categories
top_level_categories = ["earn", "acq", "money-fx", "grain", "crude", "trade", "interest", "ship"]

# Print the optimal number of topics
print(f"Optimal Number of Topics: {optimal_num_topics}")

# Print the top-level categories
print("Top-Level Categories:")
for i, category in enumerate(top_level_categories):
    print(f"{i+1}. {category}")


Optimal Number of Topics: 5
Top-Level Categories:
1. earn
2. acq
3. money-fx
4. grain
5. crude
6. trade
7. interest
8. ship


In [13]:
# Import necessary libraries
import gensim  # Gensim for topic modeling
from gensim import corpora  # Gensim's corpora module for creating a dictionary and a corpus
import pandas as pd  # Pandas for data manipulation and analysis
from collections import defaultdict  # Import defaultdict from the collections module

# Your data preprocessing code here (assumed to be present but not provided)

# Tokenize your data (assuming 'data' is your preprocessed text data)
tokenized_data = [document.split() for document in df['data']]
# Split each preprocessed document into a list of tokens (words) and store them in 'tokenized_data'

# Create a dictionary and a corpus
dictionary = corpora.Dictionary(tokenized_data)
# Create a Gensim Dictionary, which maps each unique token to a unique integer ID
corpus = [dictionary.doc2bow(text) for text in tokenized_data]
# Create a Gensim Corpus, which represents each document as a bag-of-words using the token IDs from the dictionary

# Define a function to run LDA with a specific number of topics
def run_lda(num_topics):
    lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
    # Run LDA with the specified number of topics, using the dictionary and corpus created earlier
    return lda_model

# Number of topics
num_topics = 5  # Specify the number of topics for LDA

# Run LDA with 5 topics
lda_model = run_lda(num_topics)  # Run LDA with the specified number of topics

# Initialize a list to store categories
categories_list = []  # Create an empty list to store categories

# Calculate the word count for each category across the entire corpus
for category in categories:  # Iterate through each category (replace 'categories' with your actual category data)
    category_docs = df[df['target'] == category]  # Filter documents belonging to the current category

    if len(category_docs) > 0:  # Check if there are documents in the category
        categories_list.append(category)  # Append the category to the list if it has documents

# Print the categories
print("Categories:")
for category in categories_list:
    print(f"{category}")  # Print each category in the list


Categories:
acq
alum
barley
bop
carcass
castor-oil
cocoa
coconut
coconut-oil
coffee
copper
copra-cake
corn
cotton
cotton-oil
cpi
cpu
crude
dfl
dlr
dmk
earn
fuel
gas
gnp
gold
grain
groundnut
groundnut-oil
heat
hog
housing
income
instal-debt
interest
ipi
iron-steel
jet
jobs
l-cattle
lead
lei
livestock
lumber
meal-feed
money-fx
money-supply
naphtha
nat-gas
nickel
nzdlr
oilseed
orange
palladium
palm-oil
pet-chem
platinum
potato
propane
rand
rape-oil
reserves
retail
rice
rubber
ship
silver
soy-oil
soybean
strategic-metal
sugar
sun-oil
tea
tin
trade
veg-oil
wpi
yen
zinc
