In [None]:
# Mount Google Drive so that files stored there (such as the CSV read in the
# next cell) are reachable through the local filesystem.
from google.colab import drive  # `drive` module from the `google.colab` package
drive.mount('/content/drive')  # Drive contents appear under /content/drive

Mounted at /content/drive


In [None]:
import pandas as pd

# Location of the Reuters CSV inside the mounted Drive.
file_path = '/content/drive/MyDrive/reuters.csv'

# Parse the whole file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the first five rows as a quick sanity check of the parsed columns.
print(df.head(n=5))

                                                text  \
0  ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...   
1  CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...   
2  JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...   
3  THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n  ...   
4  INDONESIA SEES CPO PRICE RISING SHARPLY\n  Ind...   

                                     categories  
0                                         trade  
1                                         grain  
2                                crude, nat-gas  
3  corn, grain, rice, rubber, sugar, tin, trade  
4                             palm-oil, veg-oil  


In [None]:
import nltk
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Download the NLTK packages: the Reuters corpus plus the tokenizer,
# WordNet and stop-word data.
# NOTE(review): nothing in the visible code appears to use 'punkt' — confirm
# before dropping it.
for package in ('reuters', 'punkt', 'wordnet', 'stopwords'):
    nltk.download(package)

# Every file ID in the corpus.
docs = reuters.fileids()

# Keep the partition that the corpus already encodes in the file-ID prefix
# ("training/..." vs "test/...").
train_docs = [fileid for fileid in docs if fileid.startswith("training/")]
test_docs = [fileid for fileid in docs if fileid.startswith("test/")]

# Shared resources read by preprocess(): English stop words (as a set for
# fast membership tests) and a WordNet lemmatizer.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(doc_id):
    """Turn one Reuters document into a single normalised string.

    Tokens that are not purely alphabetic are discarded; the rest are
    lower-cased, filtered against the module-level ``stop_words`` set and
    lemmatized with the module-level ``lemmatizer``.

    Args:
        doc_id: File ID of the document, as accepted by ``reuters.words``.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        no token survives).
    """
    lemmas = []
    for token in reuters.words(doc_id):
        # Drop numbers, punctuation and mixed tokens.
        if not token.isalpha():
            continue
        lowered = token.lower()
        # The stop-word check happens on the lower-cased form.
        if lowered in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(lowered))
    return ' '.join(lemmas)

def _primary_category(doc_id):
    """Return the single class label used for ``doc_id``.

    A Reuters document may carry several categories, but the classifier
    below is trained on one label per document, so only the first category
    returned by the corpus reader is kept. Documents without any category
    are labelled ``'unknown'``.

    Args:
        doc_id: File ID of the document in the NLTK Reuters corpus.

    Returns:
        The first category name, or ``'unknown'`` when there is none.
    """
    # Look the categories up once instead of once for the test and once
    # for the indexing.
    categories = reuters.categories(doc_id)
    return categories[0] if categories else 'unknown'


# Cleaned text for every document in each split.
train_texts = [preprocess(doc) for doc in train_docs]
test_texts = [preprocess(doc) for doc in test_docs]

# One label per document (see _primary_category for the multi-label caveat).
train_labels = [_primary_category(doc) for doc in train_docs]
test_labels = [_primary_category(doc) for doc in test_docs]

# TF-IDF features capped at 5000 terms; the vocabulary is learned from the
# training split only and then reused for the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Fit a multinomial Naive Bayes classifier and predict the test split.
model = MultinomialNB()
model.fit(X_train, train_labels)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(test_labels, y_pred))
# zero_division=0 reports 0.00 for classes that were never predicted (the
# same value the default prints) without emitting a warning for each metric.
print("\nClassification Report:\n", classification_report(test_labels, y_pred, zero_division=0))




[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Accuracy: 0.7654852600198742

Classification Report:
                  precision    recall  f1-score   support

            acq       0.79      0.97      0.87       719
           alum       0.00      0.00      0.00        22
         barley       0.00      0.00      0.00        14
            bop       0.00      0.00      0.00        30
        carcass       0.00      0.00      0.00        17
     castor-oil       0.00      0.00      0.00         1
          cocoa       1.00      0.35      0.52        17
        coconut       0.00      0.00      0.00         2
    coconut-oil       0.00      0.00      0.00         2
         coffee       1.00      0.72      0.84        25
         copper       1.00      0.07      0.12        15
           corn       0.63      0.54      0.58        48
         cotton       0.00      0.00      0.00        14
            cpi       1.00      0.12      0.22        24
            cpu       0.00      0.00      0.00         1
          crude       0.67      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
