In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup
import re
import nltk
import string
from tqdm import tqdm
import os

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
# Load data from CSV
df = pd.read_csv("/content/drive/MyDrive/NLPCoursework/Task2/dataset.csv")

In [5]:
def preprocess_text(text):

    # Remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()

    # Remove punctuation
    translator = str.maketrans("", "", string.punctuation)
    text = text.translate(translator)

    # Convert to lowercase
    text = text.lower()

    # Perform Whitespace tokenization
    tokens = text.split()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Perform stemming
    ps = PorterStemmer()
    tokens = [ps.stem(token) for token in tokens]

    # Perform lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into a string
    processed_text = " ".join(tokens)

    return processed_text

In [6]:
# Print original data
print("Original data:")
print(df)

Original data:
                                                  Topic  \
0                                    BAHIA COCOA REVIEW   
1       DEAN FOODS &lt;DF> SEES STRONG 4TH QTR EARNINGS   
2               MAGMA LOWERS COPPER 0.75 CENT TO 66 CTS   
3         JANUARY HOUSING SALES DROP, REALTY GROUP SAYS   
4     ASSETS OF MONEY MARKET MUTUAL FUNDS ROSE 720.4...   
...                                                 ...   
5705  BANK OF JAPAN INTERVENES SOON AFTER TOKYO OPENING   
5706                  JAPAN RUBBER STOCKS FALL IN MARCH   
5707            SOUTH KOREAN WON FIXED AT 25-MONTH HIGH   
5708                  NIPPON MINING LOWERS COPPER PRICE   
5709   AUSTRALIAN UNIONS LAUNCH NEW SOUTH WALES STRIKES   

                                                   Text  Category  WordCount  \
0      Showers continued throughout the week in the ...     trade        486   
1      Dean Foods Co expects earnings for the fourth...       acq        233   
2      Magma Copper Co, a subsidiary

In [7]:
df["Combined_Text"] = df["Topic"] + " " + df["Text"]

In [8]:
# Clean and preprocess text
df["Text"] = df["Combined_Text"].apply(preprocess_text)

  soup = BeautifulSoup(text, 'html.parser')


In [10]:
df = df.drop(['Combined_Text', 'Topic'], axis=1)

In [11]:
print("After HTML removal:")
print(df)

After HTML removal:
                                                   Text  Category  WordCount  \
0     bahia cocoa review shower continu throughout w...     trade        486   
1     dean food df see strong 4th qtr earn dean food...       acq        233   
2     magma lower copper 075 cent 66 ct magma copper...  interest         27   
3     januari hous sale drop realti group say sale p...      earn         92   
4     asset money market mutual fund rose 7204 mln d...      earn         13   
...                                                 ...       ...        ...   
5705  bank japan interven soon tokyo open bank japan...      earn         81   
5706  japan rubber stock fall march japan rubber sto...       acq         55   
5707  south korean fix 25month high bank korea said ...  interest         54   
5708  nippon mine lower copper price nippon mine co ...      earn         22   
5709  australian union launch new south wale strike ...  money-fx        175   

      TokenCount  


In [12]:
print("After lowercase:")
print(df)

After lowercase:
                                                   Text  Category  WordCount  \
0     bahia cocoa review shower continu throughout w...     trade        486   
1     dean food df see strong 4th qtr earn dean food...       acq        233   
2     magma lower copper 075 cent 66 ct magma copper...  interest         27   
3     januari hous sale drop realti group say sale p...      earn         92   
4     asset money market mutual fund rose 7204 mln d...      earn         13   
...                                                 ...       ...        ...   
5705  bank japan interven soon tokyo open bank japan...      earn         81   
5706  japan rubber stock fall march japan rubber sto...       acq         55   
5707  south korean fix 25month high bank korea said ...  interest         54   
5708  nippon mine lower copper price nippon mine co ...      earn         22   
5709  australian union launch new south wale strike ...  money-fx        175   

      TokenCount  
0  

In [13]:
print("After punctuation removal:")
print(df)

After punctuation removal:
                                                   Text  Category  WordCount  \
0     bahia cocoa review shower continu throughout w...     trade        486   
1     dean food df see strong 4th qtr earn dean food...       acq        233   
2     magma lower copper 075 cent 66 ct magma copper...  interest         27   
3     januari hous sale drop realti group say sale p...      earn         92   
4     asset money market mutual fund rose 7204 mln d...      earn         13   
...                                                 ...       ...        ...   
5705  bank japan interven soon tokyo open bank japan...      earn         81   
5706  japan rubber stock fall march japan rubber sto...       acq         55   
5707  south korean fix 25month high bank korea said ...  interest         54   
5708  nippon mine lower copper price nippon mine co ...      earn         22   
5709  australian union launch new south wale strike ...  money-fx        175   

      TokenC

In [14]:
print("After stopword removal:")
print(df)

After stopword removal:
                                                   Text  Category  WordCount  \
0     bahia cocoa review shower continu throughout w...     trade        486   
1     dean food df see strong 4th qtr earn dean food...       acq        233   
2     magma lower copper 075 cent 66 ct magma copper...  interest         27   
3     januari hous sale drop realti group say sale p...      earn         92   
4     asset money market mutual fund rose 7204 mln d...      earn         13   
...                                                 ...       ...        ...   
5705  bank japan interven soon tokyo open bank japan...      earn         81   
5706  japan rubber stock fall march japan rubber sto...       acq         55   
5707  south korean fix 25month high bank korea said ...  interest         54   
5708  nippon mine lower copper price nippon mine co ...      earn         22   
5709  australian union launch new south wale strike ...  money-fx        175   

      TokenCoun

In [15]:
print("After stemming:")
print(df)

After stemming:
                                                   Text  Category  WordCount  \
0     bahia cocoa review shower continu throughout w...     trade        486   
1     dean food df see strong 4th qtr earn dean food...       acq        233   
2     magma lower copper 075 cent 66 ct magma copper...  interest         27   
3     januari hous sale drop realti group say sale p...      earn         92   
4     asset money market mutual fund rose 7204 mln d...      earn         13   
...                                                 ...       ...        ...   
5705  bank japan interven soon tokyo open bank japan...      earn         81   
5706  japan rubber stock fall march japan rubber sto...       acq         55   
5707  south korean fix 25month high bank korea said ...  interest         54   
5708  nippon mine lower copper price nippon mine co ...      earn         22   
5709  australian union launch new south wale strike ...  money-fx        175   

      TokenCount  
0   

In [16]:
print("After lemmatization:")
print(df)

After lemmatization:
                                                   Text  Category  WordCount  \
0     bahia cocoa review shower continu throughout w...     trade        486   
1     dean food df see strong 4th qtr earn dean food...       acq        233   
2     magma lower copper 075 cent 66 ct magma copper...  interest         27   
3     januari hous sale drop realti group say sale p...      earn         92   
4     asset money market mutual fund rose 7204 mln d...      earn         13   
...                                                 ...       ...        ...   
5705  bank japan interven soon tokyo open bank japan...      earn         81   
5706  japan rubber stock fall march japan rubber sto...       acq         55   
5707  south korean fix 25month high bank korea said ...  interest         54   
5708  nippon mine lower copper price nippon mine co ...      earn         22   
5709  australian union launch new south wale strike ...  money-fx        175   

      TokenCount  

In [17]:
print("After removing whitespace:")
print(df)

After removing whitespace:
                                                   Text  Category  WordCount  \
0     bahia cocoa review shower continu throughout w...     trade        486   
1     dean food df see strong 4th qtr earn dean food...       acq        233   
2     magma lower copper 075 cent 66 ct magma copper...  interest         27   
3     januari hous sale drop realti group say sale p...      earn         92   
4     asset money market mutual fund rose 7204 mln d...      earn         13   
...                                                 ...       ...        ...   
5705  bank japan interven soon tokyo open bank japan...      earn         81   
5706  japan rubber stock fall march japan rubber sto...       acq         55   
5707  south korean fix 25month high bank korea said ...  interest         54   
5708  nippon mine lower copper price nippon mine co ...      earn         22   
5709  australian union launch new south wale strike ...  money-fx        175   

      TokenC

In [19]:
# Save cleaned and preprocessed data
df.to_csv("/content/drive/MyDrive/NLPCoursework/Task2/preprocessed_training_data.csv", index=False)

In [26]:
dataset_path = r"/content/drive/MyDrive/Reuters-21578/reuters/reuters/reuters/test"
dataset_dirs = os.listdir(dataset_path)

In [48]:
test_data = []
for i in tqdm(dataset_dirs):
  file_path = f"{dataset_path}/{i}"
  try:
      with open(file_path, 'r', encoding='utf-8') as f:
          content = f.read()
          test_data.append(content)
  except UnicodeDecodeError:
      print(f"Skipping file due to decoding error: {file_path}")

 57%|█████▋    | 1722/3019 [00:02<00:01, 793.24it/s]

Skipping file due to decoding error: /content/drive/MyDrive/Reuters-21578/reuters/reuters/reuters/test/17980


100%|██████████| 3019/3019 [00:03<00:00, 788.34it/s]


In [49]:
test_data = [article.split('\n ') for article in test_data]

In [50]:
topics = [article.pop(0) for article in test_data]

In [51]:
test_data = ["".join(line) for line in test_data]

In [52]:
cat_file = r"/content/drive/MyDrive/Reuters-21578/reuters/reuters/reuters/cats.txt"

with open(cat_file, 'r') as f:
    lines = f.readlines()

lines = ["".join(x.strip('\n').split(',')) for x in lines]

paths = []
cats = []

for line in lines:
    temp_line = line.split(' ')
    paths.append(temp_line[0])
    cats.append(temp_line[1])

In [53]:
test_df = pd.DataFrame(list(zip(topics, test_data, cats)),
               columns =['Topic','Text', 'Category'])

In [55]:
test_df.head()

Unnamed: 0,Topic,Text,Category
0,SERVICE RESOURCES&lt;SRC> UNIT CUTS SORG&lt;SR...,"A group led by Chas. P. Young Co, a subsidiar...",trade
1,NO COMINCO STRIKE TALKS SCHEDULED,Cominco Ltd said no talks were scheduled with...,grain
2,SHIRMAX FASHIONS LTD &lt;SHX.M> 1ST QTR NET,qtr ended May 2 Oper shr three cts vs 11 ...,nat-gas
3,GATEWAY &lt;GMSI.O> RECINDS OFFER FOR WESTWORLD,Gateway Medical Systems Inc said it has withd...,rubber
4,UAE CENTRAL BANK CD YIELDS UNCHANGED,Yields on certificates of deposit (CDs) issue...,palm-oil


In [34]:
test_df["Combined_Text"] = test_df["Topic"] + " " + test_df["Text"]

In [35]:
test_df["Text"] = test_df["Combined_Text"].apply(preprocess_text)

  soup = BeautifulSoup(text, 'html.parser')


In [36]:
test_df = test_df.drop(['Combined_Text', 'Topic'], axis=1)

In [37]:
test_df.head()

Unnamed: 0,Text,Category
0,servic resourcessrc unit cut sorgsrg stake gro...,trade
1,cominco strike talk schedul cominco ltd said t...,grain
2,shirmax fashion ltd shxm 1st qtr net qtr end m...,nat-gas
3,gateway gmsio recind offer westworld gateway m...,rubber
4,uae central bank cd yield unchang yield certif...,palm-oil


In [39]:
counts = pd.Series(cats).value_counts()

In [45]:
selected_cats = ['acq',
 'crude',
 'earn',
 'interest',
 'money-fx',
 'trade']

In [46]:
test_df = test_df[test_df['Category'].isin(selected_cats)]

In [47]:
test_df.to_csv("/content/drive/MyDrive/NLPCoursework/Task2/preprocessed_test_data.csv", index=False)