In [34]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup
import re
import nltk
import string
from tqdm import tqdm
import os

In [35]:
# Fetch the NLTK data used further down: the stop-word list and the WordNet
# data read by the lemmatizer. The last download's status is the cell's value.
for corpus_name in ('stopwords', 'wordnet'):
    download_ok = nltk.download(corpus_name)
download_ok

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nawee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nawee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
# Read the training and testing splits from CSV files in the working directory.
training_csv, testing_csv = "./dataset_training.csv", "./dataset_testing.csv"
df_training = pd.read_csv(training_csv)
df_testing = pd.read_csv(testing_csv)

In [37]:
# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

# Lazily filled cache so the NLTK objects are created once, not once per row.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the cached stop-word set, stemmer and lemmatizer (built on first use)."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS["stop_words"] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS["stemmer"] = PorterStemmer()
        _TEXT_TOOLS["lemmatizer"] = WordNetLemmatizer()
    return _TEXT_TOOLS


def preprocess_text(text):
    """Normalize one document into a space-separated string of stemmed tokens.

    Steps, in order: strip HTML markup, lowercase, split on whitespace, remove
    English stop words, delete punctuation, Porter-stem, WordNet-lemmatize,
    and join the tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw document text. ``None`` and float NaN (how pandas represents a
        missing cell) are treated as an empty document; any other non-string
        value is converted with ``str()``.

    Returns
    -------
    str
        The processed text; ``""`` when nothing survives the filtering.
    """
    # Missing cells arrive as None or float NaN (NaN is the only float that is
    # not equal to itself); handing them to BeautifulSoup would raise.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    tools = _get_text_tools()
    stop_words = tools["stop_words"]
    stemmer = tools["stemmer"]
    lemmatizer = tools["lemmatizer"]

    # Remove HTML tags (this also decodes entities such as &lt;)
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Lowercase and whitespace-tokenize
    raw_tokens = text.lower().split()

    tokens = []
    for raw in raw_tokens:
        # First compare the token with only its surrounding punctuation removed,
        # so stop words that contain an apostrophe ("don't", "aren't", ...) can
        # match. Deleting all punctuation first turns them into "dont"/"arent",
        # which are not in the stop-word list.
        trimmed = raw.strip(string.punctuation)
        if not trimmed or trimmed in stop_words:
            continue

        # Then delete the remaining (internal) punctuation and check again, so
        # every token that was filtered before this fix is still filtered.
        cleaned = trimmed.translate(_PUNCT_TABLE)
        if cleaned in stop_words:
            continue
        tokens.append(cleaned)

    # Stem, then lemmatize the stems (both steps kept so existing outputs for
    # ordinary tokens are unchanged).
    tokens = [lemmatizer.lemmatize(stemmer.stem(token)) for token in tokens]

    # Join tokens back into a string
    return " ".join(tokens)

In [38]:
# Display the training frame as loaded, before any columns are merged or dropped.
df_training

Unnamed: 0,Topic,Text,Category,Split,WordCount,TokenCount
0,ARGENTINE 1986/87 GRAIN/OILSEED REGISTRATIONS,Argentine grain board figures show crop regis...,wheat,training,186,268
1,CHAMPION PRODUCTS &lt;CH> APPROVES STOCK SPLIT,Champion Products Inc said its board of direc...,earn,training,57,60
2,COMPUTER TERMINAL SYSTEMS &lt;CPML> COMPLETES ...,Computer Terminal Systems Inc said it has com...,acq,training,207,236
3,COBANCO INC &lt;CBCO> YEAR NET,"Shr 34 cts vs 1.19 dlrs Net 807,000 vs 2,...",earn,training,50,54
4,OHIO MATTRESS &lt;OMT> MAY HAVE LOWER 1ST QTR NET,"Ohio Mattress Co said its first quarter, endi...",acq,training,130,146
...,...,...,...,...,...,...
6144,POEHL WARNS AGAINST FURTHER DOLLAR FALL,Bundesbank President Karl Otto Poehl said a w...,money-supply,training,452,502
6145,Bank of Japan buys dollars shortly after openi...,Bank of Japan buys dollars shortly after open...,money-fx,training,14,14
6146,BANK OF JAPAN INTERVENES SOON AFTER TOKYO OPENING,The Bank of Japan bought a small amount of do...,money-fx,training,81,90
6147,SOUTH KOREAN WON FIXED AT 25-MONTH HIGH,THE BANK OF KOREA SAID IT FIXED THE MIDRATE O...,money-fx,training,54,61


In [39]:
# Fold the headline (Topic) into the body (Text) so both feed the same feature.
# - fillna(''): a missing value on either side would otherwise turn the whole
#   concatenation into NaN and lose the other half of the document.
# - ' ' separator: without it the last headline word and the first body word
#   only stay apart if Text happens to begin with whitespace.
df_training['Text'] = (
    df_training['Topic'].fillna('') + ' ' + df_training['Text'].fillna('')
)

# Drop the columns that are no longer needed, in a single call.
df_training = df_training.drop(columns=['Topic', 'Split', 'WordCount', 'TokenCount'])
df_training

Unnamed: 0,Text,Category
0,ARGENTINE 1986/87 GRAIN/OILSEED REGISTRATIONS ...,wheat
1,CHAMPION PRODUCTS &lt;CH> APPROVES STOCK SPLIT...,earn
2,COMPUTER TERMINAL SYSTEMS &lt;CPML> COMPLETES ...,acq
3,COBANCO INC &lt;CBCO> YEAR NET Shr 34 cts vs 1...,earn
4,OHIO MATTRESS &lt;OMT> MAY HAVE LOWER 1ST QTR ...,acq
...,...,...
6144,POEHL WARNS AGAINST FURTHER DOLLAR FALL Bundes...,money-supply
6145,Bank of Japan buys dollars shortly after openi...,money-fx
6146,BANK OF JAPAN INTERVENES SOON AFTER TOKYO OPEN...,money-fx
6147,SOUTH KOREAN WON FIXED AT 25-MONTH HIGH THE BA...,money-fx


In [40]:
# Fold the headline (Topic) into the body (Text) so both feed the same feature.
# - fillna(''): a missing value on either side would otherwise turn the whole
#   concatenation into NaN and lose the other half of the document.
# - ' ' separator: without it the last headline word and the first body word
#   only stay apart if Text happens to begin with whitespace.
df_testing['Text'] = (
    df_testing['Topic'].fillna('') + ' ' + df_testing['Text'].fillna('')
)

# Drop the columns that are no longer needed, in a single call.
df_testing = df_testing.drop(columns=['Topic', 'Split', 'WordCount', 'TokenCount'])
df_testing

Unnamed: 0,Text,Category
0,ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...,trade
1,AUSTRALIAN FOREIGN SHIP BAN ENDS BUT NSW PORTS...,ship
2,SRI LANKA GETS USDA APPROVAL FOR WHEAT PRICE F...,wheat
3,SUMITOMO BANK AIMS AT QUICK RECOVERY FROM MERG...,acq
4,BUNDESBANK ALLOCATES 6.1 BILLION MARKS IN TEND...,interest
...,...,...
2405,PHILIPPINE TRADE GAP WIDENS IN JANUARY-AUGUST ...,trade
2406,NEW ZEALAND IMPOSES SANCTIONS AGAINST FIJI New...,sugar
2407,"IRAN, SOVIET UNION TO SWAP CRUDE, REFINED PROD...",crude
2408,N.Z.'S CHASE CORP MAKES OFFER FOR ENTREGROWTH ...,acq


In [41]:
# Run the text-cleaning function over the Text column of both splits,
# overwriting the column in place on each frame.
for frame in (df_training, df_testing):
    frame['Text'] = frame['Text'].map(preprocess_text)

  soup = BeautifulSoup(text, 'html.parser')


In [42]:
# Display the training frame after preprocess_text has been applied to Text.
df_training

Unnamed: 0,Text,Category
0,argentin 198687 grainoilse registr argentin gr...,wheat
1,champion product ch approv stock split champio...,earn
2,comput termin system cpml complet sale comput ...,acq
3,cobanco inc cbco year net shr 34 ct v 119 dlr ...,earn
4,ohio mattress omt may lower 1st qtr net ohio m...,acq
...,...,...
6144,poehl warn dollar fall bundesbank presid karl ...,money-supply
6145,bank japan buy dollar shortli open around 1453...,money-fx
6146,bank japan interven soon tokyo open bank japan...,money-fx
6147,south korean fix 25month high bank korea said ...,money-fx


In [43]:
# Write both preprocessed splits to CSV without the DataFrame index.
frames_by_path = {
    "./preprocessed_training_data.csv": df_training,
    "./preprocessed_testing_data.csv": df_testing,
}
for csv_path, frame in frames_by_path.items():
    frame.to_csv(csv_path, index=False)