In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup
import re
import nltk
import string

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
# Load data from CSV
df = pd.read_csv("/content/drive/MyDrive/NLPCoursework/Task2/dataset.csv")

In [5]:
def preprocess_text(text):

    # Remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()

    # Remove punctuation
    translator = str.maketrans("", "", string.punctuation)
    text = text.translate(translator)

    # Convert to lowercase
    text = text.lower()

    # Perform Whitespace tokenization
    tokens = text.split()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Perform stemming
    ps = PorterStemmer()
    tokens = [ps.stem(token) for token in tokens]

    # Perform lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into a string
    processed_text = " ".join(tokens)

    return processed_text

In [6]:
# Print original data
print("Original data:")
print(df)

Original data:
                                                  Topic  \
0      ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT   
1     AUSTRALIAN FOREIGN SHIP BAN ENDS BUT NSW PORTS...   
2          SRI LANKA GETS USDA APPROVAL FOR WHEAT PRICE   
3      SUMITOMO BANK AIMS AT QUICK RECOVERY FROM MERGER   
4      BUNDESBANK ALLOCATES 6.1 BILLION MARKS IN TENDER   
...                                                 ...   
8554            POEHL WARNS AGAINST FURTHER DOLLAR FALL   
8555  Bank of Japan buys dollars shortly after openi...   
8556  BANK OF JAPAN INTERVENES SOON AFTER TOKYO OPENING   
8557            SOUTH KOREAN WON FIXED AT 25-MONTH HIGH   
8558   AUSTRALIAN UNIONS LAUNCH NEW SOUTH WALES STRIKES   

                                                   Text      Category  \
0      Mounting trade friction between the U.S. And ...         trade   
1      Tug crews in New South Wales (NSW), Victoria ...          ship   
2      Food Department officials said the U.S. Depar...  

In [9]:
# Clean and preprocess text
df["Text"] = df["Text"].apply(preprocess_text)

In [10]:
print("After HTML removal:")
print(df)

After HTML removal:
                                                  Topic  \
0      ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT   
1     AUSTRALIAN FOREIGN SHIP BAN ENDS BUT NSW PORTS...   
2          SRI LANKA GETS USDA APPROVAL FOR WHEAT PRICE   
3      SUMITOMO BANK AIMS AT QUICK RECOVERY FROM MERGER   
4      BUNDESBANK ALLOCATES 6.1 BILLION MARKS IN TENDER   
...                                                 ...   
8554            POEHL WARNS AGAINST FURTHER DOLLAR FALL   
8555  Bank of Japan buys dollars shortly after openi...   
8556  BANK OF JAPAN INTERVENES SOON AFTER TOKYO OPENING   
8557            SOUTH KOREAN WON FIXED AT 25-MONTH HIGH   
8558   AUSTRALIAN UNIONS LAUNCH NEW SOUTH WALES STRIKES   

                                                   Text      Category  \
0     mount trade friction u japan rai fear among ma...         trade   
1     tug crew new south wale nsw victoria western a...          ship   
2     food depart offici said u depart agricultur ap

In [11]:
print("After lowercase:")
print(df)

After lowercase:
                                                  Topic  \
0      ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT   
1     AUSTRALIAN FOREIGN SHIP BAN ENDS BUT NSW PORTS...   
2          SRI LANKA GETS USDA APPROVAL FOR WHEAT PRICE   
3      SUMITOMO BANK AIMS AT QUICK RECOVERY FROM MERGER   
4      BUNDESBANK ALLOCATES 6.1 BILLION MARKS IN TENDER   
...                                                 ...   
8554            POEHL WARNS AGAINST FURTHER DOLLAR FALL   
8555  Bank of Japan buys dollars shortly after openi...   
8556  BANK OF JAPAN INTERVENES SOON AFTER TOKYO OPENING   
8557            SOUTH KOREAN WON FIXED AT 25-MONTH HIGH   
8558   AUSTRALIAN UNIONS LAUNCH NEW SOUTH WALES STRIKES   

                                                   Text      Category  \
0     mount trade friction u japan rai fear among ma...         trade   
1     tug crew new south wale nsw victoria western a...          ship   
2     food depart offici said u depart agricultur ap...

In [12]:
print("After punctuation removal:")
print(df)

After punctuation removal:
                                                  Topic  \
0      ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT   
1     AUSTRALIAN FOREIGN SHIP BAN ENDS BUT NSW PORTS...   
2          SRI LANKA GETS USDA APPROVAL FOR WHEAT PRICE   
3      SUMITOMO BANK AIMS AT QUICK RECOVERY FROM MERGER   
4      BUNDESBANK ALLOCATES 6.1 BILLION MARKS IN TENDER   
...                                                 ...   
8554            POEHL WARNS AGAINST FURTHER DOLLAR FALL   
8555  Bank of Japan buys dollars shortly after openi...   
8556  BANK OF JAPAN INTERVENES SOON AFTER TOKYO OPENING   
8557            SOUTH KOREAN WON FIXED AT 25-MONTH HIGH   
8558   AUSTRALIAN UNIONS LAUNCH NEW SOUTH WALES STRIKES   

                                                   Text      Category  \
0     mount trade friction u japan rai fear among ma...         trade   
1     tug crew new south wale nsw victoria western a...          ship   
2     food depart offici said u depart agricu

In [13]:
print("After stopword removal:")
print(df)

After stopword removal:
                                                  Topic  \
0      ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT   
1     AUSTRALIAN FOREIGN SHIP BAN ENDS BUT NSW PORTS...   
2          SRI LANKA GETS USDA APPROVAL FOR WHEAT PRICE   
3      SUMITOMO BANK AIMS AT QUICK RECOVERY FROM MERGER   
4      BUNDESBANK ALLOCATES 6.1 BILLION MARKS IN TENDER   
...                                                 ...   
8554            POEHL WARNS AGAINST FURTHER DOLLAR FALL   
8555  Bank of Japan buys dollars shortly after openi...   
8556  BANK OF JAPAN INTERVENES SOON AFTER TOKYO OPENING   
8557            SOUTH KOREAN WON FIXED AT 25-MONTH HIGH   
8558   AUSTRALIAN UNIONS LAUNCH NEW SOUTH WALES STRIKES   

                                                   Text      Category  \
0     mount trade friction u japan rai fear among ma...         trade   
1     tug crew new south wale nsw victoria western a...          ship   
2     food depart offici said u depart agricultu

In [14]:
print("After stemming:")
print(df)

After stemming:
                                                  Topic  \
0      ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT   
1     AUSTRALIAN FOREIGN SHIP BAN ENDS BUT NSW PORTS...   
2          SRI LANKA GETS USDA APPROVAL FOR WHEAT PRICE   
3      SUMITOMO BANK AIMS AT QUICK RECOVERY FROM MERGER   
4      BUNDESBANK ALLOCATES 6.1 BILLION MARKS IN TENDER   
...                                                 ...   
8554            POEHL WARNS AGAINST FURTHER DOLLAR FALL   
8555  Bank of Japan buys dollars shortly after openi...   
8556  BANK OF JAPAN INTERVENES SOON AFTER TOKYO OPENING   
8557            SOUTH KOREAN WON FIXED AT 25-MONTH HIGH   
8558   AUSTRALIAN UNIONS LAUNCH NEW SOUTH WALES STRIKES   

                                                   Text      Category  \
0     mount trade friction u japan rai fear among ma...         trade   
1     tug crew new south wale nsw victoria western a...          ship   
2     food depart offici said u depart agricultur ap... 

In [15]:
print("After lemmatization:")
print(df)

After lemmatization:
                                                  Topic  \
0      ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT   
1     AUSTRALIAN FOREIGN SHIP BAN ENDS BUT NSW PORTS...   
2          SRI LANKA GETS USDA APPROVAL FOR WHEAT PRICE   
3      SUMITOMO BANK AIMS AT QUICK RECOVERY FROM MERGER   
4      BUNDESBANK ALLOCATES 6.1 BILLION MARKS IN TENDER   
...                                                 ...   
8554            POEHL WARNS AGAINST FURTHER DOLLAR FALL   
8555  Bank of Japan buys dollars shortly after openi...   
8556  BANK OF JAPAN INTERVENES SOON AFTER TOKYO OPENING   
8557            SOUTH KOREAN WON FIXED AT 25-MONTH HIGH   
8558   AUSTRALIAN UNIONS LAUNCH NEW SOUTH WALES STRIKES   

                                                   Text      Category  \
0     mount trade friction u japan rai fear among ma...         trade   
1     tug crew new south wale nsw victoria western a...          ship   
2     food depart offici said u depart agricultur a

In [16]:
print("After removing whitespace:")
print(df)

After removing whitespace:
                                                  Topic  \
0      ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT   
1     AUSTRALIAN FOREIGN SHIP BAN ENDS BUT NSW PORTS...   
2          SRI LANKA GETS USDA APPROVAL FOR WHEAT PRICE   
3      SUMITOMO BANK AIMS AT QUICK RECOVERY FROM MERGER   
4      BUNDESBANK ALLOCATES 6.1 BILLION MARKS IN TENDER   
...                                                 ...   
8554            POEHL WARNS AGAINST FURTHER DOLLAR FALL   
8555  Bank of Japan buys dollars shortly after openi...   
8556  BANK OF JAPAN INTERVENES SOON AFTER TOKYO OPENING   
8557            SOUTH KOREAN WON FIXED AT 25-MONTH HIGH   
8558   AUSTRALIAN UNIONS LAUNCH NEW SOUTH WALES STRIKES   

                                                   Text      Category  \
0     mount trade friction u japan rai fear among ma...         trade   
1     tug crew new south wale nsw victoria western a...          ship   
2     food depart offici said u depart agricu

In [17]:
# Save cleaned and preprocessed data
df.to_csv("/content/drive/MyDrive/NLPCoursework/Task2/preprocessed_data.csv", index=False)