In [38]:
#Importing Packages

# Importing Pandas for data manipulation and analysis
import pandas as pd

# Importing NumPy for numerical operations
import numpy as np

# Importing Matplotlib for data visualization
import matplotlib.pyplot as plt

# Importing Seaborn for advanced data visualization
import seaborn as sns

# Importing IPython.display for displaying rich media in Jupyter Notebooks
from IPython.display import display, Image

import nltk
from nltk.corpus import stopwords


<h3 style=\"color:green;\">DATA LOADING</h3>
    
    Firstly, import the necessary packages. When the required packages are imported, for loading a CSV file, this Project will utilise Pandas to load the data.

In [39]:
# Load the CSV file
train_df = pd.read_csv('Data/processed/train.csv')
test_df = pd.read_csv('Data/processed/test.csv')

In [40]:
train_df.head()

Unnamed: 0,headlines,description,content,url,category
0,RBI revises definition of politically-exposed ...,The central bank has also asked chairpersons a...,The Reserve Bank of India (RBI) has changed th...,https://indianexpress.com/article/business/ban...,business
1,NDTV Q2 net profit falls 57.4% to Rs 5.55 cror...,NDTV's consolidated revenue from operations wa...,Broadcaster New Delhi Television Ltd on Monday...,https://indianexpress.com/article/business/com...,business
2,"Akasa Air ‘well capitalised’, can grow much fa...",The initial share sale will be open for public...,Homegrown server maker Netweb Technologies Ind...,https://indianexpress.com/article/business/mar...,business
3,India’s current account deficit declines sharp...,The current account deficit (CAD) was 3.8 per ...,India’s current account deficit declined sharp...,https://indianexpress.com/article/business/eco...,business
4,"States borrowing cost soars to 7.68%, highest ...",The prices shot up reflecting the overall high...,States have been forced to pay through their n...,https://indianexpress.com/article/business/eco...,business


<h3 style=\"color:green;\">PRE-PROCESSING</h3>
    
- Remove unnecessary columns (url) since it may not contribute to categorization.
- *Text Cleaning*: Convert text to lowercase, remove punctuation, stopwords, and special characters.
- *Tokenization*: Split text into individual words or phrases.
- *Vectorization*: Convert text into numerical form using TF-IDF or CountVectorizer.

In [41]:
import re


# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

#Remove unnecessary columns
#train_df = train_df.drop('url', axis=1)
#test_df = test_df.drop('url', axis=1)

#Text Cleaning
#Convert text to lowercase
#def clean_text(text):
    # Remove punctuation and special characters
    #text = re.sub(r'[^a-z\s]', '', text)
    # Remove stopwords
    #text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    #return ' '.join(text)

#def preprocess2(v):
    # v = v.str.lower().str.replace('[^a-z\s]', '').str.split(expand=True)

    # return v.where(~v.isin(stopwords) & v.notnull(), '')\
             #.agg(' '.join, axis=1)\
             #.str.replace('\s+', ' ')\
             #.str.strip()

#c = ['headlines', 'description', 'content'] # columns to operate
#cc = ['cleaned_headlines', 'cleaned_description', 'cleaned_content'] # columns to operate
#train_df[c] = train_df[c].apply(preprocess2, axis=0)


# Function to clean text
def clean_text(text):
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove numbers from the text
    text = re.sub(r'\d+', ' ', text)
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text

# Apply the function to the DataFrame
train_df['cleaned_headlines'] = train_df['headlines'].apply(clean_text)
train_df['cleaned_description'] = train_df['description'].apply(clean_text)
train_df['cleaned_content'] = train_df['content'].apply(clean_text)
train_df = train_df.drop('content', axis=1)
train_df = train_df.drop('description', axis=1)
train_df = train_df.drop('headlines', axis=1)
display(train_df['cleaned_headlines'].iloc[0])


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'RBI revises definition politically exposed persons KYC purpose'

In [37]:
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
nltk.download('punkt')

# Function to tokenize text
def tokenize_text(text):
    return word_tokenize(text)

# Apply the function to the DataFrame
train_df['tokenized_headlines'] = train_df['cleaned_headlines'].apply(tokenize_text)
train_df['tokenized_description'] = train_df['cleaned_description'].apply(tokenize_text)
train_df['tokenized_content'] = train_df['cleaned_content'].apply(tokenize_text)
train_df = train_df.drop('cleaned_content', axis=1)
train_df = train_df.drop('cleaned_description', axis=1)
train_df = train_df.drop('cleaned_headlines', axis=1)

display(train_df)

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...


[nltk_data]   Unzipping tokenizers/punkt.zip.


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/codespace/nltk_data'
    - '/home/codespace/.python/current/nltk_data'
    - '/home/codespace/.python/current/share/nltk_data'
    - '/home/codespace/.python/current/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [8]:
train_df.head()
test_df.head()

Unnamed: 0,headlines,description,content,category
0,NLC India wins contract for power supply to Ra...,State-owned firm NLC India Ltd (NLCIL) on Mond...,State-owned firm NLC India Ltd (NLCIL) on Mond...,business
1,SBI Clerk prelims exams dates announced; admit...,SBI Clerk Prelims Exam: The SBI Clerk prelims ...,SBI Clerk Prelims Exam: The State Bank of Indi...,education
2,"Golden Globes: Michelle Yeoh, Will Ferrell, An...","Barbie is the top nominee this year, followed ...","Michelle Yeoh, Will Ferrell, Angela Bassett an...",entertainment
3,"OnePlus Nord 3 at Rs 27,999 as part of new pri...",New deal makes the OnePlus Nord 3 an easy purc...,"In our review of the OnePlus Nord 3 5G, we pra...",technology
4,Adani family’s partners used ‘opaque’ funds to...,Citing review of files from multiple tax haven...,Millions of dollars were invested in some publ...,business
