In [2]:
#Libraries
import pandas as pd
import numpy as np
import re
import string
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
from nltk.util import ngrams
from collections import Counter
import nltk
from collections import Counter, defaultdict
import math
import pandas as pd
from typing import List, Dict

In [3]:
#Punkt tokenizer, which is used for tokenizing
nltk.download('punkt')
nltk.download('punkt_tab')

# Download stopwords dataset
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def split_train_val(input_csv_path, val_size, random_state=42):

    data = pd.read_csv(input_csv_path)

    #length of data
    total_data_len = len(data)

    print(f"Total length before splitting : {total_data_len}")

    if total_data_len <= val_size:
        raise ValueError(f"Dataset has less samples than {val_size}. Can't proceed further \n Total Training set length = {total_data_len}")

    # Create random indices for validation set
    np.random.seed(random_state)
    val_index = np.random.choice(total_data_len, val_size, replace=False)

    # Split the dataset
    validation_data = data.iloc[val_index]
    training_data = data.drop(val_index)

    return training_data, validation_data




In [5]:
training_data, validation_data = split_train_val('/kaggle/input/original-dataset/train.csv',500)
print(f"Training set size: {len(training_data)}")
print(f"Validation set size: {len(validation_data)}")

Total length before splitting : 13879
Training set size: 13379
Validation set size: 500


In [6]:
#loading test data
input_test_data='/kaggle/input/original-dataset/test.csv'
test_data = pd.read_csv(input_test_data)
print(f"Test set size: {len(test_data)}")

Test set size: 100


In [28]:

def remove_punctuation(text):
    if not isinstance(text, str):
        return str(text)

    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')

    # Sentence segmentation (keeps boundaries)
    sentences = sent_tokenize(text)

    processed_sentences = []

    for sentence in sentences:
        # Remove punctuation (except needed for sentence boundaries)
        sentence = re.sub(f"[{re.escape(string.punctuation)}]", " ", sentence)

        # Tokenize words
        words = word_tokenize(sentence)
        
    # Join sentences back into a single string
    return " ".join(words)


In [29]:
training_data['cleaned_text'] = training_data['text'].apply(remove_punctuation)
training_data['cleaned_title'] = training_data['title'].apply(remove_punctuation)

In [30]:
training_data['cleaned_title'][0]

'Port St Lucie Florida'

In [31]:
validation_data['cleaned_text'] = validation_data['text'].apply(remove_punctuation)
validation_data['cleaned_title'] = validation_data['title'].apply(remove_punctuation)

In [32]:
test_data['cleaned_text'] = test_data['text'].apply(remove_punctuation)
test_data['cleaned_title'] = test_data['title'].apply(remove_punctuation)

In [42]:
def remove_stopwords(text):
    text = str(text)

    # Define stopwords set
    stop_words = set(stopwords.words('english'))

    # Tokenize text into words
    tokens = re.findall(r"\w+", text)

    # Remove stopwords
    filtered_words = [word for word in tokens if word.lower() not in stop_words]

    # Join the words back into a single string
    return ' '.join(filtered_words)

In [43]:
training_data['stopwords_text'] = training_data['cleaned_text'].apply(remove_stopwords)
training_data['stopwords_title'] = training_data['cleaned_title'].apply(remove_stopwords)

In [44]:
training_data['stopwords_title'][0]

'Port St Lucie Florida'

In [45]:
validation_data['stopwords_text'] = validation_data['cleaned_text'].apply(remove_punctuation)
validation_data['stopwords_title'] = validation_data['cleaned_title'].apply(remove_punctuation)

In [46]:
test_data['stopwords_text'] = test_data['cleaned_text'].apply(remove_punctuation)
test_data['stopwords_title'] = test_data['cleaned_title'].apply(remove_punctuation)

In [47]:
def stem_text(text):
    """
    Performs Porter Stemming on the given text.
    """
    pstem = PorterStemmer()
    text = str(text)

    # Tokenize the text into words
    tokens = re.findall(r"\w+", text)

    # Perform stemming on each token
    stemmed_words = [pstem.stem(word) for word in tokens]

    return ' '.join(stemmed_words)


In [48]:
training_data['stem_text'] = training_data['stopwords_text'].apply(stem_text)
training_data['stem_title'] = training_data['stopwords_title'].apply(stem_text)

In [49]:
training_data['stem_title'][0]

'port st luci florida'

In [50]:
validation_data['stem_text'] = validation_data['stopwords_text'].apply(stem_text)
validation_data['stem_title'] = validation_data['stopwords_title'].apply(stem_text)

In [51]:
test_data['stem_text'] = test_data['stopwords_text'].apply(stem_text)
test_data['stem_title'] = test_data['stopwords_title'].apply(stem_text)

In [52]:
def lemmatize_text(text):
    """
    Performs WordNet Lemmatization on the given text.
    """
    lemmatizer = WordNetLemmatizer()
    text = str(text)

    # Tokenize the text into words
    tokens = re.findall(r"\w+", text)

    # Perform lemmatization on each token
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(lemmatized_words)


In [53]:
training_data['lem_text'] = training_data['stopwords_text'].apply(lemmatize_text)
training_data['lem_title'] = training_data['stopwords_title'].apply(lemmatize_text)

In [54]:
training_data['lem_title'][0] 

'Port St Lucie Florida'

In [55]:
test_data['lem_text'] = test_data['stopwords_text'].apply(lemmatize_text)
test_data['lem_title'] = test_data['stopwords_title'].apply(lemmatize_text)

In [56]:
validation_data['lem_text'] = validation_data['stopwords_text'].apply(lemmatize_text)
validation_data['lem_title'] = validation_data['stopwords_title'].apply(lemmatize_text)

In [57]:
# Save as csv file
training_data.to_csv('train_preprocessed.csv', index=False)
validation_data.to_csv('val_preprocessed.csv', index=False)
test_data.to_csv('test_preprocessed.csv', index=False)

In [None]:
# Load the CSV file
df = pd.read_csv('train_preprocessed.csv')

# Find the largest and smallest lengths in 'stem_title'
max_title_length = df['title'].str.len().max()
min_title_length = df['title'].str.len().min()

# Find the largest and smallest lengths in 'stem_text'
max_text_length = df['text'].str.len().max()
min_text_length = df['text'].str.len().min()

# Print the results
print(f"Largest length in 'title': {max_title_length}")
print(f"Smallest length in 'title': {min_title_length}")
print(f"Largest length in 'text': {max_text_length}")
print(f"Smallest length in 'text': {min_text_length}")


Largest length in 'title': 63
Smallest length in 'title': 1
Largest length in 'text': 296253
Smallest length in 'text': 320


In [None]:
# Load the CSV file
df = pd.read_csv('train_preprocessed.csv')

# Find the largest and smallest lengths in 'lem_title'
max_title_length = df['lem_title'].str.len().max()
min_title_length = df['lem_title'].str.len().min()

# Find the largest and smallest lengths in 'lem_text'
max_text_length = df['lem_text'].str.len().max()
min_text_length = df['lem_text'].str.len().min()

# Print the results
print(f"Largest length in 'lem_title': {max_title_length}")
print(f"Smallest length in 'lem_title': {min_title_length}")
print(f"Largest length in 'lem_text': {max_text_length}")
print(f"Smallest length in 'lem_text': {min_text_length}")

Largest length in 'lem_title': 58.0
Smallest length in 'lem_title': 1.0
Largest length in 'lem_text': 65429
Smallest length in 'lem_text': 5


In [None]:
# Load the CSV file
df = pd.read_csv('train_preprocessed.csv')

# Find the largest and smallest lengths in 'stem_title'
max_title_length = df['stem_title'].str.len().max()
min_title_length = df['stem_title'].str.len().min()

# Find the largest and smallest lengths in 'stem_text'
max_text_length = df['stem_text'].str.len().max()
min_text_length = df['stem_text'].str.len().min()

# Print the results
print(f"Largest length in 'stem_title': {max_title_length}")
print(f"Smallest length in 'stem_title': {min_title_length}")
print(f"Largest length in 'stem_text': {max_text_length}")
print(f"Smallest length in 'stem_text': {min_text_length}")

Largest length in 'stem_title': 55.0
Smallest length in 'stem_title': 1.0
Largest length in 'stem_text': 63165
Smallest length in 'stem_text': 4
