In [9]:
import json
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import re


# Fetch the NLTK resources used by the tokenizer, lemmatizer and stop-word
# filter below. Order matches the original individual download calls.
for resource in ('punkt', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(resource)

# Read data from JSONL file: one JSON document per line.
data = []
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open("emotion_dataset.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        # Skip blank lines (e.g. an empty line at the end of the file);
        # json.loads would reject them with a JSONDecodeError.
        if not line:
            continue
        data.append(json.loads(line))

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Stop-word set, stemmer and lemmatizer shared by every preprocess_text call.
# Filled lazily on first use so that nothing is read from the NLTK corpora
# when this code is merely defined.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached stop-word set, stemmer and lemmatizer, building them once."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES.update(
            stop_words=frozenset(stopwords.words('english')),
            stemmer=PorterStemmer(),
            lemmatizer=WordNetLemmatizer(),
        )
    return _PREPROCESS_RESOURCES


# Function to preprocess text
def preprocess_text(text):
    """Normalize *text* and return its stemmed and lemmatized tokens.

    The text has its punctuation deleted, is lowercased, tokenized with
    ``word_tokenize`` and stripped of English stop words. The surviving
    tokens are then stemmed with ``PorterStemmer`` and, independently,
    lemmatized with ``WordNetLemmatizer``; both results are derived from the
    same filtered token list, so they have the same length and order.

    Args:
        text: The raw input string.

    Returns:
        A ``(stemmed_tokens, lemmatized_tokens)`` tuple of two lists.
    """
    resources = _get_preprocess_resources()
    stop_words = resources["stop_words"]
    stemmer = resources["stemmer"]
    lemmatizer = resources["lemmatizer"]

    # Remove punctuation, then lowercase.
    # NOTE: punctuation is deleted rather than replaced by a space, so
    # "well-known" becomes "wellknown" and "don't" becomes "dont".
    text = text.translate(_PUNCTUATION_TABLE).lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stop words
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Stemming and lemmatization both start from the filtered tokens; the
    # lemmatizer does not see the stemmed output.
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return stemmed_tokens, lemmatized_tokens

# Preview the preprocessing on a fixed number of entries, taken in file order.
NUM_PREVIEWS = 100

if not data:
    # With no entries, `i % len(data)` below would raise ZeroDivisionError.
    print("No entries loaded; nothing to preprocess.")
else:
    for i in range(NUM_PREVIEWS):
        # Walk the entries sequentially (not randomly), wrapping around to the
        # start when the dataset holds fewer than NUM_PREVIEWS entries.
        entry = data[i % len(data)]

        text = entry["text"]
        print(i, "=> Original Text:", text)

        # Preprocess text
        stemmed_tokens, lemmatized_tokens = preprocess_text(text)

        print("Stemmed Tokens:", stemmed_tokens)
        print("Lemmatized Tokens:", lemmatized_tokens)
        print()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0 => Original Text: i feel awful about it too because it s my job to get him in a position to succeed and it just didn t happen here
Stemmed Tokens: ['feel', 'aw', 'job', 'get', 'posit', 'succeed', 'happen']
Lemmatized Tokens: ['feel', 'awful', 'job', 'get', 'position', 'succeed', 'happen']

1 => Original Text: im alone i feel awful
Stemmed Tokens: ['im', 'alon', 'feel', 'aw']
Lemmatized Tokens: ['im', 'alone', 'feel', 'awful']

2 => Original Text: ive probably mentioned this before but i really do feel proud of myself for actually keeping up with my new years resolution of monthly and weekly goals
Stemmed Tokens: ['ive', 'probabl', 'mention', 'realli', 'feel', 'proud', 'actual', 'keep', 'new', 'year', 'resolut', 'monthli', 'weekli', 'goal']
Lemmatized Tokens: ['ive', 'probably', 'mentioned', 'really', 'feel', 'proud', 'actually', 'keeping', 'new', 'year', 'resolution', 'monthly', 'weekly', 'goal']

3 => Original Text: i was feeling a little low few days back
Stemmed Tokens: ['feel', '