In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import re
import warnings
import unicodedata

warnings.filterwarnings("ignore")
TOKENIZERS_PARALLELISM=True

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
df = pd.read_csv("../the-reddit-climate-change-dataset-comments.csv")

: 

: 

In [None]:
def reduce_dataset_balanced(input_file, output_file, target_size=50000):
    """
    Reduce a dataset to a target size while maintaining balanced sentiment distribution.

    Args:
        input_file (str): Path to input CSV file
        output_file (str): Path to save reduced CSV file
        target_size (int): Desired number of records in output (default: 50000)
    """

    # Load the dataset
    df = pd.read_csv(input_file)

    # Categorize sentiment
    df['sentiment_category'] = np.where(
        df['sentiment'] < 0, 'negative',
        np.where(df['sentiment'] == 0, 'neutral', 'positive')
    )

    # Calculate target size for each category (equal distribution)
    category_size = target_size // 3

    # Sample from each category
    samples = []
    for category in ['negative', 'neutral', 'positive']:
        category_df = df[df['sentiment_category'] == category]

        # If category has fewer samples than needed, take all
        n_samples = min(category_size, len(category_df))

        # Random sample without replacement
        sample = category_df.sample(n=n_samples, random_state=42)
        samples.append(sample)

    # Combine samples
    reduced_df = pd.concat(samples)

    # If total is less than target due to rounding, sample more from largest category
    if len(reduced_df) < target_size:
        remaining = target_size - len(reduced_df)
        # Find largest category
        counts = reduced_df['sentiment_category'].value_counts()
        largest_category = counts.idxmax()
        # Get additional samples from largest category
        category_df = df[df['sentiment_category'] == largest_category]
        # Exclude already sampled rows
        category_df = category_df[~category_df.index.isin(reduced_df.index)]
        additional_samples = category_df.sample(n=remaining, random_state=42)
        reduced_df = pd.concat([reduced_df, additional_samples])

    # Remove temporary column and shuffle
    reduced_df = reduced_df.drop(columns=['sentiment_category'])
    reduced_df = reduced_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Save to CSV
    reduced_df.to_csv(output_file, index=False)
    print(f"Saved balanced dataset with {len(reduced_df)} records to {output_file}")
    print("Sentiment distribution:")
    print(reduced_df['sentiment'].apply(
        lambda x: 'negative' if x < 0 else 'neutral' if x == 0 else 'positive'
    ).value_counts())

reduce_dataset_balanced('../the-reddit-climate-change-dataset-comments.csv', '../reduced_dataset.csv', 50000)

Saved balanced dataset with 50000 records to ../reduced_dataset.csv
Sentiment distribution:
sentiment
negative    16668
positive    16666
neutral     16666
Name: count, dtype: int64


: 

: 

In [None]:
df = pd.read_csv("../reduced_dataset.csv")

: 

: 

In [None]:
 #1. Check for null values
print("Null values in each column:")
print(df.isnull().sum())

# 2. Check for duplicate rows
print("\nNumber of duplicate rows:")
print(df.duplicated().sum())

# 3. Drop the unnecessary columns
df = df.drop(columns=['type', 'id', 'subreddit.id', 'subreddit.name','subreddit.nsfw', 'created_utc', 'permalink', 'score'])

Null values in each column:
type                0
id                  0
subreddit.id        0
subreddit.name      0
subreddit.nsfw      0
created_utc         0
permalink           0
body                0
sentiment         451
score               0
dtype: int64

Number of duplicate rows:
0


: 

: 

In [None]:
def remove_accented_chars(text):
    if not isinstance(text, str):
        return text
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def remove_links(text):
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

def remove_symbols(text):
    return re.sub(r'[^A-Za-z\s]', '', text)


df['body'] = df['body'].apply(remove_accented_chars)
df['body'] = df['body'].apply(remove_links)
df['body'] = df['body'].apply(remove_symbols)

#to check if links were removed
print("\nNumber of comments containing links:")
(df['body'].str.contains("http").sum())


Number of comments containing links:


0

: 

: 

In [None]:
#to check if symbols were removed
symbol_rows = df[df['body'].str.contains(r'[^A-Za-z\s]', regex=True)]
print(f"Number of rows with symbols in 'body': {len(symbol_rows)}")


Number of rows with symbols in 'body': 0


: 

: 

In [None]:
nltk.download('punkt_tab')
df['tokenized_body'] = df['body'].apply(lambda x: word_tokenize(str(x)))

df.head()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\NJ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,body,sentiment,tokenized_body
0,People need to do this kind of thing more ofte...,0.861,"[People, need, to, do, this, kind, of, thing, ..."
1,Thats cute if things dont reverse course in ou...,-0.1759,"[Thats, cute, if, things, dont, reverse, cours..."
2,Whats interesting is that you are arguing with...,-0.0896,"[Whats, interesting, is, that, you, are, argui..."
3,i can agree with that a lot of the media is ow...,-0.8176,"[i, can, agree, with, that, a, lot, of, the, m..."
4,The rising seas are due to global warming but ...,0.0772,"[The, rising, seas, are, due, to, global, warm..."


: 

: 

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Map POS tag
def get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN,
                "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# Lemmatization function
def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]

# Apply lemmatization directly (tokenized_body is already a list)
df['lemmatized_body'] = df['tokenized_body'].apply(lemmatize_tokens)

df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\NJ\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\NJ\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\NJ\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger_eng[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger_eng')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger_eng/[0m

  Searched in:
    - 'C:\\Users\\NJ/nltk_data'
    - 'c:\\Users\\NJ\\AppData\\Local\\Programs\\Python\\Python311\\nltk_data'
    - 'c:\\Users\\NJ\\AppData\\Local\\Programs\\Python\\Python311\\share\\nltk_data'
    - 'c:\\Users\\NJ\\AppData\\Local\\Programs\\Python\\Python311\\lib\\nltk_data'
    - 'C:\\Users\\NJ\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


: 

: 

In [None]:
df = df.drop(columns=['tokenized_body'])

: 

: 

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# Load the stopwords
STOPWORDS = set(stopwords.words('english'))

# Contraction expansion mapping
CONTRACTIONS = {
    r"won't": "will not",
    r"can't": "can not",
    r"n't": " not",  # general case (doesn't → does not)
    r"'re": " are",
    r"'s": " is",    # possessive 's → " is" (optional: keep as "'s" removal only)
    r"'d": " would",
    r"'ll": " will",
    r"'ve": " have",
    r"'m": " am"
}

def expand_contractions(token):
    """Expand contractions in a token (e.g., "won't" → "will not")"""
    for contraction, expansion in CONTRACTIONS.items():
        token = re.sub(contraction, expansion, token)
    return token

def clean_and_remove_stopwords(token_list):
    """
    Custom function to:
    1. Expand contractions (e.g., "won't" → "will not")
    2. Remove non-word tokens (symbols, punctuation)
    3. Remove stopwords and possessive 's
    """
    cleaned_tokens = []
    for token in token_list:
        # Skip if the token is not alphabetic (allows apostrophes for contractions)
        if not re.fullmatch(r"[a-zA-Z'-]+", str(token)):
            continue

        # Step 1: Expand contractions (e.g., "I'm" → "I am")
        token = expand_contractions(token)

        # Step 2: Split into subtokens if contraction expansion added spaces (e.g., "will not" → ["will", "not"])
        subtokens = token.split()

        for subtoken in subtokens:
            # Remove possessive 's (e.g., "world's" → "world")
            subtoken = re.sub(r"'s$", "", subtoken)

            # Skip if subtoken is empty after processing
            if not subtoken:
                continue

            # Convert to lowercase and check if it's a stopword
            subtoken_lower = subtoken.lower()
            if subtoken_lower not in STOPWORDS:
                cleaned_tokens.append(subtoken)

    return cleaned_tokens

df['lemmatized_body'] = df['lemmatized_body'].apply(clean_and_remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


: 

: 

In [None]:
df.head()

Unnamed: 0,body,sentiment,lemmatized_body
0,People need to do this kind of thing more ofte...,0.861,"[People, need, kind, thing, often, obviously, ..."
1,That's cute; if things don't reverse course in...,-0.1759,"[cute, thing, reverse, course, world, climate,..."
2,What's interesting is that you are arguing wit...,-0.0896,"[interest, argue, statement, almost, scientist..."
3,i can agree with that. a lot of the media is o...,-0.8176,"[agree, lot, medium, large, part, billionaire,..."
4,"The rising seas are due to global warming, but...",0.0772,"[rise, sea, due, global, warm, flood, probably..."


: 

: 

In [None]:
# Convert string representation of list to actual list if needed
if isinstance(df['lemmatized_body'].iloc[0], str):
    df['lemmatized_body'] = df['lemmatized_body'].apply(ast.literal_eval)

# Convert all words in lemmatized_body lists to lowercase
df['lemmatized_body'] = df['lemmatized_body'].apply(lambda lst: [word.lower() for word in lst])

# Rename lemmatized_body to preprocessed_body
df = df.rename(columns={'lemmatized_body': 'preprocessed_body'})

# Reorder columns to put body first
df = df[['body', 'preprocessed_body', 'sentiment']]

# Save the processed DataFrame back to CSV to be used for training
df.to_csv('../preprocessed.csv', index=False)

: 

: 

In [None]:
df.head()

Unnamed: 0,body,preprocessed_body,sentiment
0,People need to do this kind of thing more ofte...,"[people, need, kind, thing, often, obviously, ...",0.861
1,That's cute; if things don't reverse course in...,"[cute, thing, reverse, course, world, climate,...",-0.1759
2,What's interesting is that you are arguing wit...,"[interest, argue, statement, almost, scientist...",-0.0896
3,i can agree with that. a lot of the media is o...,"[agree, lot, medium, large, part, billionaire,...",-0.8176
4,"The rising seas are due to global warming, but...","[rise, sea, due, global, warm, flood, probably...",0.0772


: 

: 