In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import re
import warnings
import unicodedata

warnings.filterwarnings("ignore")
TOKENIZERS_PARALLELISM=True


In [6]:
df = pd.read_csv("../the-reddit-climate-change-dataset-comments.csv")

In [7]:
# 1. Check for null values
print("Null values in each column:")
print(df.isnull().sum())

# 2. Check for duplicate rows
print("\nNumber of duplicate rows:")
print(df.duplicated().sum())

# 3. Drop the unnecessary columns
df = df.drop(columns=['type', 'id', 'subreddit.id', 'subreddit.name','subreddit.nsfw', 'created_utc', 'permalink', 'score'])

Null values in each column:
type                  0
id                    0
subreddit.id          0
subreddit.name        0
subreddit.nsfw        0
created_utc           0
permalink             0
body                  0
sentiment         57131
score                 0
dtype: int64

Number of duplicate rows:
0


In [8]:
def remove_accented_chars(text):
    if not isinstance(text, str):
        return text
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

df['body'] = df['body'].apply(remove_accented_chars)


In [9]:
nltk.download('punkt_tab')
df['tokenized_body'] = df['body'].apply(lambda x: word_tokenize(str(x)))

df.head()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,body,sentiment,tokenized_body
0,Yeah but what the above commenter is saying is...,0.5719,"[Yeah, but, what, the, above, commenter, is, s..."
1,Any comparison of efficiency between solar and...,-0.9877,"[Any, comparison, of, efficiency, between, sol..."
2,I'm honestly waiting for climate change and th...,-0.1143,"[I, 'm, honestly, waiting, for, climate, chang..."
3,Not just Sacramento. It's actually happening a...,0.0,"[Not, just, Sacramento, ., It, 's, actually, h..."
4,I think climate change tends to get some peopl...,0.6634,"[I, think, climate, change, tends, to, get, so..."


In [13]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Map POS tag
def get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN,
                "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# Lemmatization function
def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]

# Apply lemmatization directly (tokenized_body is already a list)
df['lemmatized_body'] = df['tokenized_body'].apply(lemmatize_tokens)

df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


Unnamed: 0,body,sentiment,tokenized_body,lemmatized_body
0,Yeah but what the above commenter is saying is...,0.5719,"[Yeah, but, what, the, above, commenter, is, s...","[Yeah, but, what, the, above, commenter, be, s..."
1,Any comparison of efficiency between solar and...,-0.9877,"[Any, comparison, of, efficiency, between, sol...","[Any, comparison, of, efficiency, between, sol..."
2,I'm honestly waiting for climate change and th...,-0.1143,"[I, 'm, honestly, waiting, for, climate, chang...","[I, 'm, honestly, wait, for, climate, change, ..."
3,Not just Sacramento. It's actually happening a...,0.0,"[Not, just, Sacramento, ., It, 's, actually, h...","[Not, just, Sacramento, ., It, 's, actually, h..."
4,I think climate change tends to get some peopl...,0.6634,"[I, think, climate, change, tends, to, get, so...","[I, think, climate, change, tends, to, get, so..."


In [14]:
df.to_csv("../lemmatized_comments.csv")

In [None]:
df.head()