In [16]:

import pandas as pd
import numpy as np
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


from tqdm import tqdm
tqdm.pandas()


nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


pd.set_option('display.max_columns', None)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pkart\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\pkart\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pkart\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pkart\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\pkart\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [17]:
processed_path = "../data/processed/climate_nasa_cleaned.csv"
df = pd.read_csv(processed_path)

df.head()


Unnamed: 0,date,likesCount,profileName,commentsCount,text,text_length
0,2022-09-07 17:12:32+00:00,2,4dca617d86b3fdce80ba7e81fb16e048c9cd9798cdfd6d...,0.0,Neat comparison I have not heard it before.\n ...,173
1,2022-09-08 14:51:13+00:00,0,518ab97f2d115ba5b6f03b2fba2ef2b120540c9681288b...,0.0,An excellent way to visualise the invisible! T...,51
2,2022-09-07 17:19:41+00:00,1,d82e8e24eb633fd625b0aef9b3cb625cfb044ceb8483e1...,3.0,Does the CO2/ghg in the troposphere affect the...,108
3,2022-09-08 00:51:30+00:00,4,37a509fa0b5177a2233c7e2d0e2b2d6916695fa9fba3f2...,0.0,excellent post! I defo feel the difference - o...,94
4,2022-09-07 19:06:20+00:00,16,e54fbbd42a729af9d04d9a5cc1f9bbfe8081a31c219ecb...,26.0,"Yes, and carbon dioxide does not harm the Eart...",128


In [18]:

def clean_text(text):
   
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

df['clean_text'] = df['text'].progress_apply(clean_text)
df[['text', 'clean_text']].head()


100%|█████████████████████████████████████████████████████████████████████████████| 504/504 [00:00<00:00, 51888.30it/s]


Unnamed: 0,text,clean_text
0,Neat comparison I have not heard it before.\n ...,neat comparison i have not heard it before i w...
1,An excellent way to visualise the invisible! T...,an excellent way to visualise the invisible th...
2,Does the CO2/ghg in the troposphere affect the...,does the co2ghg in the troposphere affect the ...
3,excellent post! I defo feel the difference - o...,excellent post i defo feel the difference one ...
4,"Yes, and carbon dioxide does not harm the Eart...",yes and carbon dioxide does not harm the earth...


In [19]:

stop_words = set(stopwords.words('english'))

def tokenize_text(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    return tokens


df['tokens'] = df['clean_text'].progress_apply(tokenize_text)
df[['clean_text', 'tokens']].head()


100%|█████████████████████████████████████████████████████████████████████████████| 504/504 [00:00<00:00, 12745.03it/s]


Unnamed: 0,clean_text,tokens
0,neat comparison i have not heard it before i w...,"[neat, comparison, heard, would, say, co2, lik..."
1,an excellent way to visualise the invisible th...,"[excellent, way, visualise, invisible, thanks]"
2,does the co2ghg in the troposphere affect the ...,"[co2ghg, troposphere, affect, stratosphere, wa..."
3,excellent post i defo feel the difference one ...,"[excellent, post, defo, feel, difference, one,..."
4,yes and carbon dioxide does not harm the earth...,"[yes, carbon, dioxide, harm, earth, like, peop..."


In [20]:
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

df['lemmas'] = df['tokens'].progress_apply(lemmatize_tokens)
df[['tokens', 'lemmas']].head()


100%|█████████████████████████████████████████████████████████████████████████████| 504/504 [00:00<00:00, 19180.22it/s]


Unnamed: 0,tokens,lemmas
0,"[neat, comparison, heard, would, say, co2, lik...","[neat, comparison, heard, would, say, co2, lik..."
1,"[excellent, way, visualise, invisible, thanks]","[excellent, way, visualise, invisible, thanks]"
2,"[co2ghg, troposphere, affect, stratosphere, wa...","[co2ghg, troposphere, affect, stratosphere, wa..."
3,"[excellent, post, defo, feel, difference, one,...","[excellent, post, defo, feel, difference, one,..."
4,"[yes, carbon, dioxide, harm, earth, like, peop...","[yes, carbon, dioxide, harm, earth, like, peop..."


In [21]:
preprocessed_path = "../data/processed/climate_nasa_preprocessed.csv"
df.to_csv(preprocessed_path, index=False)
print(f"Preprocessed dataset saved to {preprocessed_path}")


Preprocessed dataset saved to ../data/processed/climate_nasa_preprocessed.csv
