In [None]:
import pandas as pd
import re
import string


# NLTK for Natural Language Processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Downloading NLTK resources
# The first five ids are the ones this notebook has always fetched. The last two
# are the names newer NLTK releases look up for word_tokenize ('punkt_tab') and
# pos_tag ('averaged_perceptron_tagger_eng'); without them those calls raise
# LookupError on a fresh environment. nltk.download reports and skips an id the
# installed package index does not list, so requesting both generations is safe.
nltk_resources = (
    'stopwords',
    'punkt',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
)
for nltk_resource in nltk_resources:
    nltk.download(nltk_resource)

# NLTK Pre-processing Setup
# Shared, module-level helpers used by the tokenisation / lemmatisation cells.
stops = set(stopwords.words('english'))  # English stop words, as a set for fast membership tests
lemmatizer = WordNetLemmatizer()
punct = set(string.punctuation)  # single punctuation characters

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Location of the checkpoint CSV that holds the comments data.
# NOTE(review): the file name starts with '>' - confirm this matches the file on disk.
all_comments_csv_path = '/work/GitHub_ML_Deepnote/Machine Learning/1. Extracted Reddit Data/>all_comments.csv'

# Read the checkpoint into a DataFrame
all_comments = pd.read_csv(all_comments_csv_path, encoding='utf-8')

### Cleaning Comments

In [None]:
# Function to clean comment body
def clean_reddit_comment(text):
    """Strip links, @-mentions, line breaks and symbols from a comment body.

    Args:
        text: The raw comment body. Anything that is not a ``str`` (for
            example the NaN pandas uses for a missing cell) is treated as
            having no usable text.

    Returns:
        The cleaned string, or ``None`` when ``text`` is not a string or when
        only whitespace is left after cleaning.
    """
    # re.sub raises TypeError on non-strings, so bail out before touching the text
    if not isinstance(text, str):
        return None

    # Removing links that might interfere with tokenization
    text = re.sub(r'http\S+', '', text)
    # Removing usernames to reduce noise
    text = re.sub(r'@[^\s]+', '', text)
    # Removing new lines
    text = text.replace('\n', ' ')
    # Removing "&#x200B;" character
    text = text.replace("&#x200B;", "")
    # Removing symbols or characters that might not contribute to the analysis.
    # Only letters, digits, whitespace and full stops survive this step.
    text = re.sub(r'[^a-zA-Z0-9\s.]', '', text)

    # Check if text is empty after cleaning
    if not text.strip():
        return None

    return text

In [None]:
# Clean every comment body and store the result next to the original text
all_comments['cleaned_comment'] = all_comments['comment_body'].map(clean_reddit_comment)

# Keep only the rows whose cleaned comment is present (the cleaner returns None
# for comments that end up empty or whitespace-only)
has_cleaned_text = all_comments['cleaned_comment'].notna()
cleaned_comments = all_comments[has_cleaned_text].copy()

# Show the original and cleaned text side by side (printed from all_comments,
# i.e. before the empty rows are dropped)
print(all_comments.loc[:, ['comment_body', 'cleaned_comment']])

                                          comment_body  \
0    Perhaps this speaks to my naivety, but the pro...   
1    > that it would resent it's exploitation and p...   
2    “Once men turned their thinking over to machin...   
3    You're not wrong. We are in much more short-te...   
4    Im not really a finance person, but in my opin...   
..                                                 ...   
281  What are you talking about. You can't be serio...   
282  >I'm a data scentust and engineer.\n\nIf you'r...   
283  No you're a person that believes artificial in...   
284  >that believes artificial intelligence is aliv...   
285  Do you know what a query is? In data science? ...   

                                       cleaned_comment  
0    Perhaps this speaks to my naivety but the pros...  
1     that it would resent its exploitation and plo...  
2    Once men turned their thinking over to machine...  
3    Youre not wrong. We are in much more shortterm...  
4    Im not really

In [None]:
# Create a new DataFrame for cleaned comments
# Columns to carry forward, in output order.
cleaned_comment_columns = [
    'subreddit', 'post_title', 'post_sentiment', 'comment_id', 'parent_id',
    'comment_author', 'comment_body', 'cleaned_comment', 'comment_score',
    'comment_created_utc',
]

# Drop rows with a missing cleaned comment here: selecting the columns straight
# from all_comments would bring those rows back. .copy() makes the result an
# independent frame rather than a view of all_comments.
cleaned_comments = (
    all_comments
    .dropna(subset=['cleaned_comment'])[cleaned_comment_columns]
    .copy()
)

# Specify the path for saving the cleaned comments CSV file
cleaned_comments_csv_path = '/work/GitHub_ML_Deepnote/Machine Learning/2. Cleaning & Pre-processing/cleaned_comments.csv'

# Save the cleaned comments DataFrame to a new CSV file
cleaned_comments.to_csv(cleaned_comments_csv_path, index=False, encoding='utf-8')


In [None]:
# Where the single-column CSV of cleaned comments is written
only_cleaned_comments_csv_path = '/work/GitHub_ML_Deepnote/Machine Learning/2. Cleaning & Pre-processing/only_cleaned_comments.csv'

# Keep just the cleaned text as its own one-column DataFrame
only_cleaned_comments = cleaned_comments.loc[:, ['cleaned_comment']].copy()

# Display the new DataFrame
print(only_cleaned_comments)

# Write it out without the index column
only_cleaned_comments.to_csv(
    only_cleaned_comments_csv_path,
    index=False,
    encoding='utf-8',
)

                                       cleaned_comment
0    Perhaps this speaks to my naivety but the pros...
1     that it would resent its exploitation and plo...
2    Once men turned their thinking over to machine...
3    Youre not wrong. We are in much more shortterm...
4    Im not really a finance person but in my opini...
..                                                 ...
281  What are you talking about. You cant be seriou...
282  Im a data scentust and engineer.  If youre a d...
283  No youre a person that believes artificial int...
284  that believes artificial intelligence is alive...
285  Do you know what a query is In data science It...

[286 rows x 1 columns]


# Pre-Processing: Tokenisation & Lemmatisation 

In [None]:
# Work on an independent copy so the token and lemma columns added later
# do not end up on cleaned_comments itself
word_level_df = cleaned_comments.copy(deep=True)

In [None]:
#Ensuring lemmatisation works correctly -- previously it assumed everything was a noun and didn't lemmatise words like "thinking" correctly.

# Function to get the POS tag for each token
def get_pos_tag(token):
    """Return the single-letter POS code used when lemmatising ``token``.

    The token is tagged on its own with ``pos_tag``. The first letter of the
    resulting tag is mapped J -> 'a', N -> 'n', V -> 'v', R -> 'r'; any other
    first letter falls back to 'n' (noun).
    """
    wordnet_code_by_initial = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}

    tagged = pos_tag([token])
    tag = tagged[0][1]  # pos_tag yields (token, tag) pairs; only one token was passed
    initial = tag[0].upper()

    if initial in wordnet_code_by_initial:
        return wordnet_code_by_initial[initial]
    return 'n'  # Default to 'n' (noun) if the tag is not found

# Modified lemmatization function with POS tagging
def process_text_with_pos_tags(text):
    """Tokenise ``text``, drop stop words and punctuation, and lemmatise the rest.

    Tokens are lower-cased before lemmatisation. A token is discarded when its
    lower-cased form is in ``stops`` or when the token itself is in ``punct``.

    Returns:
        A list with one lemma per kept token, in the original token order.
    """
    kept_tokens = []
    for raw_token in word_tokenize(text):
        lowered = raw_token.lower()
        if lowered in stops or raw_token in punct:
            continue
        kept_tokens.append(lowered)

    lemmatized_words = []
    for kept in kept_tokens:
        # Each token is lemmatised with its own POS code rather than the default
        lemmatized_words.append(lemmatizer.lemmatize(kept, get_pos_tag(kept)))
    return lemmatized_words


In [None]:
# Tokenisation and Lemmatization with a check for string-like objects

def _tokenise_comment(comment):
    """Lower-cased tokens of ``comment`` minus stop words and punctuation.

    Values that are neither ``str`` nor ``bytes`` produce an empty list.
    """
    if not isinstance(comment, (str, bytes)):
        return []
    kept = []
    for token in word_tokenize(str(comment)):
        lowered = token.lower()
        if lowered not in stops and token not in punct:
            kept.append(lowered)
    return kept


def _lemmatise_tokens(tokens):
    """Lemmatise each token using the POS code from ``get_pos_tag``."""
    return [lemmatizer.lemmatize(token, get_pos_tag(token)) for token in tokens]


# Tokenization
word_level_df['tokens'] = word_level_df['cleaned_comment'].apply(_tokenise_comment)

# Lemmatization with POS tagging
word_level_df['lemmas'] = word_level_df['tokens'].apply(_lemmatise_tokens)

# Display the DataFrame
print(word_level_df.loc[:, ['cleaned_comment', 'tokens', 'lemmas']])

                                       cleaned_comment  \
0    Perhaps this speaks to my naivety but the pros...   
1     that it would resent its exploitation and plo...   
2    Once men turned their thinking over to machine...   
3    Youre not wrong. We are in much more shortterm...   
4    Im not really a finance person but in my opini...   
..                                                 ...   
281  What are you talking about. You cant be seriou...   
282  Im a data scentust and engineer.  If youre a d...   
283  No youre a person that believes artificial int...   
284  that believes artificial intelligence is alive...   
285  Do you know what a query is In data science It...   

                                                tokens  \
0    [perhaps, speaks, naivety, prospect, homicidal...   
1    [would, resent, exploitation, plot, overthrow,...   
2    [men, turned, thinking, machines, hope, would,...   
3    [youre, wrong, much, shortterm, danger, bad, h...   
4    [im, rea

### Saving tokens and lemmas as a list of lists

In [None]:
def _scrub_list_cell(cell):
    """Round-trip every string in a list cell through UTF-8, ignoring encode errors.

    Cells that are not lists are returned unchanged, as are non-string items
    inside a list.
    """
    if not isinstance(cell, list):
        return cell
    scrubbed = []
    for item in cell:
        if isinstance(item, str):
            # errors='ignore' drops anything that cannot be encoded as UTF-8
            item = item.encode('utf-8', 'ignore').decode('utf-8')
        scrubbed.append(item)
    return scrubbed


# Columns to clean
columns_to_clean = ['cleaned_comment', 'tokens', 'lemmas']

# Apply the cleaning operation to each column
for column in columns_to_clean:
    word_level_df[column] = word_level_df[column].apply(_scrub_list_cell)

In [None]:
# Re-arranging columns: the raw and cleaned comment text is left out here,
# only the token and lemma lists are kept with the metadata
word_level_columns = [
    'subreddit', 'post_title', 'post_sentiment', 'comment_id', 'parent_id',
    'comment_author', 'tokens', 'lemmas', 'comment_score', 'comment_created_utc',
]
word_level_df = word_level_df[word_level_columns]

# Specifying the path for saving the preprocessed data CSV file
preprocessed_data_csv_path = '/work/GitHub_ML_Deepnote/Machine Learning/2. Cleaning & Pre-processing/word_level_df.csv'

# Saving the preprocessed data DataFrame to a new CSV file
word_level_df.to_csv(
    preprocessed_data_csv_path,
    index=False,
    encoding='utf-8',
)

### Saving tokens and lemmas as a string

In [None]:
# Start from a copy so the list-valued word_level_df is left untouched
sentence_level_df = word_level_df.copy()

# Collapse each list in 'tokens' and 'lemmas' into one space-separated string,
# replacing the original list columns
for list_column in ('tokens', 'lemmas'):
    sentence_level_df[list_column] = sentence_level_df[list_column].apply(' '.join)

# Specify the path for saving the CSV file
csv_file_path = '/work/GitHub_ML_Deepnote/Machine Learning/2. Cleaning & Pre-processing/sentence_level_df.csv'

# Save the DataFrame to a CSV file
sentence_level_df.to_csv(
    csv_file_path,
    index=False,
    encoding='utf-8',
)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f64215d6-debc-46bd-b273-63565459a66d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>