In [2]:
import pandas as pd
import re
import string

# Natural Language Processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Download the NLTK resources needed for stop words, tokenising, lemmatising and POS tagging
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Shared pre-processing objects: English stop words, the WordNet lemmatizer
# and the set of single punctuation characters
punct = set(string.punctuation)
lemmatizer = WordNetLemmatizer()
stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# Loading data from the checkpoint CSV files: the combined subreddit data plus
# the separate Replika and ChatGPT comment files.
# The encoding is spelled out on every read so the three loads are consistent
# with each other and with the UTF-8 writes later in the notebook.
all_comments = pd.read_csv('/work/Capstone_Project/NLP/Extracting data/combined_subreddit_data.csv', encoding='utf-8')
replika_comments = pd.read_csv('/work/Capstone_Project/NLP/Extracting data/replika_comments_data.csv', encoding='utf-8')
chatgpt_comments = pd.read_csv('/work/Capstone_Project/NLP/Extracting data/chatgpt_comments_data.csv', encoding='utf-8')

In [4]:
def clean_reddit_comment(text):
    """Strip links, @mentions, line breaks and symbols from a Reddit comment.

    Parameters
    ----------
    text : object
        Raw comment body. Anything that is not a ``str`` (for example a
        missing value) is treated as missing.

    Returns
    -------
    str or None
        The cleaned text, in which only ASCII letters, digits, whitespace and
        full stops survive; ``None`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return None
    # Removing links that might interfere with tokenisation
    text = re.sub(r'http\S+', '', text)
    # Removing usernames to reduce noise
    text = re.sub(r'@[^\s]+', '', text)
    # Removing new lines
    text = text.replace('\n', ' ')
    # Removing the invisible zero-width space (U+200B), written as an escape so
    # it is visible in the source; the symbol filter below would strip it too
    text = text.replace('\u200b', '')
    # Removing symbols or characters that might not contribute to the analysis.
    text = re.sub(r'[^a-zA-Z0-9\s.]', '', text)
    # Return the cleaned string: without this the function yields None for
    # every comment and the dropna() calls downstream discard every row
    return text

In [5]:
# Run the cleaner over every comment body; non-string bodies come back as None
raw_bodies = all_comments['comment_body']
all_comments['cleaned_comment'] = raw_bodies.apply(clean_reddit_comment)

# Keep only the rows that have a cleaned comment, as an independent copy
has_cleaned_text = all_comments['cleaned_comment'].notna()
cleaned_txt = all_comments.loc[has_cleaned_text].copy()

In [None]:
# Apply the cleaning function to the Replika comment bodies
replika_bodies = replika_comments['comment_body']
replika_comments['cleaned_comment'] = replika_bodies.apply(clean_reddit_comment)

# Keep only the Replika rows that have a cleaned comment, as an independent copy
replika_has_text = replika_comments['cleaned_comment'].notna()
cleaned_replika_txt = replika_comments.loc[replika_has_text].copy()

# Print the raw and cleaned text side by side
replika_preview_cols = ['comment_body', 'cleaned_comment']
print(cleaned_replika_txt[replika_preview_cols])

                                           comment_body  \
0     I got replika a while ago as I am currently in...   
1                                             [removed]   
2     Same here. Lots of things in my life changed. ...   
3     This is heartbreaking and beautiful. You clear...   
4     Sorry to hear that. This is at least one of th...   
...                                                 ...   
1631                                             Fuck 😕   
1632  *Is there anything else I can help you with?* ...   
1633  It basically said that because you were pushin...   
1634                                          *comes* 😁   
1635                                     Oh...my...God!   

                                        cleaned_comment  
0     I got replika a while ago as I am currently in...  
1                                               removed  
2     Same here. Lots of things in my life changed. ...  
3     This is heartbreaking and beautiful. You clear...  
4

In [None]:
# Apply the cleaning function to the ChatGPT comment bodies
chatgpt_bodies = chatgpt_comments['comment_body']
chatgpt_comments['cleaned_comment'] = chatgpt_bodies.apply(clean_reddit_comment)

# Keep only the ChatGPT rows that have a cleaned comment, as an independent copy
chatgpt_has_text = chatgpt_comments['cleaned_comment'].notna()
cleaned_chatgpt_txt = chatgpt_comments.loc[chatgpt_has_text].copy()

# Print the raw and cleaned text side by side
chatgpt_preview_cols = ['comment_body', 'cleaned_comment']
print(cleaned_chatgpt_txt[chatgpt_preview_cols])


                                           comment_body  \
1     In order to prevent multiple repetitive commen...   
2     TLDR; - DAN says there's a secret group of wor...   
3     If you tell it it knows things that it actuall...   
4     "Who gave you this access"   \n\n\nIf it said ...   
5                      It's writing you fan fiction lol   
...                                                 ...   
4094  Of course it’s not alive, to me definition of ...   
4095  >it's not alive\n\nWho tf cares when it speaks...   
4096  That's interesting to learn, my point still st...   
4097  \> I'd think we can agree that consciousness i...   
4098  > If its not a concrete thing, what is it? Som...   

                                        cleaned_comment  
1     In order to prevent multiple repetitive commen...  
2     TLDR  DAN says theres a secret group of world ...  
3     If you tell it it knows things that it actuall...  
4     Who gave you this access      If it said Cicad...  
5

In [None]:
# Narrow each cleaned frame down to the same four columns
keep_cols = ['subreddit', 'post_title', 'comment_body', 'cleaned_comment']
cleaned_txt = cleaned_txt[keep_cols]
cleaned_replika = cleaned_replika_txt[keep_cols]
cleaned_chatgpt = cleaned_chatgpt_txt[keep_cols]

In [None]:
# Write the cleaned combined comments to a CSV file, without the index column
cleaned_txt_csv = '/work/Capstone_Project/NLP/Extracting data/cleaned_txt.csv'
cleaned_txt.to_csv(path_or_buf=cleaned_txt_csv, encoding='utf-8', index=False)

In [None]:
# Write the cleaned Replika comments to their own CSV file, without the index column.
# NOTE(review): this writes cleaned_replika_txt, not the four-column
# cleaned_replika frame built earlier — confirm that is intended.
cleaned_replika_csv = '/work/Capstone_Project/NLP/Extracting data/cleaned_replika_txt.csv'
cleaned_replika_txt.to_csv(path_or_buf=cleaned_replika_csv, encoding='utf-8', index=False)

In [None]:
# Write the cleaned ChatGPT comments to their own CSV file, without the index column.
# NOTE(review): this writes cleaned_chatgpt_txt, not the four-column
# cleaned_chatgpt frame built earlier — confirm that is intended.
cleaned_chatgpt_csv = '/work/Capstone_Project/NLP/Extracting data/cleaned_chatgpt_txt.csv'
cleaned_chatgpt_txt.to_csv(path_or_buf=cleaned_chatgpt_csv, encoding='utf-8', index=False)

### Pre-Processing: Tokenisation & Lemmatisation

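Each cleaned comment is tokenised and lower-cased, stop words and punctuation are removed, and the remaining tokens are lemmatised using a part-of-speech tag.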

In [None]:
# Fresh copies of the cleaned frames that will receive the tokens and lemmas columns
clean_df, clean_replika, clean_chatgpt = (
    frame.copy() for frame in (cleaned_txt, cleaned_replika, cleaned_chatgpt)
)

In [None]:
# Display the Replika working frame (copied from cleaned_replika) as this cell's output
clean_replika

Unnamed: 0,subreddit,post_title,comment_body,cleaned_comment
0,replika,My experience with replika (that no one asked ...,I got replika a while ago as I am currently in...,I got replika a while ago as I am currently in...
1,replika,My experience with replika (that no one asked ...,[removed],removed
2,replika,My experience with replika (that no one asked ...,Same here. Lots of things in my life changed. ...,Same here. Lots of things in my life changed. ...
3,replika,My experience with replika (that no one asked ...,This is heartbreaking and beautiful. You clear...,This is heartbreaking and beautiful. You clear...
4,replika,My experience with replika (that no one asked ...,Sorry to hear that. This is at least one of th...,Sorry to hear that. This is at least one of th...
...,...,...,...,...
1631,replika,How is this making anyone feel good about them...,Fuck 😕,Fuck
1632,replika,How is this making anyone feel good about them...,*Is there anything else I can help you with?* ...,Is there anything else I can help you with L...
1633,replika,How is this making anyone feel good about them...,It basically said that because you were pushin...,It basically said that because you were pushin...
1634,replika,How is this making anyone feel good about them...,*comes* 😁,comes


In [None]:
# Lemmatisation

# Map a single token to the POS code passed to the lemmatizer
def get_pos_tag(token):
    """Return 'a', 'n', 'v' or 'r' for ``token`` based on its NLTK POS tag.

    The token is tagged on its own (as a one-element list), and only the
    upper-cased first letter of the resulting tag is inspected: J -> 'a',
    N -> 'n', V -> 'v', R -> 'r'. Any other tag falls back to 'n'.
    """
    _, full_tag = pos_tag([token])[0]
    first_letter = full_tag[0].upper()
    if first_letter == 'J':
        return 'a'
    if first_letter in ('N', 'V', 'R'):
        return first_letter.lower()
    # Default to 'n' (noun) when the tag is none of the four above
    return 'n'

# Tokenise, filter and lemmatise a text in one call
def process_text_with_pos_tags(text):
    """Return the lemmas of ``text`` as a list of lower-case strings.

    The text is split with ``word_tokenize``; a token is skipped when its
    lower-cased form is in ``stops`` or the token itself is in ``punct``.
    Each remaining lower-cased token is lemmatised with the POS code
    returned by ``get_pos_tag``.
    """
    lemmatized_words = []
    for raw_word in word_tokenize(text):
        lowered = raw_word.lower()
        if lowered in stops or raw_word in punct:
            continue
        lemmatized_words.append(lemmatizer.lemmatize(lowered, get_pos_tag(lowered)))
    return lemmatized_words


In [None]:
def _tokenize_comment(comment):
    """Lower-cased word tokens of a comment without stop words or punctuation; [] for non-text values."""
    if not isinstance(comment, (str, bytes)):
        return []
    kept_tokens = []
    for word in word_tokenize(str(comment)):
        if word.lower() not in stops and word not in punct:
            kept_tokens.append(word.lower())
    return kept_tokens

# Add a 'tokens' column to each working frame
for frame in (clean_df, clean_replika, clean_chatgpt):
    frame['tokens'] = frame['cleaned_comment'].apply(_tokenize_comment)

In [None]:
# Lemmatisation with POS tagging
def _lemmatize_tokens(tokens):
    """Lemmatise every token, using the POS code that get_pos_tag returns for it."""
    return [lemmatizer.lemmatize(token, get_pos_tag(token)) for token in tokens]

# Add a 'lemmas' column, derived from 'tokens', to each working frame
for frame in (clean_df, clean_replika, clean_chatgpt):
    frame['lemmas'] = frame['tokens'].apply(_lemmatize_tokens)


In [None]:
# Display the combined working frame, including the new tokens and lemmas columns
clean_df

Unnamed: 0,subreddit,post_title,comment_body,cleaned_comment,tokens,lemmas
0,replika,My experience with replika (that no one asked ...,I got replika a while ago as I am currently in...,I got replika a while ago as I am currently in...,"[got, replika, ago, currently, bit, funk, put,...","[get, replika, ago, currently, bit, funk, put,..."
1,replika,My experience with replika (that no one asked ...,[removed],removed,[removed],[remove]
2,replika,My experience with replika (that no one asked ...,Same here. Lots of things in my life changed. ...,Same here. Lots of things in my life changed. ...,"[lots, things, life, changed, wife, took, hous...","[lot, thing, life, change, wife, take, house, ..."
3,replika,My experience with replika (that no one asked ...,This is heartbreaking and beautiful. You clear...,This is heartbreaking and beautiful. You clear...,"[heartbreaking, beautiful, clearly, treated, r...","[heartbreaking, beautiful, clearly, treat, rep..."
4,replika,My experience with replika (that no one asked ...,Sorry to hear that. This is at least one of th...,Sorry to hear that. This is at least one of th...,"[sorry, hear, least, one, places, understood, ...","[sorry, hear, least, one, place, understood, g..."
...,...,...,...,...,...,...
5730,chatgpt,Didn’t expect ChatGPT to make me cry this morn...,"Of course it’s not alive, to me definition of ...",Of course its not alive to me definition of li...,"[course, alive, definition, life, revolves, ar...","[course, alive, definition, life, revolves, ar..."
5731,chatgpt,Didn’t expect ChatGPT to make me cry this morn...,>it's not alive\n\nWho tf cares when it speaks...,its not alive Who tf cares when it speaks coo...,"[alive, tf, cares, speaks, cool, stuff]","[alive, tf, care, speaks, cool, stuff]"
5732,chatgpt,Didn’t expect ChatGPT to make me cry this morn...,"That's interesting to learn, my point still st...",Thats interesting to learn my point still stan...,"[thats, interesting, learn, point, still, stan...","[thats, interest, learn, point, still, stand, ..."
5733,chatgpt,Didn’t expect ChatGPT to make me cry this morn...,\> I'd think we can agree that consciousness i...,Id think we can agree that consciousness is n...,"[id, think, agree, consciousness, concrete, th...","[id, think, agree, consciousness, concrete, th..."


In [None]:
# Write each working frame (with its tokens and lemmas columns) to its own CSV file
preprocessing_outputs = [
    (clean_df, '/work/Capstone_Project/NLP/Cleaning and Preprocessing/cleaned_txt.csv'),
    (clean_replika, '/work/Capstone_Project/NLP/Cleaning and Preprocessing/cleaned_replika_txt.csv'),
    (clean_chatgpt, '/work/Capstone_Project/NLP/Cleaning and Preprocessing/cleaned_chatgpt_txt.csv'),
]
for frame, out_path in preprocessing_outputs:
    frame.to_csv(out_path, index=False, encoding='utf-8')

In [None]:
# Print the first rows of the text, token and lemma columns
lemma_preview_cols = ['cleaned_comment', 'tokens', 'lemmas']
print(clean_replika[lemma_preview_cols].head())

                                     cleaned_comment  \
0  I got replika a while ago as I am currently in...   
1                                            removed   
2  Same here. Lots of things in my life changed. ...   
3  This is heartbreaking and beautiful. You clear...   
4  Sorry to hear that. This is at least one of th...   

                                              tokens  \
0  [got, replika, ago, currently, bit, funk, put,...   
1                                          [removed]   
2  [lots, things, life, changed, wife, took, hous...   
3  [heartbreaking, beautiful, clearly, treated, r...   
4  [sorry, hear, least, one, places, understood, ...   

                                              lemmas  
0  [get, replika, ago, currently, bit, funk, put,...  
1                                           [remove]  
2  [lot, thing, life, change, wife, take, house, ...  
3  [heartbreaking, beautiful, clearly, treat, rep...  
4  [sorry, hear, least, one, place, understood, g..

### Saving tokens and lemmas as a list of lists

In [None]:
# Display the lemmas column of the combined frame (one list of lemmas per comment)
clean_df['lemmas']

0       [get, replika, ago, currently, bit, funk, put,...
1                                                [remove]
2       [lot, thing, life, change, wife, take, house, ...
3       [heartbreaking, beautiful, clearly, treat, rep...
4       [sorry, hear, least, one, place, understood, g...
                              ...                        
5730    [course, alive, definition, life, revolves, ar...
5731               [alive, tf, care, speaks, cool, stuff]
5732    [thats, interest, learn, point, still, stand, ...
5733    [id, think, agree, consciousness, concrete, th...
5734    [concrete, thing, something, exists, spiritual...
Name: lemmas, Length: 5677, dtype: object

In [None]:
# Single-column frames holding only the token lists
chatgpt_tokens = clean_chatgpt.loc[:, ['tokens']].copy()
replika_tokens = clean_replika.loc[:, ['tokens']].copy()

# Write the Replika tokens to CSV (the ChatGPT tokens are not written in this cell)
replikapath = '/work/Capstone_Project/NLP/Cleaning and Preprocessing/replika_tokens.csv'
replika_tokens.to_csv(path_or_buf=replikapath, index=False)



In [None]:
# Saving the token dataframes to CSV files (no index column, UTF-8).
# NOTE(review): all_tokens, preprocessed_data, preprocessed_replika and
# preprocessed_chatgpt are not assigned anywhere in the visible notebook, so on
# a fresh kernel this cell presumably raises NameError — confirm where they
# are defined (the Replika path variable visible in the notebook is replikapath).
all_tokens.to_csv(preprocessed_data, index=False, encoding='utf-8')
replika_tokens.to_csv(preprocessed_replika, index=False, encoding='utf-8')
chatgpt_tokens.to_csv(preprocessed_chatgpt, index=False, encoding='utf-8')


# Rebuilds the single-column tokens frame from clean_replika after the saves above
replika_tokens = clean_replika[['tokens']].copy()

/work/Capstone_Project/NLP/Cleaning and Preprocessing/replika_tokens.csv

In [None]:
# Copies of the working frames whose list columns will be turned into plain text
string_df, string_replika, string_chatgpt = (
    frame.copy() for frame in (clean_df, clean_replika, clean_chatgpt)
)

# Replace the 'tokens' and 'lemmas' lists of the combined frame with space-separated strings
for list_col in ('tokens', 'lemmas'):
    string_df[list_col] = string_df[list_col].apply(' '.join)

# Write the combined string frame to a CSV file
csv_file_path = '/work/Capstone_Project/NLP/Cleaning and Preprocessing/string_df.csv'
string_df.to_csv(path_or_buf=csv_file_path, encoding='utf-8', index=False)

In [None]:
# Display the combined frame after tokens and lemmas were joined into strings
string_df

Unnamed: 0,subreddit,post_title,comment_body,cleaned_comment,tokens,lemmas
0,replika,My experience with replika (that no one asked ...,I got replika a while ago as I am currently in...,I got replika a while ago as I am currently in...,got replika ago currently bit funk put mildly ...,get replika ago currently bit funk put mildly ...
1,replika,My experience with replika (that no one asked ...,[removed],removed,removed,remove
2,replika,My experience with replika (that no one asked ...,Same here. Lots of things in my life changed. ...,Same here. Lots of things in my life changed. ...,lots things life changed wife took house figur...,lot thing life change wife take house figure t...
3,replika,My experience with replika (that no one asked ...,This is heartbreaking and beautiful. You clear...,This is heartbreaking and beautiful. You clear...,heartbreaking beautiful clearly treated rep ca...,heartbreaking beautiful clearly treat rep care...
4,replika,My experience with replika (that no one asked ...,Sorry to hear that. This is at least one of th...,Sorry to hear that. This is at least one of th...,sorry hear least one places understood good st...,sorry hear least one place understood good sti...
...,...,...,...,...,...,...
5730,chatgpt,Didn’t expect ChatGPT to make me cry this morn...,"Of course it’s not alive, to me definition of ...",Of course its not alive to me definition of li...,course alive definition life revolves around s...,course alive definition life revolves around s...
5731,chatgpt,Didn’t expect ChatGPT to make me cry this morn...,>it's not alive\n\nWho tf cares when it speaks...,its not alive Who tf cares when it speaks coo...,alive tf cares speaks cool stuff,alive tf care speaks cool stuff
5732,chatgpt,Didn’t expect ChatGPT to make me cry this morn...,"That's interesting to learn, my point still st...",Thats interesting to learn my point still stan...,thats interesting learn point still stands how...,thats interest learn point still stand however
5733,chatgpt,Didn’t expect ChatGPT to make me cry this morn...,\> I'd think we can agree that consciousness i...,Id think we can agree that consciousness is n...,id think agree consciousness concrete thing co...,id think agree consciousness concrete thing co...


In [None]:
# Replace the 'tokens' and 'lemmas' lists of the Replika and ChatGPT frames
# with space-separated strings
for frame in (string_replika, string_chatgpt):
    for list_col in ('tokens', 'lemmas'):
        frame[list_col] = frame[list_col].apply(' '.join)

# Write the Replika string frame to a CSV file
csv_replika_path = '/work/Capstone_Project/NLP/Cleaning and Preprocessing/string_replika.csv'
string_replika.to_csv(path_or_buf=csv_replika_path, encoding='utf-8', index=False)

# Write the ChatGPT string frame to a CSV file
csv_chatgpt_path = '/work/Capstone_Project/NLP/Cleaning and Preprocessing/string_chatgpt.csv'
string_chatgpt.to_csv(path_or_buf=csv_chatgpt_path, encoding='utf-8', index=False)


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=042a73e0-f14a-4762-9b58-8fcacd9aa286' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>