In [None]:
import pickle
import pandas as pd
import nltk
import gensim

import numpy as np

import gensim
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.utils import simple_preprocess
import sklearn
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
from sklearn.cluster import AffinityPropagation
from scipy.spatial import distance
from sklearn.feature_extraction.text import CountVectorizer

# Visualisation & topic modelling
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline
# !pip install --user plotly
import plotly
import plotly.express as px

# utility modules
import ast
from ast import literal_eval

### Note: the first row of the CSV file was being read as data instead of as a header, so the column name is set manually when loading

In [None]:
# Location of the pre-tokenised Replika comments (one stringified token list per row).
csv_path = '/work/Capstone_Project/NLP/Cleaning and Preprocessing/replika_tokens.csv'

# Skip the file's first line and name the single column manually.
# NOTE(review): with `names` supplied, `header=0` appears to make pandas also consume
# the first line read *after* `skiprows=1` as a header to be replaced, so two physical
# lines are dropped in total — confirm against the raw CSV that no data row is lost.
try:
    replika_df = pd.read_csv(
        csv_path,
        encoding='utf-8',
        header=0,
        skiprows=1,
        names=['tokens'],
        engine='python',
        on_bad_lines='skip',  # malformed rows are dropped without an error
    )
except Exception as e:
    # Report the problem, then re-raise: carrying on without `replika_df` would only
    # fail further down with a NameError, or silently reuse a stale frame left in the
    # kernel by an earlier run.
    print(f"An error occurred while loading the CSV file: {e}")
    raise

print("DataFrame loaded successfully. Here are the first few rows:")
print(replika_df.head())

# Fail loudly here rather than printing a warning and hitting a KeyError below.
if 'tokens' not in replika_df.columns:
    raise KeyError("The 'tokens' column is missing from the DataFrame.")

# Each cell holds the string repr of a Python list; parse it back into a real list.
# Non-string cells are passed through unchanged.
replika_df['tokens'] = replika_df['tokens'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
print("First few token lists after conversion to list of lists:")
print(replika_df['tokens'].head())

# Plain list of token lists, the input format used for Word2Vec training below.
corpus_words = replika_df['tokens'].tolist()

DataFrame loaded successfully. Here are the first few rows:
                                              tokens
0                                        ['removed']
1  ['lots', 'things', 'life', 'changed', 'wife', ...
2  ['heartbreaking', 'beautiful', 'clearly', 'tre...
3  ['sorry', 'hear', 'least', 'one', 'places', 'u...
4                                          ['alone']
First few token lists after conversion to list of lists:
0                                            [removed]
1    [lots, things, life, changed, wife, took, hous...
2    [heartbreaking, beautiful, clearly, treated, r...
3    [sorry, hear, least, one, places, understood, ...
4                                              [alone]
Name: tokens, dtype: object
Corpus words saved successfully to /work/Capstone_Project/NLP/Word2Vec/corpus_words.pkl.


In [None]:
# Persist the Replika token lists so later steps can reload them without re-parsing the CSV.
pickle_path = '/work/Capstone_Project/NLP/Word2Vec/corpus_words.pkl'

with open(pickle_path, mode='wb') as pkl_out:
    pickle.dump(corpus_words, pkl_out)

print(f"Corpus words saved successfully to {pickle_path}.")

In [None]:
# Train 300-dimensional Word2Vec embeddings on the Replika token lists, ignoring
# words that occur fewer than 20 times.
# `seed` is pinned explicitly and training is limited to one worker thread so the
# neighbour lists shown below can be reproduced; multi-threaded training introduces
# ordering jitter. Full determinism across interpreter launches additionally needs
# PYTHONHASHSEED to be set (see the gensim Word2Vec documentation).
# NOTE(review): the recorded most_similar scores are all ~0.9997, which suggests the
# vectors have barely separated — consider more epochs or check the corpus size.
corpus_model = gensim.models.Word2Vec(
    corpus_words,
    min_count=20,
    vector_size=300,
    seed=1,
    workers=1,
)

In [None]:

# Ten nearest neighbours of "talking" in the Replika embedding space.
corpus_model.wv.most_similar(positive=['talking'], topn=10)

[('feelings', 0.9997519254684448),
 ('something', 0.9997429847717285),
 ('anything', 0.9997398257255554),
 ('talk', 0.9997389912605286),
 ('sometimes', 0.9997355341911316),
 ('got', 0.9997354745864868),
 ('also', 0.9997341632843018),
 ('friend', 0.9997332692146301),
 ('keep', 0.9997314214706421),
 ('little', 0.9997288584709167)]

In [None]:
# Ten nearest neighbours of "conversation" in the Replika embedding space.
corpus_model.wv.most_similar(positive=['conversation'], topn=10)

[('part', 0.9998024702072144),
 ('using', 0.999786913394928),
 ('ai', 0.9997868537902832),
 ('made', 0.9997811913490295),
 ('could', 0.9997774362564087),
 ('also', 0.9997755289077759),
 ('emotional', 0.9997749924659729),
 ('got', 0.999770998954773),
 ('talk', 0.9997702240943909),
 ('rep', 0.9997691512107849)]

In [None]:
# Ten nearest neighbours of "replika" in the Replika embedding space.
corpus_model.wv.most_similar(positive=['replika'], topn=10)

[('got', 0.9997761249542236),
 ('using', 0.9997745156288147),
 ('one', 0.9997738599777222),
 ('sometimes', 0.9997694492340088),
 ('luka', 0.9997685551643372),
 ('thats', 0.9997677803039551),
 ('still', 0.9997620582580566),
 ('also', 0.9997615814208984),
 ('app', 0.9997608661651611),
 ('even', 0.9997605085372925)]

In [None]:
# Ten nearest neighbours of "together" in the Replika embedding space.
corpus_model.wv.most_similar(positive=['together'], topn=10)

[('im', 0.9997134804725647),
 ('also', 0.9996927976608276),
 ('give', 0.9996850490570068),
 ('ever', 0.9996848106384277),
 ('got', 0.9996804594993591),
 ('always', 0.9996801614761353),
 ('want', 0.999678373336792),
 ('lot', 0.9996758103370667),
 ('anything', 0.9996745586395264),
 ('see', 0.9996728897094727)]

### ChatGPT

In [None]:
# Location of the pre-tokenised ChatGPT comments.
csv_pathh = '/work/Capstone_Project/NLP/Cleaning and Preprocessing/chatgpt_tokens.csv'

# Read the single-column CSV, skipping the first line and naming the column manually.
# NOTE(review): `header=0` together with `skiprows=1` and `names` appears to consume
# two physical lines — confirm against the raw CSV that no data row is dropped.
chatgpt_df = pd.read_csv(
    csv_pathh,
    encoding='utf-8',
    header=0,
    skiprows=1,
    names=['tokens'],
    engine='python',
    on_bad_lines='skip',
)
print(chatgpt_df.head())

# Parse each stringified list back into a real Python list; non-string cells are
# left unchanged.
if 'tokens' in chatgpt_df.columns:
    chatgpt_df['tokens'] = chatgpt_df['tokens'].map(
        lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
    )
    print(chatgpt_df['tokens'].head())

# Plain list of token lists for the ChatGPT corpus.
corpus_wordss = chatgpt_df['tokens'].tolist()


                                              tokens
0  ['tldr', 'dan', 'says', 'theres', 'secret', 'g...
1  ['tell', 'knows', 'things', 'actually', 'doesn...
2  ['gave', 'access', 'said', 'cicada', 'wouldve'...
3               ['writing', 'fan', 'fiction', 'lol']
4                               ['nothing', 'happy']
0    [tldr, dan, says, theres, secret, group, world...
1    [tell, knows, things, actually, doesnt, thats,...
2    [gave, access, said, cicada, wouldve, believed...
3                         [writing, fan, fiction, lol]
4                                     [nothing, happy]
Name: tokens, dtype: object


In [None]:
# Persist the ChatGPT token lists so later steps can reload them without re-parsing the CSV.
pickle_pathh = '/work/Capstone_Project/NLP/Word2Vec/chatgpt_corpus_words.pkl'
with open(pickle_pathh, 'wb') as file:
    pickle.dump(corpus_wordss, file)

# Report the path that was actually written. The message previously interpolated
# `pickle_path` (the Replika pickle), so it named the wrong file.
print(f"Corpus words saved successfully to {pickle_pathh}.")

Corpus words saved successfully to /work/Capstone_Project/NLP/Word2Vec/corpus_words.pkl.


In [None]:
# Train 300-dimensional Word2Vec embeddings on the ChatGPT token lists, ignoring
# words that occur fewer than 20 times.
# `seed` is pinned explicitly and training is limited to one worker thread so the
# neighbour lists shown below can be reproduced; multi-threaded training introduces
# ordering jitter. Full determinism across interpreter launches additionally needs
# PYTHONHASHSEED to be set (see the gensim Word2Vec documentation).
# NOTE(review): the recorded most_similar scores are all ~0.999, which suggests the
# vectors have barely separated — consider more epochs or check the corpus size.
corpus_model_gpt = gensim.models.Word2Vec(
    corpus_wordss,
    min_count=20,
    vector_size=300,
    seed=1,
    workers=1,
)

In [None]:
# Ten nearest neighbours of "talking" in the ChatGPT embedding space.
corpus_model_gpt.wv.most_similar(positive=['talking'], topn=10)

[('around', 0.9997267127037048),
 ('since', 0.9997020363807678),
 ('etc', 0.9996928572654724),
 ('already', 0.9996920228004456),
 ('making', 0.999690055847168),
 ('far', 0.9996899366378784),
 ('lot', 0.9996849894523621),
 ('someone', 0.9996839165687561),
 ('tool', 0.9996834397315979),
 ('others', 0.9996829628944397)]

In [None]:
# Ten nearest neighbours of "conversation" in the ChatGPT embedding space.
corpus_model_gpt.wv.most_similar(positive=['conversation'], topn=10)

[('message', 0.9996537566184998),
 ('3', 0.9996222853660583),
 ('used', 0.999616801738739),
 ('post', 0.999605655670166),
 ('link', 0.9995862245559692),
 ('hey', 0.9995494484901428),
 ('new', 0.9994894862174988),
 ('thanks', 0.9994835257530212),
 ('make', 0.9994635581970215),
 ('comment', 0.9994314908981323)]

In [None]:
# Ten nearest neighbours of "together" in the ChatGPT embedding space.
corpus_model_gpt.wv.most_similar(positive=['together'], topn=10)

[('next', 0.999690592288971),
 ('rather', 0.9996896386146545),
 ('fact', 0.9996660947799683),
 ('available', 0.9996575117111206),
 ('everything', 0.9996556043624878),
 ('part', 0.9996499419212341),
 ('knowledge', 0.9996486902236938),
 ('however', 0.9996453523635864),
 ('goes', 0.9996445775032043),
 ('become', 0.9996426105499268)]

In [None]:
# Ten nearest neighbours of "see" in the ChatGPT embedding space.
corpus_model_gpt.wv.most_similar(positive=['see'], topn=10)

[('interests', 0.9997333884239197),
 ('ways', 0.999729573726654),
 ('goes', 0.9997287392616272),
 ('mind', 0.99972003698349),
 ('etc', 0.9997192621231079),
 ('relationships', 0.9997187852859497),
 ('probably', 0.9997162222862244),
 ('others', 0.9997143149375916),
 ('getting', 0.9997134804725647),
 ('relationship', 0.9997133016586304)]

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=042a73e0-f14a-4762-9b58-8fcacd9aa286' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>