Import libraries

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Upload the data
Load the data from the uploaded CSV file into a Pandas DataFrame.

In [2]:
from google.colab import files

uploaded = files.upload()

Saving reddit_data.csv to reddit_data.csv


Load Data
Load the data from the uploaded CSV file into a Pandas DataFrame.

In [3]:
import io

# Replace 'reddit_comments.csv' with the exact name of your uploaded file if different
data_path = list(uploaded.keys())[0]  # Get the name of the uploaded file
df = pd.read_csv(io.BytesIO(uploaded[data_path]))

# Display the first few rows of the dataframe to verify loading
df.head()


Unnamed: 0,Title,Comment
0,Pulsechain is dead!!,"Prove yourself right.\n\n48,000+ validators\n\..."
1,Pulsechain is dead!!,"Idk what you're talking about, eco is thriving"
2,Pulsechain is dead!!,Didnt people say the same for HEX ? Its not a ...
3,Pulsechain is dead!!,Low quality post. It is in the top 20 chains i...
4,Pulsechain is dead!!,"No. Please, sell. You're not worth our time."


Data Cleaning.
Cleaning the text data by normalizing, removing special characters, punctuation, URLs, emails, duplicates, and converting text to lowercase.

In [4]:
# Clean text
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'\S+@\S+', '', text)  # remove emails
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove special characters and numbers
    text = text.lower()  # lowercase text
    text = text.strip()  # remove leading/trailing whitespace
    return text

# Remove stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)

# Preprocess text
df['Comment'] = df['Comment'].astype(str).apply(clean_text).apply(remove_stopwords)

# Drop duplicates
df = df.drop_duplicates(subset='Comment')

# Display the first few rows of the cleaned dataframe
df.head()


Unnamed: 0,Title,Comment
0,Pulsechain is dead!!,prove right validators active unique wallets h...
1,Pulsechain is dead!!,idk youre talking eco thriving
2,Pulsechain is dead!!,didnt people say hex bull market right hold ra...
3,Pulsechain is dead!!,low quality post top chains tvl h vol atm
4,Pulsechain is dead!!,please sell youre worth time


In [6]:
# Print columns of the DataFrame to check for 'cleaned_comment'
print(df.columns)

Index(['Title', 'Comment'], dtype='object')


In [9]:
# Assuming you want to create 'cleaned_comment' based on the processed 'Comment' column
df['cleaned_comment'] = df['Comment']

# Proceed with further analysis, e.g., Word Embeddings, TF-IDF Vectorization, K-Means Clustering, etc.


# Creating Word Embeddings using Word2Vec
sentences = df['cleaned_comment'].apply(word_tokenize).tolist()
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)


Create Word Embeddings (Optional)
Use HDF5 datasets or Numpy memmap for managing large arrays efficiently. Here’s an example using HDF5 with h5py for storing Word2Vec embeddings

In [10]:
import h5py
import numpy as np

# Create an HDF5 file to store Word2Vec embeddings
with h5py.File('word_embeddings.h5', 'w') as f:
    # Create a dataset to store embeddings
    embeddings = f.create_dataset('embeddings', shape=(len(df), 100), dtype='float32')

    # Generate Word2Vec embeddings and store in HDF5
    for i, text in enumerate(df['cleaned_comment']):
        tokens = word_tokenize(text)
        embedding = np.mean([word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv], axis=0)
        embeddings[i] = embedding


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


NumPy Array for Embeddings
Once you have created and populated the HDF5 dataset (embeddings) with Word2Vec embeddings, you can then use the NumPy array (embeddings) to perform further analysis or clustering.

In [11]:
import numpy as np

# Assuming word2vec_model is already trained and available
# Example of embedding calculation
embeddings = np.zeros((len(df), word2vec_model.vector_size))

for i, text in enumerate(df['cleaned_comment']):
    tokens = word_tokenize(text)
    # Calculate mean embedding for the tokens present in the Word2Vec model
    embedding = np.mean([word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv], axis=0)
    embeddings[i] = embedding

# Now you have embeddings for each comment in `embeddings`
# You can use these embeddings for further analysis or clustering


Create Word Embeddings Using Word2Vec
Create word embeddings for the cleaned comments.

In [8]:
# Tokenize comments for Word2Vec
tokenized_comments = [comment.split() for comment in df['Comment']]

# Create Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_comments, vector_size=100, window=5, min_count=1, workers=4)

# Display some example word vectors
print(word2vec_model.wv['pulsechain'])  # Replace 'pulsechain' with any word from your dataset


[-0.33112907  1.2901008  -0.36544156 -0.04369812  1.0635742  -0.65108395
 -0.42381194  0.90814     0.59215975  0.9756391  -0.48457068  0.6411909
 -1.2259996   0.2610807   0.11671583  0.27095377  0.6651358   0.6268397
 -0.17041695 -1.0249975  -0.5726376   1.0090541   0.42383653  0.28515822
  1.8215097  -0.57750946  0.3159651  -2.2978487  -0.7036471  -0.14924926
  0.5310414  -0.49998596  1.0472157  -1.4371644   1.0388509   0.18657856
 -1.0701095  -0.47160572  0.43994835 -1.4521018   1.2276946  -1.6276362
  0.5493709  -0.27118105 -1.0318009  -1.1349488  -1.0754507   0.7835427
  0.19718753 -0.18756658 -0.2722877   1.0540453  -0.01487218  1.9680662
  0.5263632   0.15488361 -0.01458454 -0.41612262 -0.1240278   0.5430308
  0.31109828 -0.7314058   0.34694654 -0.16031615 -1.133222   -1.5228983
 -0.9558177  -0.3224629  -0.4324564   0.6427003   1.2457738  -0.9390156
  0.69599897 -0.1399205   0.01746093  0.39703706  1.4198766  -0.64507836
 -0.4882016   0.37212738  0.9617586  -0.6841652  -0.6538225

Vectorize Comments Using TF-IDF
Convert the comments to a TF-IDF matrix for clustering.

In [9]:
# Vectorize comments using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Comment'])

# Display the shape of the TF-IDF matrix
print(X.shape)


(110567, 66302)


Perform K-Means clustering
Cluster the comments using K-Means and add cluster labels to the DataFrame.

In [10]:
# Perform K-Means clustering
num_clusters = 3  # Change this to 2 if you want two clusters (positive/negative)
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_

# Add cluster labels to the data
df['Cluster'] = labels

# Display the first few rows of the dataframe with cluster labels
df.head()




Unnamed: 0,Title,Comment,Cluster
0,Pulsechain is dead!!,prove right validators active unique wallets h...,1
1,Pulsechain is dead!!,idk youre talking eco thriving,1
2,Pulsechain is dead!!,didnt people say hex bull market right hold ra...,0
3,Pulsechain is dead!!,low quality post top chains tvl h vol atm,1
4,Pulsechain is dead!!,please sell youre worth time,1


8. Visualize Clusters Using PCA


In [1]:
# Reduce dimensions using PCA
# Assuming X is the TF-IDF matrix created in the previous code blocks
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X.toarray())

df['pca-one'] = pca_result[:,0]
df['pca-two'] = pca_result[:,1]

# Plot PCA results
plt.figure(figsize=(16,10))
sns.scatterplot(
    x="pca-one", y="pca-two",
    hue="Cluster",
    palette=sns.color_palette("hsv", num_clusters),
    data=df,
    legend="full",
    alpha=0.3
)
plt.title('PCA of Clusters')
plt.show()


NameError: name 'PCA' is not defined