# In [2]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Load the dataset into a pandas DataFrame.
# The CSV must provide a 'tweet' column (text) and a 'class' column (label);
# both are read further down in this script.
df = pd.read_csv('balanced_dataset.csv')

# pandas reads empty CSV cells as NaN (a float), and word_tokenize only accepts
# text. Normalise the column once so every later consumer of df['tweet'] sees
# a string; an empty string simply tokenizes to an empty token list.
df['tweet'] = df['tweet'].fillna('').astype(str)

# Tokenize the tweets into words (one list of tokens per tweet)
tokenized_tweets = df['tweet'].apply(word_tokenize)

# Train the Word2Vec model on the tokenized tweets.
# min_count=1 keeps every token, including those that occur only once.
word2vec_model = Word2Vec(
    sentences=tokenized_tweets,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Create word embeddings for each tweet
def get_tweet_vector(tweet):
    """Return the mean Word2Vec vector of the tokens in ``tweet``.

    The tweet is tokenized with ``word_tokenize``; tokens that are not in the
    vocabulary of the module-level ``word2vec_model`` are ignored.

    Args:
        tweet: The tweet text. Non-string values (for example the NaN pandas
            uses for an empty CSV cell) are treated as a tweet without tokens.

    Returns:
        A 1-D NumPy array whose length is the model's vector size: the
        element-wise mean of the token vectors, or all zeros when the tweet
        contains no known token.
    """
    keyed_vectors = word2vec_model.wv
    # Size and dtype come from the trained model rather than a hard-coded 100,
    # so the fallback always matches the shape and type of the real embeddings.
    zero_vector = np.zeros(keyed_vectors.vector_size, dtype=keyed_vectors.vectors.dtype)

    # word_tokenize only accepts text; bail out early for NaN / None input.
    if not isinstance(tweet, str):
        return zero_vector

    # Tokenize the tweet
    tokens = word_tokenize(tweet)
    # Get the word vectors for each known token and take the mean
    vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not vectors:
        # If no valid tokens are found, return a vector of zeros
        return zero_vector
    return sum(vectors) / len(vectors)

# Embed every tweet; each returned vector becomes one row of the new DataFrame
tweet_vectors = [get_tweet_vector(text) for text in df['tweet']]
embedding_df = pd.DataFrame(tweet_vectors)

# Attach the class labels as an extra column next to the embedding columns
embedding_df['class'] = df['class']

# Write the embeddings and labels to a CSV file, leaving out the row index
embedding_df.to_csv('word2vec_embeddings_3.csv', index=False)
