In [130]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.models import Model

from tensorflow.keras.optimizers.legacy import Adam
from sklearn.metrics import accuracy_score

In [131]:
# Load the preprocessed dataset (path is relative to the notebook's working directory).
# NOTE(review): the 'Unnamed: 0' column reported by df.info() looks like a saved
# index column -- consider index_col=0 here; confirm against how the CSV was written.
df = pd.read_csv('preprocessed_data.csv')

In [132]:
# Inspect column names, dtypes and non-null counts before building features.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38050 entries, 0 to 38049
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    38050 non-null  int64  
 1   clean_text    38050 non-null  object 
 2   category      38050 non-null  float64
 3   Length        38050 non-null  float64
 4   Preprocessed  38050 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 1.5+ MB


In [133]:
# Pick the model input (preprocessed text column) and the target (label column).
X, y = df['Preprocessed'], df['category']

In [134]:
# Word2Vec hyperparameters (consumed by the Word2Vec(...) call in the next cell)
sg = 1             # training algorithm: 1 = skip-gram, 0 = CBOW
min_count = 1      # minimum token frequency; 1 keeps every token in the vocabulary
window = 5         # context window size
vector_size = 100  # dimensionality of each word vector

In [135]:
# Tokenize on whitespace: one list of tokens per document.
tokenized_texts = [text.split() for text in X]

# Train Word2Vec on the tokenized corpus.
# The seed is pinned explicitly and training is limited to a single worker
# thread: with several workers the update order depends on OS thread scheduling,
# so the vectors (and every downstream metric) would differ between runs.
# Reproducibility across interpreter launches additionally needs PYTHONHASHSEED
# to be fixed in the environment.
# NOTE(review): the embeddings are fitted on the full corpus, i.e. before the
# train/validation split performed later in this notebook -- confirm this mild
# leakage is acceptable.
word2vec_model = Word2Vec(
    tokenized_texts,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    sg=sg,
    seed=42,
    workers=1,
)


In [136]:
# Create average word embeddings for each text
def _average_embedding(tokens, keyed_vectors):
    """Return the mean word vector of ``tokens``.

    Tokens missing from ``keyed_vectors`` are skipped. If no token has a
    vector (e.g. an empty or whitespace-only text) a zero vector is returned,
    because ``np.mean([])`` would yield a scalar NaN and make the stacked
    feature matrix ragged.

    Args:
        tokens: iterable of string tokens for one document.
        keyed_vectors: the trained model's ``wv`` object (supports ``in``,
            item lookup and exposes ``vector_size``).

    Returns:
        1-D float32 array of length ``keyed_vectors.vector_size``.
    """
    vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not vectors:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)


# One row per document, in the same order as tokenized_texts.
X_embeddings = np.array([
    _average_embedding(tokens, word2vec_model.wv)
    for tokens in tokenized_texts
])


In [137]:
# Hold out 20% of the samples for validation; the fixed random_state keeps the
# partition identical across runs.
X_train_embeddings, X_val_embeddings, y_train, y_val = train_test_split(
    X_embeddings,
    y,
    test_size=0.2,
    random_state=42,
)


In [138]:
# Sanity check: training feature matrix is (n_train_samples, embedding_dim).
X_train_embeddings.shape

(30440, 100)

In [139]:
# Sanity check: validation feature matrix is (n_val_samples, embedding_dim).
X_val_embeddings.shape

(7610, 100)

In [148]:
# Remap label -1 to 0 in both splits so every target is a non-negative class
# index, which the sparse categorical cross-entropy loss used for training expects.
# NOTE(review): if 'category' already contains 0 as a separate class, this folds
# the two classes together -- confirm that is intended.
y_train, y_val = (
    np.where(labels == -1, 0, labels)
    for labels in (y_train, y_val)
)


In [149]:
# Define the sentiment analysis model
def create_sentiment_model(input_shape, num_classes):
    """Build an uncompiled feed-forward classifier over fixed-size embeddings.

    Architecture: Input -> Dense(128, relu) -> Dense(num_classes, softmax).

    Args:
        input_shape: shape tuple of a single sample, forwarded to ``Input``.
        num_classes: number of units in the softmax output layer.

    Returns:
        A Keras ``Model`` named ``'sentiment_model'``; compiling it is left to
        the caller.
    """
    embeddings_in = Input(shape=input_shape, dtype='float32', name='input_embeddings')
    hidden = Dense(128, activation='relu')(embeddings_in)  # hidden width; adjust as needed
    class_probs = Dense(num_classes, activation='softmax')(hidden)
    return Model(inputs=embeddings_in, outputs=class_probs, name='sentiment_model')

In [150]:
num_classes = 2  # Binary sentiment classification

# The feature dimension is the second axis of the training matrix.
sentiment_model = create_sentiment_model(
    num_classes=num_classes,
    input_shape=(X_train_embeddings.shape[1],),
)

# Integer class labels + softmax output -> sparse categorical cross-entropy.
sentiment_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-3),
    metrics=['accuracy'],
)


In [151]:
# Training
batch_size = 16
num_epochs = 10

# Fit on the training split and report validation metrics after every epoch;
# the returned History object is kept for later inspection.
history = sentiment_model.fit(
    x=X_train_embeddings,
    y=y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(X_val_embeddings, y_val),
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [152]:
# Evaluation on the validation split
y_pred = sentiment_model.predict(X_val_embeddings)  # softmax outputs, one row per sample
y_pred_labels = y_pred.argmax(axis=1)  # index of the highest-scoring class per sample
accuracy = accuracy_score(y_val, y_pred_labels)
print(f'Validation Accuracy: {accuracy:.2f}')

Validation Accuracy: 0.72
