<a href="https://colab.research.google.com/github/Koanah/SentimentAnalysis/blob/main/GRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Load and preprocess data
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout


In [None]:
# Load dataset: read the CSV with explicitly supplied column names.
filename = "/content/sentiment_dataset.csv"
columns = "target ids date flag user text".split()
data = pd.read_csv(filename, names=columns, encoding="ISO-8859-1")

# Show which label values occur in the file
print(data['target'].unique())

In [None]:
# Only keep 0 and 4 labels, then convert 4 to 1 (so 0 = negative, 1 = positive).
# Filtering and relabelling happen in one chain that returns a new frame, so
# nothing is assigned into a filtered slice (avoids SettingWithCopyWarning).
# NOTE: this cell expects the labels as loaded; once 4 has become 1, running
# it again would filter the positive rows out -- re-run the load cell first.
keep_mask = data['target'].isin([0, 4])
data = data.loc[keep_mask].assign(
    target=lambda frame: frame['target'].replace({4: 1})
)

print("Label distribution:")
print(data['target'].value_counts())

In [None]:
# Preprocess text
def clean_text(text):
    """Normalize a raw tweet before tokenization.

    Lower-cases the text, removes @mentions and URLs, drops the '#' symbol
    (the hashtag word itself is kept) and collapses runs of whitespace.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string values (e.g. the float NaN that pandas
        produces for an empty CSV field) are treated as empty text.

    Returns
    -------
    str
        The cleaned text; "" when the input is not a string.
    """
    if not isinstance(text, str):
        # A missing value would otherwise fail on .lower() and abort the
        # whole .apply() pass over the column.
        return ""
    text = text.lower()
    text = re.sub(r'@\w+', '', text)  # remove mentions
    # 'http\S+' already matches https links, so no separate alternative is
    # needed; the pattern has no ^/$ anchors, so no MULTILINE flag either.
    text = re.sub(r'http\S+|www\S+', '', text)  # remove URLs
    text = text.replace('#', '')  # remove hashtag symbol
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra whitespace
    return text

# Build the working frame `df` from `data` with the text column cleaned.
# `df` was never defined before this point (the loaded frame is `data`), and
# assign() returns a new frame, so `data` keeps the raw text and this cell
# can be re-run safely.
df = data.assign(text=data['text'].apply(clean_text))

In [None]:
# Step 2: Tokenize and pad sequences
maxlen = 100  # max length of input sequences

# Vocabulary size is capped via num_words; unknown words map to "<OOV>"
tokenizer = Tokenizer(oov_token="<OOV>", num_words=50000)
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])

# Pad short sequences and cut long ones at the end so all rows have maxlen ids
padded = pad_sequences(
    sequences,
    maxlen=maxlen,
    truncating='post',
    padding='post',
)

In [None]:
# Step 3: Train/test split (80% train / 20% test, fixed seed)
X, y = padded, df['target'].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Step 4: Build GRU model
model = Sequential()
# Declare the input shape with an explicit Input layer; Embedding's
# `input_length` argument is deprecated in current Keras releases.
model.add(tf.keras.Input(shape=(maxlen,)))
# mask_zero=True marks index 0 (the fill value pad_sequences uses by default)
# as padding, so the GRU skips the post-padded timesteps instead of reading
# them as real tokens before producing its final state.
model.add(Embedding(input_dim=50000, output_dim=64, mask_zero=True))
model.add(GRU(64, return_sequences=False))
model.add(Dropout(0.5))
# Single sigmoid unit: probability of the positive class (label 1)
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Step 5: Train the model
# Validation uses a slice held out from the training split (validation_split)
# instead of X_test, so the test set is first seen in the evaluation step
# and is not the data being monitored while training is tuned.
history = model.fit(
    X_train,
    y_train,
    epochs=4,
    batch_size=128,
    validation_split=0.1,
)

In [None]:
# Step 6: Evaluate the model
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions
probabilities = model.predict(X_test)
y_pred = (probabilities > 0.5).astype("int32")

test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("\nAccuracy:", test_accuracy)
print("\nClassification Report:\n", report)