# Sentiment Analysis on Tweets

## Import Libraries

In [None]:
import os
import pandas as pd
import re
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense

## Load Datasets

In [None]:
# Load variables from a .env file (if one is found) into the process
# environment, so os.getenv('SENTIMENT_140') can resolve the dataset path
# when the data is read.
load_dotenv()

In [None]:
# Resolve the dataset location from the environment and fail early with an
# actionable message: os.getenv returns None for an unset variable, and
# pd.read_csv rejects None with an error that does not mention the variable.
dataset_path = os.getenv('SENTIMENT_140')
if not dataset_path:
    raise RuntimeError(
        "Environment variable SENTIMENT_140 is not set; "
        "point it at the dataset CSV file (for example in .env)."
    )

# Column names are supplied explicitly; header=None spells out that the first
# row of the file is data (this is also pandas' behavior when only `names` is
# passed).
data = pd.read_csv(
    dataset_path,
    encoding='latin1',
    header=None,
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
)

## Data Preprocessing

In [None]:
# Clean the text: strip @mentions, #hashtags, and URLs (anything starting with "http")
_TWEET_NOISE = re.compile(r'@\w+|#\w+|http\S+')


def _strip_noise(tweet):
    """Return *tweet* with every mention, hashtag, and URL removed."""
    return _TWEET_NOISE.sub('', tweet)


data['cleaned_text'] = data['text'].apply(_strip_noise)

## Feature Engineering

In [None]:
# Define labels (y)
y = data['target']

# Split the cleaned text *before* vectorizing, so the TF-IDF vocabulary and
# IDF weights are learned from the training rows only. Fitting the vectorizer
# on the full corpus lets test-set statistics leak into the features.
# The row count, test_size, and random_state are unchanged, so the same rows
# land in each split as when the vectorized matrix was split.
text_train, text_test, y_train, y_test = train_test_split(
    data['cleaned_text'], y, test_size=0.2, random_state=42
)

# Tokenize and vectorize text using TF-IDF, capped at 5000 vocabulary terms
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus feature matrix, kept for any later cell that still refers to X.
# It is produced with the train-fitted vectorizer, so it adds no leakage.
X = vectorizer.transform(data['cleaned_text'])

## Model

### Naive Bayes Classifier

In [None]:
# Baseline model: a multinomial Naive Bayes classifier with default settings
model_mnb = MultinomialNB()

# Train on the TF-IDF training features and their sentiment labels
model_mnb.fit(X=X_train, y=y_train)

### LSTM

In [None]:
# Advanced model: stacked LSTM binary classifier built with Keras
model_lstm = Sequential([
    # Looks up a 64-dimensional vector for each integer id in [0, 5000)
    Embedding(input_dim=5000, output_dim=64),
    # return_sequences=True passes one output per timestep to the next LSTM
    LSTM(128, return_sequences=True),
    # Second recurrent layer returns only its final output
    LSTM(64),
    # Single sigmoid unit: one value in (0, 1) per input row
    Dense(1, activation='sigmoid'),
])

# The model was previously never compiled, so it could not be trained or
# evaluated. Binary cross-entropy matches the single sigmoid output.
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): the model is still not fitted anywhere in this notebook.
# Before calling fit(), the tweets need to be encoded as padded integer
# token-id sequences (ids below 5000), and the targets (positive class is
# labeled 4) must be mapped to 0/1 for this loss.

### Model Evaluation

#### Naive Bayes Classifier

In [None]:
# Predict labels for the held-out split with the Naive Bayes model
y_pred = model_mnb.predict(X_test)

# The positive class is labeled 4, so each metric is told to treat 4 as positive
nb_precision = precision_score(y_test, y_pred, pos_label=4)
nb_recall = recall_score(y_test, y_pred, pos_label=4)
nb_f1 = f1_score(y_test, y_pred, pos_label=4)

# Report Precision, Recall, and F1-score on one line
print(f"Precision: {nb_precision}, Recall: {nb_recall}, F1: {nb_f1}")

#### LSTM

In [None]:
# Convert Scipy sparse matrix to dense format
# NOTE(review): this materializes every test row at full feature width in
# memory, and TF-IDF weights are not the integer token ids an Embedding layer
# looks up. Confirm the intended LSTM input (padded token-id sequences) and
# that model_lstm has been trained before trusting these scores.
X_test_dense = X_test.toarray()

# predict() returns the sigmoid output for each row as an (n_samples, 1) float
# array; flatten it to one score per row.
y_prob = model_lstm.predict(X_test_dense).ravel()

# The sklearn metrics below need class labels, not continuous scores (they
# raise on a binary/continuous mix). Threshold at 0.5 and map to the dataset's
# label values: 4 for positive, 0 otherwise.
# Assumes the negative class is labeled 0; confirm against data['target'].
y_pred = (y_prob >= 0.5).astype(int) * 4

# Use Precision, Recall, and F1-score to evaluate performance
# Specify pos_label=4 since positive class is labeled as 4
print(f"Precision: {precision_score(y_test, y_pred, pos_label=4)}, Recall: {recall_score(y_test, y_pred, pos_label=4)}, F1: {f1_score(y_test, y_pred, pos_label=4)}")