In [20]:
import os

file_path = '/content/Tweets.csv'

if os.path.exists(file_path):
    print(f"{file_path} exists.")
else:
    print(f"{file_path} does not exist.")


/content/Tweets.csv exists.


In [4]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

df= pd.read_csv(file_path)

df.head(5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [5]:
df_new= df[['text','airline_sentiment']]

In [6]:
df_new.head(5)

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


# Let us count the total negative, positive, neutral tweets

In [7]:
df_new['airline_sentiment'].value_counts()

Unnamed: 0_level_0,count
airline_sentiment,Unnamed: 1_level_1
negative,9178
neutral,3099
positive,2363


# Before we proceed to Tokenization we need to first clean our text

In [8]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df_new['clean_text'] = df_new['text'].apply(clean_text)


# Now, we will encode our texts to numerics

In [9]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df_new['label'] = label_encoder.fit_transform(df_new['airline_sentiment'])


In [10]:
df_new.head(5)

'''
so neutral is 1
negative is 0
positive is 2
'''


'\nso neutral is 1\nnegative is 0\npositive is 2\n'

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(df_new['clean_text'])

sequences = tokenizer.texts_to_sequences(df_new['clean_text'])
padded_sequences = pad_sequences(sequences, padding='post', maxlen=100)

# We have used post padding here as generally for LSTM's we prefer post padding

In [12]:
from sklearn.model_selection import train_test_split
import numpy as np

x = padded_sequences
y = np.array(df_new['label'])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset


x_train_tensor = torch.tensor(x_train,dtype=torch.long)
y_train_tensor = torch.tensor(y_train,dtype=torch.long)
x_test_tensor = torch.tensor(x_test,dtype=torch.long)
y_test_tensor = torch.tensor(y_test,dtype=torch.long)

train_set = TensorDataset(x_train_tensor,y_train_tensor)
test_set= TensorDataset(x_test_tensor, y_test_tensor)



In [14]:
train_loader= DataLoader(train_set, batch_size=64,shuffle=True)
test_loader= DataLoader(test_set,batch_size=64, shuffle=False)

# Let us now build our LSTM model

In [15]:
class LSTM(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_dim,output_dim,n_layers=10):
        super(LSTM,self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        out = self.dropout(hidden[-1])
        return self.fc(out)



In [16]:
vocab_size = 10000
embedding_dim = 128
hidden_dim = 128
output_dim = 3

model = LSTM(vocab_size, embedding_dim, hidden_dim, output_dim)


In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [18]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")


Epoch 1/10, Loss: 0.9313
Epoch 2/10, Loss: 0.9278
Epoch 3/10, Loss: 0.9249
Epoch 4/10, Loss: 0.9246
Epoch 5/10, Loss: 0.9236
Epoch 6/10, Loss: 0.9245
Epoch 7/10, Loss: 0.9233
Epoch 8/10, Loss: 0.9239
Epoch 9/10, Loss: 0.9230
Epoch 10/10, Loss: 0.9234


In [19]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Test Accuracy: {(correct / total) * 100:.2f}%")


Test Accuracy: 64.52%
