In [10]:
import numpy as np
import pandas as pd
import keras
from sklearn.model_selection import train_test_split


In [5]:
df = pd.read_csv('train.tsv', sep='\t')

In [9]:
X_text = df['Phrase']
y = df['Sentiment']

In [12]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [18]:


import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use GPU 0


In [19]:
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Example parameters
max_vocab_size = 10000
embedding_dim = 100
max_sequence_length = 200

# Tokenizer and padding
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_text)
sequences = tokenizer.texts_to_sequences(X_text)

# Padding
X_padded = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Model architecture
model = Sequential([
    Embedding(max_vocab_size, embedding_dim, input_length=max_sequence_length),
    LSTM(128, return_sequences=True),
    Dropout(0.5),
    LSTM(64),
    Dense(32, activation='relu'),
    Dense(5, activation='softmax')  # 5 classes for Sentiment
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))



Epoch 1/5
[1m3902/3902[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m821s[0m 210ms/step - accuracy: 0.5135 - loss: 1.2838 - val_accuracy: 0.5011 - val_loss: 1.2957
Epoch 2/5
[1m 913/3902[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m8:51[0m 178ms/step - accuracy: 0.5064 - loss: 1.2862

KeyboardInterrupt: 

In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parameters
MAX_VOCAB_SIZE = 10000  # Maximum vocabulary size
MAX_SEQUENCE_LENGTH = 200  # Maximum length of a sequence
EMBEDDING_DIM = 100  # Dimension of word embeddings
BATCH_SIZE = 32
EPOCHS = 5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Example dataset
X_text = df['Phrase']  # Replace with your text data
y = df['Sentiment']  # Replace with your target labels (0–4)

# Tokenization and padding
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X_text)
sequences = tokenizer.texts_to_sequences(X_text)
X_padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_val_tensor = torch.tensor(X_val, dtype=torch.long)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)

# PyTorch Dataset and DataLoader
class SentimentDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]

train_dataset = SentimentDataset(X_train_tensor, y_train_tensor)
val_dataset = SentimentDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define LSTM Model
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers, dropout):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=n_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = self.dropout(lstm_out[:, -1, :])  # Use the last hidden state
        output = self.fc(lstm_out)
        return output

# Instantiate the model
model = SentimentLSTM(
    vocab_size=MAX_VOCAB_SIZE,
    embed_dim=EMBEDDING_DIM,
    hidden_dim=128,
    output_dim=5,  # Number of classes (0–4)
    n_layers=2,
    dropout=0.5
).to(DEVICE)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training and evaluation loop
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs):
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0
        y_true, y_pred = [], []
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()
                y_true.extend(y_batch.cpu().numpy())
                y_pred.extend(torch.argmax(outputs, dim=1).cpu().numpy())

        f1 = f1_score(y_true, y_pred, average='weighted')
        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss / len(train_loader):.4f}, "
              f"Val Loss: {val_loss / len(val_loader):.4f}, F1 Score: {f1:.4f}")

# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, EPOCHS)

# Classification report
y_true, y_pred = [], []
model.eval()
with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
        outputs = model(X_batch)
        y_true.extend(y_batch.cpu().numpy())
        y_pred.extend(torch.argmax(outputs, dim=1).cpu().numpy())

print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=["negative", "somewhat negative", "neutral", "somewhat positive", "positive"]))


Epoch 1/5, Train Loss: 1.2879, Val Loss: 1.2965, F1 Score: 0.3345
Epoch 2/5, Train Loss: 1.2845, Val Loss: 1.2965, F1 Score: 0.3345
Epoch 3/5, Train Loss: 1.2835, Val Loss: 1.2966, F1 Score: 0.3345
Epoch 4/5, Train Loss: 1.2828, Val Loss: 1.2960, F1 Score: 0.3345
Epoch 5/5, Train Loss: 1.2825, Val Loss: 1.2956, F1 Score: 0.3345

Classification Report:
                   precision    recall  f1-score   support

         negative       0.00      0.00      0.00      1416
somewhat negative       0.00      0.00      0.00      5527
          neutral       0.50      1.00      0.67     15639
somewhat positive       0.00      0.00      0.00      6707
         positive       0.00      0.00      0.00      1923

         accuracy                           0.50     31212
        macro avg       0.10      0.20      0.13     31212
     weighted avg       0.25      0.50      0.33     31212



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0        156061
1        156062
2        156063
3        156064
4        156065
          ...  
66287    222348
66288    222349
66289    222350
66290    222351
66291    222352
Name: PhraseId, Length: 66292, dtype: int64

In [22]:
df_test = pd.read_csv('test.tsv', sep='\t')
X_text_test = df_test['Phrase']

In [28]:
# Replace NaN with an empty string
df_test['Phrase'] = df_test['Phrase'].fillna('')

# Ensure all entries are strings
df_test['Phrase'] = df_test['Phrase'].astype(str)


In [31]:
# Tokenize and pad the test data
sequences_test = tokenizer.texts_to_sequences(X_text_test)  # Use the same tokenizer as training
X_test_padded = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
# Convert test data to PyTorch tensor
X_test_tensor = torch.tensor(X_test_padded, dtype=torch.long).to(DEVICE)

# Make predictions
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    outputs = model(X_test_tensor)
    y_test_pred = torch.argmax(outputs, dim=1).cpu().numpy()  # Convert predictions to CPU and numpy


AttributeError: 'float' object has no attribute 'lower'

In [24]:
print(df_test['Phrase'].isnull().sum())  # Check for missing values
print(df_test['Phrase'].apply(lambda x: isinstance(x, str)).value_counts())  # Check data types


1
Phrase
True     66291
False        1
Name: count, dtype: int64


In [None]:
invalid_rows = df_test[~df_test['Phrase'].apply(lambda x: isinstance(x, str))]
print(invalid_rows)



Empty DataFrame
Columns: [PhraseId, SentenceId, Phrase]
Index: []


In [32]:
# Identify and handle missing or invalid values
df_test['Phrase'] = df_test['Phrase'].fillna('')  # Replace NaN with an empty string
df_test.loc[~df_test['Phrase'].apply(lambda x: isinstance(x, str)), 'Phrase'] = ''  # Replace invalid values

# Validate that all entries are strings
print(df_test['Phrase'].apply(lambda x: isinstance(x, str)).value_counts())  # Should be all True

# Tokenize and pad
sequences_test = tokenizer.texts_to_sequences(df_test['Phrase'])  # Tokenize
X_test_padded = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')


Phrase
True    66292
Name: count, dtype: int64


In [33]:
X_test_tensor = torch.tensor(X_test_padded, dtype=torch.long).to(DEVICE)

# Make predictions
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    outputs = model(X_test_tensor)
    y_test_pred = torch.argmax(outputs, dim=1).cpu().numpy()  # Convert predictions to CPU and numpy

OutOfMemoryError: CUDA out of memory. Tried to allocate 6.32 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 10.14 GiB is allocated by PyTorch, and 43.90 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [34]:
from torch.utils.data import DataLoader, TensorDataset

# Create DataLoader for the test data
test_dataset = TensorDataset(X_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Predict in batches
y_test_pred = []
model.eval()
with torch.no_grad():
    for batch in test_loader:
        X_batch = batch[0].to(DEVICE)
        outputs = model(X_batch)
        y_test_pred.extend(torch.argmax(outputs, dim=1).cpu().numpy())  # Store predictions

# Convert predictions to numpy array
y_test_pred = np.array(y_test_pred)


In [37]:
# Combine predictions with PhraseId
df_test['Sentiment'] = y_test_pred

# Save as .tsv file
submission_file = 'submission.csv'
df_test[['PhraseId', 'Sentiment']].to_csv(submission_file, index=False)

print(f"Submission file saved to {submission_file}")


Submission file saved to submission.csv
