In [1]:
import kagglehub

# Fetch the latest Sentiment140 release through kagglehub; `path` is the local dataset directory
path = kagglehub.dataset_download("kazanova/sentiment140")

print(f"Path to dataset files: {path}")

Path to dataset files: /Users/marius/.cache/kagglehub/datasets/kazanova/sentiment140/versions/2


In [2]:
import os

# Show the dataset directory and the files it contains
print(f"Path to dataset files: {path}")
print(f"Files in dataset directory: {os.listdir(path)}")

Path to dataset files: /Users/marius/.cache/kagglehub/datasets/kazanova/sentiment140/versions/2
Files in dataset directory: ['training.1600000.processed.noemoticon.csv']


In [3]:
import pandas as pd

# Column names are supplied by hand, so pandas treats the first CSV line as data (no header row)
columns = ['target', 'id', 'date', 'flag', 'user', 'text']

file_path = os.path.join(path, "training.1600000.processed.noemoticon.csv")

full_data = pd.read_csv(
    file_path,
    names=columns,
    header=None,
    encoding='latin-1',
)

In [4]:
# Keep only the sentiment label and the tweet text; 'target' is renamed to 'label'
data = full_data[['target', 'text']].rename(columns={"target": "label"})

print(data.head(10))

   label                                               text
0      0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1      0  is upset that he can't update his Facebook by ...
2      0  @Kenichan I dived many times for the ball. Man...
3      0    my whole body feels itchy and like its on fire 
4      0  @nationwideclass no, it's not behaving at all....
5      0                      @Kwesidei not the whole crew 
6      0                                        Need a hug 
7      0  @LOLTrish hey  long time no see! Yes.. Rains a...
8      0               @Tatiana_K nope they didn't have it 
9      0                          @twittera que me muera ? 


In [5]:
# Sequential (non-parallel) variant of the preprocessing step, kept for reference only.
# The author measured roughly 3 minutes for it on their hardware.
# NOTE: the code is disabled by wrapping it in a triple-quoted string, so this cell defines
# nothing; because that string is the cell's last expression, Jupyter echoes it as the output.
# NOTE(review): the disabled code uses word_tokenize, stopwords, WordNetLemmatizer and re,
# none of which are imported in this cell — add those imports before enabling it.

''' Uncomment this if you want to run the preprocessing without parallelization!

def preprocess_text(text):
    # Tokenize text
    tokens = word_tokenize(text.lower())  # Lowercase and tokenize

    # Remove punctuation and non-alphabetic characters
    tokens = [re.sub(r"[^a-zA-Z]", "", token) for token in tokens]
    tokens = [token for token in tokens if token]  # Remove empty strings

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return ' '.join(lemmatized_tokens) 

data['processed_text'] = data['text'].apply(preprocess_text)
print(data.head())

'''

' Uncomment this if you want to run the preprocessing without parallelization!\n\ndef preprocess_text(text):\n    # Tokenize text\n    tokens = word_tokenize(text.lower())  # Lowercase and tokenize\n\n    # Remove punctuation and non-alphabetic characters\n    tokens = [re.sub(r"[^a-zA-Z]", "", token) for token in tokens]\n    tokens = [token for token in tokens if token]  # Remove empty strings\n\n    # Remove stopwords\n    stop_words = set(stopwords.words(\'english\'))\n    filtered_tokens = [word for word in tokens if word not in stop_words]\n\n    # Lemmatize tokens\n    lemmatizer = WordNetLemmatizer()\n    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n\n    return \' \'.join(lemmatized_tokens) \n\ndata[\'processed_text\'] = data[\'text\'].apply(preprocess_text)\nprint(data.head())\n\n'

In [6]:
# Preprocessing function with parallelization (on my hardware took ~2min)
# Roughly a 33% speed increase

from joblib import Parallel, delayed
import nltk
import pandas as pd
import re

# Fetch every NLTK resource the preprocessing function relies on:
#   stopwords -> stopwords.words('english')
#   punkt     -> word_tokenize
#   wordnet   -> WordNetLemmatizer (lemmatize() raises LookupError when the corpus is absent)
# NOTE(review): newer NLTK releases appear to load the tokenizer from 'punkt_tab' rather than
# 'punkt' — confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Stop-word set and lemmatizer, built on first use and then reused instead of being
# rebuilt for every tweet (re-reading the stop-word list per call is pure overhead).
# NOTE(review): under joblib each worker process fills its own copy of this cache.
_PREPROCESS_RESOURCES = {}

def preprocess_text(text):
    """Normalise one tweet into a space-separated string of lemmatized tokens.

    Steps: lowercase and tokenize with NLTK's ``word_tokenize``, strip every
    character outside ``a-z``/``A-Z`` from each token, drop tokens that end up
    empty, drop English stop words, then lemmatize with ``WordNetLemmatizer``.

    Args:
        text (str): Raw tweet text.

    Returns:
        str: Surviving tokens joined by single spaces ("" if none survive).
    """
    # NLTK imports stay function-local as in the original (presumably so that
    # worker processes resolve them on their own).
    from nltk.tokenize import word_tokenize

    if not _PREPROCESS_RESOURCES:
        from nltk.corpus import stopwords
        from nltk.stem import WordNetLemmatizer

        # Filled in one step so the cache is never observed half-initialised
        _PREPROCESS_RESOURCES.update(
            stop_words=frozenset(stopwords.words('english')),
            lemmatizer=WordNetLemmatizer(),
        )
    stop_words = _PREPROCESS_RESOURCES['stop_words']
    lemmatizer = _PREPROCESS_RESOURCES['lemmatizer']

    # Lowercase and tokenize, then remove punctuation / non-alphabetic characters
    tokens = word_tokenize(text.lower())
    tokens = [re.sub(r"[^a-zA-Z]", "", token) for token in tokens]

    # Discard tokens that became empty as well as stop words
    kept_tokens = [token for token in tokens if token and token not in stop_words]

    # Lemmatize and return the processed text as a single string
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)

def preprocess_and_return(row):
    """Run ``preprocess_text`` on the 'text' field of one DataFrame row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a 'text' entry.

    Returns:
        Whatever ``preprocess_text`` returns for that text.
    """
    raw_tweet = row['text']
    return preprocess_text(raw_tweet)

# Hand each worker the tweet string itself: iterating the 'text' column avoids building
# (and shipping to the workers) a pandas Series per row the way DataFrame.iterrows() does.
# Parallel returns results in input order, so they line up with data's rows.
data['processed_text'] = Parallel(n_jobs=-1)(
    delayed(preprocess_text)(tweet) for tweet in data['text']
)

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


In [7]:
# Sanity check: show the first tweet before and after preprocessing
first_tweet = data.iloc[0]
print(f"Original Tweet: {first_tweet['text']}")
print(f"Processed Tweet: {first_tweet['processed_text']}")

Original Tweet: @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
Processed Tweet: switchfoot http twitpiccomyzl awww bummer shoulda got david carr third day


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

class SentimentNN(nn.Module):
    """Two-layer feed-forward classifier: Linear -> ReLU -> Linear.

    The forward pass returns raw, unnormalised scores (logits), one per
    output class.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names and registration order (fc1, fc2, relu) define the state_dict layout
        self.fc1 = nn.Linear(input_dim, hidden_dim)   # input -> hidden
        self.fc2 = nn.Linear(hidden_dim, output_dim)  # hidden -> output
        self.relu = nn.ReLU()

    def forward(self, x):
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        logits = self.fc2(activated)
        return logits

# Hyperparameters — each value below is the one actually used further down.
# (hidden_dim used to be declared as 32 while the model was built with a literal 64;
# the declaration now carries the value the model is really built with.)
embedding_dim = 50   # NOTE(review): unused in this cell (TF-IDF features, no embedding layer); kept so any other reference still resolves
max_features = 1000  # upper bound on the TF-IDF vocabulary size
hidden_dim = 64      # width of the hidden layer
output_dim = 3       # Output dimension is 3 for multi-class classification (negative, neutral, positive)
learning_rate = 0.001
epochs = 10
batch_size = 64
seed = 42

# Raw dataset label -> contiguous class index, as required by CrossEntropyLoss
# NOTE(review): check data['label'].value_counts() — if label 2 never occurs in this file,
# class 1 (neutral) is never trained and cannot be predicted meaningfully.
label_to_index = {0: 0, 2: 1, 4: 2}

def encode_labels(labels):
    """Convert a Series of raw labels into a LongTensor of class indices.

    Raises KeyError if a label is missing from ``label_to_index``.
    """
    return torch.tensor(labels.apply(lambda raw: label_to_index[raw]).values, dtype=torch.long)

# Seed torch so weight initialisation and batch shuffling are repeatable across runs
torch.manual_seed(seed)

train_data, test_data = train_test_split(data, test_size=0.2, random_state=seed)

# Fit the vocabulary on the training split only, then reuse it for the test split
vectorizer = TfidfVectorizer(max_features=max_features)
# NOTE: .toarray() densifies the sparse TF-IDF matrix; this is memory-heavy on the full dataset
X_train_tfidf = vectorizer.fit_transform(train_data['processed_text']).toarray()
X_test_tfidf = vectorizer.transform(test_data['processed_text']).toarray()

X_train_tensor = torch.tensor(X_train_tfidf, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_tfidf, dtype=torch.float32)

y_train_tensor = encode_labels(train_data['label'])
y_test_tensor = encode_labels(test_data['label'])

# Size the input layer from the fitted features: the learned vocabulary may be
# smaller than max_features, in which case a hard-coded width would not match.
model = SentimentNN(input_dim=X_train_tensor.shape[1], hidden_dim=hidden_dim, output_dim=output_dim)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Compute loss
        loss = criterion(outputs, labels)
        running_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Predicted class = index of the largest logit
        predicted = outputs.argmax(dim=1)
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

    # Loss is averaged over batches; accuracy is computed over individual samples
    epoch_loss = running_loss / len(train_loader)
    epoch_accuracy = correct_predictions / total_predictions
    print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

# Evaluation on the test set (single forward pass over the whole split, no gradients)
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    predicted = outputs.argmax(dim=1)
    correct_predictions = (predicted == y_test_tensor).sum().item()
    total_predictions = y_test_tensor.size(0)
    accuracy = correct_predictions / total_predictions

print(f'Test Accuracy: {accuracy:.4f}')

Epoch 1/10, Loss: 0.5050, Accuracy: 0.7510
Epoch 2/10, Loss: 0.4891, Accuracy: 0.7602
Epoch 3/10, Loss: 0.4823, Accuracy: 0.7644
Epoch 4/10, Loss: 0.4774, Accuracy: 0.7678
Epoch 5/10, Loss: 0.4735, Accuracy: 0.7701
Epoch 6/10, Loss: 0.4702, Accuracy: 0.7722
Epoch 7/10, Loss: 0.4675, Accuracy: 0.7739
Epoch 8/10, Loss: 0.4649, Accuracy: 0.7756
Epoch 9/10, Loss: 0.4629, Accuracy: 0.7769
Epoch 10/10, Loss: 0.4611, Accuracy: 0.7779
Test Accuracy: 0.7599


In [9]:
def predict_sentiment(input_text, model, vectorizer, preprocess=None):
    """Classify one piece of text as 'negative', 'neutral' or 'positive'.

    The text goes through the same cleaning pipeline that produced the training
    features before it is vectorized; lowercasing alone would leave inference
    inputs un-lemmatized and therefore inconsistent with what the model saw.

    Args:
        input_text (str): Raw text to classify.
        model: Trained network; it is called on a float32 tensor built from the
            vectorizer output and must return one row of class scores.
        vectorizer: Fitted vectorizer exposing ``transform``; it must be the one
            used to build the training features.
        preprocess (callable, optional): Text-cleaning function applied before
            vectorizing. Defaults to the notebook's ``preprocess_text``.

    Returns:
        str: The predicted label.

    Side effects:
        Switches ``model`` to evaluation mode.
    """
    if preprocess is None:
        # Looked up at call time, so this cell can be defined on its own
        preprocess = preprocess_text
    processed_text = preprocess(input_text)

    # Convert text to TF-IDF features using the trained vectorizer
    input_tfidf = vectorizer.transform([processed_text]).toarray()
    input_tensor = torch.tensor(input_tfidf, dtype=torch.float32)

    # Inference only: evaluation mode, no gradient tracking
    model.eval()
    with torch.no_grad():
        outputs = model(input_tensor)
        # Predicted class = index of the largest score
        predicted_class = outputs.argmax(dim=1)

    # Map the class index back to a human-readable label
    label_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}
    return label_mapping[predicted_class.item()]

# Example usage: classify one hand-written sentence with the trained model
input_text = "The weather is average today."
predicted_sentiment = predict_sentiment(input_text, model, vectorizer)
print("Predicted Sentiment:", predicted_sentiment)

Predicted Sentiment: negative
