In [1]:
!pip install -q tensorflow pandas numpy scikit-learn nltk


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
import pickle
import re
import warnings
warnings.filterwarnings('ignore')

# Deep Learning imports
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, Bidirectional, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# NLP imports
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Download required NLTK data
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [3]:
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)

# Set GPU memory growth to avoid OOM errors
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")

TensorFlow version: 2.20.0
GPU Available: []


In [4]:
# ============== DATA PREPROCESSING ==============

class TweetPreprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
    
    def clean_tweet(self, text):
        """Clean and preprocess tweet text"""
        if pd.isna(text):
            return ""
        
        # Convert to string and lowercase
        text = str(text).lower()
        
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        
        # Remove user mentions but keep hashtags as they might be meaningful
        text = re.sub(r'@\w+', '', text)
        
        # Remove hashtag symbol but keep the word
        text = re.sub(r'#', '', text)
        
        # Remove special characters and digits but keep spaces
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        
        # Remove extra whitespace
        text = ' '.join(text.split())
        
        # For gaming tweets, we might want to keep some gaming-specific terms
        # Don't remove all stopwords for better context in gaming tweets
        tokens = text.split()
        
        # Only remove very common stopwords that don't affect sentiment
        very_common_stops = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for'}
        tokens = [word for word in tokens if word not in very_common_stops or len(word) > 2]
        
        return ' '.join(tokens)

In [5]:
# ============== LOAD AND PREPARE DATA ==============

def load_twitter_data(filepath='twitter_training.csv'):
    """
    Load Twitter dataset with format: ID, Entity, Sentiment, Text
    """
    try:
        # Load CSV with proper column names
        # Assuming columns are: ID, Entity, Sentiment, Text
        df = pd.read_csv(filepath, names=['id', 'entity', 'sentiment', 'text'], 
                         header=None, encoding='utf-8')
        
        # Convert sentiment labels to lowercase for consistency
        df['sentiment'] = df['sentiment'].str.lower().str.strip()
        
        # Map sentiment to numerical values
        sentiment_mapping = {
            'positive': 2,
            'neutral': 1,
            'negative': 0
        }
        
        # Apply mapping
        df['sentiment_numeric'] = df['sentiment'].map(sentiment_mapping)
        
        # Remove rows with unmapped sentiments
        df = df.dropna(subset=['sentiment_numeric'])
        df['sentiment_numeric'] = df['sentiment_numeric'].astype(int)
        
        print(f"Loaded {len(df)} tweets")
        print(f"Entities: {df['entity'].value_counts().head()}")
        print(f"Sentiment distribution:\n{df['sentiment'].value_counts()}")
        
        # Rename for consistency with rest of code
        df['sentiment'] = df['sentiment_numeric']
        
        return df[['text', 'sentiment', 'entity']]
        
    except FileNotFoundError:
        print(f"File {filepath} not found. Using sample data for demonstration.")
        return load_sample_data()
    except Exception as e:
        print(f"Error loading data: {e}")
        print("Using sample data for demonstration.")
        return load_sample_data()

def load_sample_data():
    """
    Sample data for demonstration if file not found
    """
    # Sample gaming-related tweets similar to your format
    sample_tweets = [
        "im getting on borderlands and i will murder you all",
        "This game is terrible, waste of money",
        "Just another average shooter game",
        "Best game ever! Can't stop playing",
        "Worst gaming experience, uninstalling now",
        "Gameplay is okay but nothing special",
        "Love this game so much, amazing graphics",
        "Broken mechanics, needs major fixes",
        "It's alright, worth trying on sale",
        "Absolutely fantastic multiplayer experience"
    ] * 50  # Multiply for more training data
    
    sample_entities = ['Borderlands', 'Gaming', 'Borderlands', 'Gaming', 'FPS',
                      'Borderlands', 'Gaming', 'FPS', 'Borderlands', 'Gaming'] * 50
    
    # Labels: 0=negative, 1=neutral, 2=positive
    sample_labels = [2, 0, 1, 2, 0, 1, 2, 0, 1, 2] * 50
    
    df = pd.DataFrame({
        'text': sample_tweets,
        'sentiment': sample_labels,
        'entity': sample_entities
    })
    
    return df

# Load your data
# Try to load the actual CSV file, fallback to sample if not found
df = load_twitter_data('twitter_training.csv')  # Specify your file path here

print(f"Dataset shape: {df.shape}")
print(f"Sentiment distribution:\n{df['sentiment'].value_counts()}")
print(f"Sample tweets:")
print(df[['text', 'sentiment']].head())

Loaded 61692 tweets
Entities: entity
TomClancysGhostRecon    2322
MaddenNFL               2310
TomClancysRainbowSix    2304
Microsoft               2226
Nvidia                  2208
Name: count, dtype: int64
Sentiment distribution:
sentiment
negative    22542
positive    20832
neutral     18318
Name: count, dtype: int64
Dataset shape: (61692, 3)
Sentiment distribution:
sentiment
0    22542
2    20832
1    18318
Name: count, dtype: int64
Sample tweets:
                                                text  sentiment
0  im getting on borderlands and i will murder yo...          2
1  I am coming to the borders and I will kill you...          2
2  im getting on borderlands and i will kill you ...          2
3  im coming on borderlands and i will murder you...          2
4  im getting on borderlands 2 and i will murder ...          2


In [6]:
# ============== PREPROCESS DATA ==============

preprocessor = TweetPreprocessor()
df['cleaned_text'] = df['text'].apply(preprocessor.clean_tweet)

# Remove empty texts
df = df[df['cleaned_text'] != '']

print(f"Dataset after cleaning: {df.shape}")

Dataset after cleaning: (60590, 4)


In [7]:
# ============== TOKENIZATION AND PADDING ==============

# Hyperparameters optimized for 8GB GPU
MAX_WORDS = 10000  # Vocabulary size
MAX_LEN = 100      # Maximum sequence length
EMBEDDING_DIM = 128  # Embedding dimension

# Initialize tokenizer
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(df['cleaned_text'])

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
X = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')

# Prepare labels
y = df['sentiment'].values

# Convert to categorical (one-hot encoding)
num_classes = len(np.unique(y))
y_categorical = keras.utils.to_categorical(y, num_classes)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

Training set: (48472, 100)
Test set: (12118, 100)


In [8]:
# ============== BUILD MODEL ==============

def create_lstm_model():
    """Create LSTM model optimized for 8GB GPU"""
    model = Sequential([
        # Embedding layer
        Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_LEN),
        
        # Bidirectional LSTM with dropout
        Bidirectional(LSTM(64, return_sequences=True, dropout=0.2)),
        Bidirectional(LSTM(32, dropout=0.2)),
        
        # Dense layers
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(32, activation='relu'),
        Dropout(0.3),
        
        # Output layer
        Dense(num_classes, activation='softmax')
    ])
    
    return model

def create_cnn_model():
    """Alternative: CNN model (faster training, less GPU memory)"""
    model = Sequential([
        Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_LEN),
        Conv1D(128, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    
    return model

# Choose model based on your preference
# Use CNN for faster training and less memory usage
# Use LSTM for potentially better accuracy
model = create_lstm_model()  # or create_cnn_model()

# Compile model
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

In [9]:
# ============== TRAIN MODEL ==============

# Callbacks for better training
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
        verbose=1
    ),
    ModelCheckpoint(
        'best_model.keras',
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        verbose=1,
        min_lr=0.00001
    )
]

# Train model with smaller batch size for GPU memory optimization
BATCH_SIZE = 32  # Adjust based on GPU memory
EPOCHS = 20

history = model.fit(
    X_train, y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/20
[1m1212/1212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step - accuracy: 0.5306 - loss: 0.9440
Epoch 1: val_accuracy improved from None to 0.77318, saving model to best_model.keras
[1m1212/1212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 119ms/step - accuracy: 0.6480 - loss: 0.8043 - val_accuracy: 0.7732 - val_loss: 0.5777 - learning_rate: 0.0010
Epoch 2/20
[1m1212/1212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step - accuracy: 0.8237 - loss: 0.4852
Epoch 2: val_accuracy improved from 0.77318 to 0.82341, saving model to best_model.keras
[1m1212/1212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 112ms/step - accuracy: 0.8313 - loss: 0.4643 - val_accuracy: 0.8234 - val_loss: 0.4538 - learning_rate: 0.0010
Epoch 3/20
[1m1212/1212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step - accuracy: 0.8841 - loss: 0.3210
Epoch 3: val_accuracy improved from 0.82341 to 0.84332, saving model to best_model.keras
[

In [10]:
# ============== EVALUATE MODEL ==============

# Load best model
model = load_model('best_model.keras')

# Evaluate on test set
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Accuracy: {test_accuracy:.4f}")
print(f"Test Loss: {test_loss:.4f}")

# Get predictions
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Classification report
sentiment_labels = ['Negative', 'Neutral', 'Positive']
print("\nClassification Report:")
print(classification_report(y_test_classes, y_pred_classes, target_names=sentiment_labels))


Test Accuracy: 0.8841
Test Loss: 0.5470
[1m379/379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 39ms/step

Classification Report:
              precision    recall  f1-score   support

    Negative       0.89      0.90      0.90      4436
     Neutral       0.87      0.86      0.87      3582
    Positive       0.88      0.88      0.88      4100

    accuracy                           0.88     12118
   macro avg       0.88      0.88      0.88     12118
weighted avg       0.88      0.88      0.88     12118



In [11]:
# ============== SAVE MODEL AND PREPROCESSORS ==============

def save_model_artifacts(model, tokenizer, preprocessor, model_name='twitter_sentiment'):
    """Save all model artifacts for future use"""
    
    # 1. Save the Keras model
    model.save(f'{model_name}_model.keras')
    print(f"✓ Model saved as '{model_name}_model.keras'")
    
    # 2. Save the tokenizer
    with open(f'{model_name}_tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"✓ Tokenizer saved as '{model_name}_tokenizer.pickle'")
    
    # 3. Save preprocessing configuration
    config = {
        'max_words': MAX_WORDS,
        'max_len': MAX_LEN,
        'embedding_dim': EMBEDDING_DIM,
        'num_classes': num_classes,
        'sentiment_labels': sentiment_labels
    }
    with open(f'{model_name}_config.pickle', 'wb') as handle:
        pickle.dump(config, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"✓ Configuration saved as '{model_name}_config.pickle'")
    
    print("\n✅ All model artifacts saved successfully!")

# Save everything
save_model_artifacts(model, tokenizer, preprocessor)

✓ Model saved as 'twitter_sentiment_model.keras'
✓ Tokenizer saved as 'twitter_sentiment_tokenizer.pickle'
✓ Configuration saved as 'twitter_sentiment_config.pickle'

✅ All model artifacts saved successfully!
