In [1]:
import  random
import numpy as np

In [14]:

# Draw 10 integers from [1, 100) using NumPy's global random generator
np.random.randint(low=1, high=100, size=10)

array([64, 60, 21, 33, 76, 58, 22, 89, 49, 91], dtype=int32)

In [16]:
# Seed NumPy's and Python's global generators with the same value
np.random.seed(42)
random.seed(42)

# Draw 10 integers from [1, 100) and show them
print(np.random.randint(low=1, high=100, size=10))

[52 93 15 72 61 21 83 87 75 75]


In [17]:
# Training script
import os

# TensorFlow reads this flag while it is being imported, so it must be set
# before the first `import tensorflow` below; setting it afterwards has no effect.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import re
import random
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

## Configuration
# One seed shared by Python's `random`, NumPy and TensorFlow.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Input CSV and output directory (created if it does not exist yet).
data_path = "data/Tweets.csv"
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)

# hyperparams
max_num_words = 30000
max_seq_len = 100
Embedding_dim = 100
batch_size = 64
epochs = 6
test_size = 0.15  # fraction of all rows held out for testing
val_size = 0.15   # fraction of all rows held out for validation


# Utilities
def clean_text(text):
    """Normalize a raw tweet into lowercase, space-separated alphanumeric tokens.

    Steps, in order: treat missing values as empty, decode HTML entities,
    lowercase, drop URLs starting with "http", drop @mentions, replace every
    character outside ``a-z0-9`` and whitespace with a space, then collapse
    whitespace runs and trim the ends.

    Args:
        text: The raw text. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as missing.

    Returns:
        str: The cleaned text, or ``""`` when the input is missing.
    """
    import html  # local import keeps this helper self-contained

    # Missing values would otherwise be stringified into the words "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Decode entities first so "&amp;" becomes "&" (stripped below) rather than
    # leaking into the text as the token "amp".
    text = html.unescape(str(text)).lower()
    text = re.sub(r"http\S+", " ", text)      # URLs
    text = re.sub(r"@\w+", " ", text)         # @mentions
    text = re.sub(r"[^a-z0-9\s]", " ", text)  # punctuation and other symbols
    text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace
    return text


# Read the tweets CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=data_path)

# List the available column names
print(f"Columns: {df.columns.tolist()}")

Columns: ['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'airline', 'airline_sentiment_gold', 'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone']


In [18]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [19]:
# cleaning: keep only the text and sentiment columns and drop incomplete rows.
# The explicit .copy() makes df an independent frame, so the column assignments
# made on it afterwards cannot trigger pandas' SettingWithCopyWarning.
df = df[['text', 'airline_sentiment']].dropna().copy()

In [20]:
# Preview the first five rows after column selection
df.head(n=5)

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [21]:
# Run every tweet through clean_text, overwriting the text column
df['text'] = df['text'].astype(str).map(clean_text)

In [22]:
# Preview the first five rows after text cleaning
df.head(n=5)

Unnamed: 0,text,airline_sentiment
0,what said,neutral
1,plus you ve added commercials to the experienc...,positive
2,i didn t today must mean i need to take anothe...,neutral
3,it s really aggressive to blast obnoxious ente...,negative
4,and it s a really big bad thing about it,negative


In [23]:
## Encoding labels: fit the encoder on the sentiment strings, then map them to integer ids
le = LabelEncoder().fit(df['airline_sentiment'])
df['label'] = le.transform(df['airline_sentiment'])
print(f"Label classes: {list(le.classes_)}")

Label classes: ['negative', 'neutral', 'positive']


In [25]:
# Hold out the test set first, stratified on the label
X_train_full, X_test, y_train_full, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=test_size,
    random_state=SEED,
    stratify=df['label'],
)

# Split the remainder into train and validation; val_size is rescaled because
# it is expressed as a fraction of the full dataset, not of the remainder
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=val_size / (1 - test_size),
    random_state=SEED,
    stratify=y_train_full,
)

In [26]:
# Compute "balanced" class weights from the training labels to mitigate imbalance
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)

# Plain-Python {class id: weight} mapping
class_weight_dict = dict(zip(classes.tolist(), class_weights.tolist()))
print("Class weights:", class_weight_dict)

Class weights: {0: 0.5317559153175592, 1: 1.5749193176579068, 2: 2.06404833836858}
