In [8]:
!pip install sagemaker ipywidgets --upgrade

Collecting sagemaker
  Using cached sagemaker-2.253.1-py3-none-any.whl.metadata (17 kB)
Collecting attrs<26,>=24 (from sagemaker)
  Using cached attrs-25.4.0-py3-none-any.whl.metadata (10 kB)
Collecting boto3<2.0,>=1.39.5 (from sagemaker)
  Using cached boto3-1.40.55-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.41.0,>=1.40.55 (from boto3<2.0,>=1.39.5->sagemaker)
  Using cached botocore-1.40.55-py3-none-any.whl.metadata (5.7 kB)
Collecting s3transfer<0.15.0,>=0.14.0 (from boto3<2.0,>=1.39.5->sagemaker)
  Using cached s3transfer-0.14.0-py3-none-any.whl.metadata (1.7 kB)
Using cached sagemaker-2.253.1-py3-none-any.whl (1.7 MB)
Using cached attrs-25.4.0-py3-none-any.whl (67 kB)
Using cached boto3-1.40.55-py3-none-any.whl (139 kB)
Using cached botocore-1.40.55-py3-none-any.whl (14.1 MB)
Using cached s3transfer-0.14.0-py3-none-any.whl (85 kB)
Installing collected packages: attrs, botocore, s3transfer, boto3, sagemaker
[2K  Attempting uninstall: attrs
[2K    Found existing insta

In [18]:
import sagemaker, boto3, json
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sagemaker import get_execution_role

aws_role = get_execution_role()
aws_region = boto3.Session().region_name
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = get_execution_role()

prefix = 'sagemaker/HousingPrice'

from sagemaker.tensorflow.model import TensorFlowModel
sm_mod = TensorFlowModel(model_data = 's3://sagemaker-us-east-2-116981761397/Completed_Models/tweet_sentiment_model.tar.gz', role = role, framework_version='2.19')


predictor = sm_mod.deploy(name = 'TweetSentiment', initial_instance_count=1,
                                   instance_type='ml.m4.xlarge')

--

# Quick NLP Strategy Overview

1. basically just make a regression ML  model, convert all of the text into a matrix, then put that matrix into some kind of ML model

    a. At very basic level, could do all of this manually

    b. What's much easier, however, is to import basic vectorizers from SKLearn, then use those. Way easier and better than manually converting
        For example, TfidfVectorizer, or others from here: https://scikit-learn.org/stable/api/sklearn.feature_extraction.html
        tfIDF vectorizer weights text inversely to their presence in the documents. This makes common words ('A', 'IS', 'ME') matter a lot less than less common words
        Also, each vectorizer has many different ways of creating n-grams. Some are more complex, but a lot more resilient to misspellings, such as 'char_wb' in CountVectorizer.

    c. Most of these vectorizers are great for SIMPLE solutions, but usually fail when trying to understand a sentence, or classify a complex sentence, since they're order-agnostic and don't really understand
            how each of these words fit together in a sentence. 
2. Transfer Learning / Vectorizing
    a.  Generally speaking, transfer learning is used with Vectorizer models which are easily available for download, such as GLOVE (search "GLOVE Stanford")
    b. These vectorizer models are hard to train but easy to use, especially with easy vocabs and small data sets. For larger data sets, you might need to add to their vocab
    c. Vectorizers make the model truly try to parse to sentence, which is great for complex problems such as this one, where we're trying to find the sentiment of a tweet
    d. Lastly, you input the vectors of N dimensions into some kind of ML solution, like XGBoost or LightGBM, or simpler models that can handle numerical values like SVM
3. Deep Learning (maybe with transfer learning?)
    a. This just uses transfer learning to convert text into Vectors, and then puts those Vectors as input into a Deep Learning algorithm
    b. One can also use some kind of LSTM to help with context, see the second Deep Learning algo for example
    c. Generally speaking, LSTM and GRU are the state of the art FOR SIMPLER MODELS. Anything more complex, and you get into Encoder-Decoder models (see below)
4. STEMMING/Lemmatization
    You can use the NLTK package to stem words, and also to Lemmatization. (Lemmatization is better than Stemming)
5. Recurrent Neural Networks
    Input sequences of varying length into a neural network
6. Encoder-Decoder models (Also called Sequence-To-Sequence or Seq2Seq)
    Won't spend much time on these since they're incredibly complicated, and more for language translation, etc. 
    BUT, https://keras.io/examples/nlp/lstm_seq2seq/ is a link to a recent, basic implementation of a Seq2Seq. 
    

In [1]:
%pip install np_utils
%pip install tqdm
%pip install xgboost

Collecting np_utils
  Downloading np_utils-0.6.0.tar.gz (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: np_utils
  Building wheel for np_utils (setup.py) ... [?25l[?25hdone
  Created wheel for np_utils: filename=np_utils-0.6.0-py3-none-any.whl size=56437 sha256=92546d0d7bad5916b19caa21803129e4de737560eaf349db7fb40f2c675e93b5
  Stored in directory: /root/.cache/pip/wheels/19/0d/33/eaa4dcda5799bcbb51733c0744970d10edb4b9add4f41beb43
Successfully built np_utils
Installing collected packages: np_utils
Successfully installed np_utils-0.6.0
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers import LSTM, GRU
from keras.layers import Dense, Activation, Dropout
from keras.layers import Embedding
from keras.layers import BatchNormalization
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from tensorflow.keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
import tensorflow.keras.utils as np_utils
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from sklearn.metrics import multilabel_confusion_matrix

2025-10-16 16:39:54.355152: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760632794.827214      76 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760632794.935928      76 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
train = pd.read_csv('/kaggle/input/tweetsentiment/train.csv')
test = pd.read_csv('/kaggle/input/tweetsentiment/test.csv')

train.dropna(axis=0, inplace=True)

train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


# Test tfIDF 

In [4]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [5]:
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train.sentiment.values)

train_x = train.drop(['sentiment', 'textID', 'selected_text'], axis=1)
train_x.head()

Unnamed: 0,text
0,"I`d have responded, if I were going"
1,Sooo SAD I will miss you here in San Diego!!!
2,my boss is bullying me...
3,what interview! leave me alone
4,"Sons of ****, why couldn`t they put them on t..."


In [6]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train_x.text, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

In [7]:
tfv = TfidfVectorizer(min_df=2, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 3), sublinear_tf=True, stop_words='english')

tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)

In [8]:
print(xtrain_tfv)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 215623 stored elements and shape (24732, 25839)>
  Coords	Values
  (0, 23186)	0.4969094497616219
  (0, 14624)	0.455937339996419
  (0, 14593)	0.24651713964546493
  (0, 11122)	0.5114445370160733
  (0, 11073)	0.17769021050386846
  (0, 3438)	0.4373581086971128
  (1, 24407)	0.2745689665506929
  (1, 19846)	0.5154312415191148
  (1, 12679)	0.46548872186821805
  (1, 10789)	0.44927348188077587
  (1, 5982)	0.3306588230156463
  (1, 3546)	0.3620508927312193
  (2, 24926)	0.45004604359359124
  (2, 23143)	0.5424761049807123
  (2, 7016)	0.7093505721226357
  (3, 25335)	0.36387786204609246
  (3, 14665)	0.531111692026417
  (3, 14593)	0.2936935382031307
  (3, 8148)	0.558318416403921
  (3, 8129)	0.36387786204609246
  (3, 8025)	0.23479954246188478
  (4, 23779)	0.20317774588204116
  (4, 16334)	0.40354513991620555
  (4, 15549)	0.33938391461412465
  (4, 6159)	0.31175117893925297
  :	:
  (24729, 18840)	0.21534050519005996
  (24729, 15377)	0.3171426331

In [9]:
clf = LogisticRegression(C=1.0, solver='saga')
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.737 


# Transfer Learning

In [10]:
import os

embeddings_index = {}
with open('/kaggle/input/glove-vectorize-embedding/wiki_giga_2024_300_MFT20_vectors_seed_2024_alpha_0.75_eta_0.05_combined.txt') as f:
    try:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    except:
        f.__next__()
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 4927 word vectors.


In [11]:
def sent2vec(s):
    words = str(s).lower()#.decode('utf-8')
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    # This sum is VERY IMPORTANT. It ensures that the output vector is of fixed size, regardless of how many words are in the input. 
    # It also gets the overall vector for the whole sentence, not just individual words. Which is very helpful for reducing calculations.
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

# create sentence vectors using the above function for training and validation set
xtrain_glove = [sent2vec(x) for x in tqdm(xtrain)]
xvalid_glove = [sent2vec(x) for x in tqdm(xvalid)]

100%|██████████| 24732/24732 [00:04<00:00, 5580.87it/s]
100%|██████████| 2748/2748 [00:00<00:00, 5705.90it/s]


In [12]:
xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)

In [13]:
# Fitting a simple xgboost on glove features

# clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
#                         subsample=0.8, learning_rate=0.1, silent=False, verbosity=1)
# clf.fit(xtrain_glove, ytrain)
# predictions = clf.predict_proba(xvalid_glove)

# print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

# Deep Learning

In [14]:
scl = preprocessing.StandardScaler()
xtrain_glove_scl = scl.fit_transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)

In [15]:
# we need to binarize the labels for the neural net
ytrain_enc = np_utils.to_categorical(ytrain)
yvalid_enc = np_utils.to_categorical(yvalid)

In [16]:
model = Sequential()

model.add(Dense(300, input_dim=300, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(300, activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())

model.add(Dense(3))
model.add(Activation('softmax'))

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1760632820.531565      76 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1760632820.532208      76 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [17]:
model.fit(xtrain_glove_scl, y=ytrain_enc, batch_size=64, 
          epochs=5, verbose=1, 
          validation_data=(xvalid_glove_scl, yvalid_enc))

Epoch 1/5


I0000 00:00:1760632824.982135     133 service.cc:148] XLA service 0x7fefe0109f00 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1760632824.983314     133 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1760632824.983333     133 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1760632825.357404     133 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 76/387[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m0s[0m 2ms/step - loss: 1.3574

I0000 00:00:1760632827.203599     133 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - loss: 1.1858 - val_loss: 0.9331
Epoch 2/5
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.9037 - val_loss: 0.9023
Epoch 3/5
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.8828 - val_loss: 0.9003
Epoch 4/5
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.8607 - val_loss: 0.8940
Epoch 5/5
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.8441 - val_loss: 0.9020


<keras.src.callbacks.history.History at 0x7ff0adc4e410>

In [18]:
# using keras tokenizer here
token = text.Tokenizer(num_words=None)
max_len = 70

token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

# zero pad the sequences
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

word_index = token.word_index

print(word_index)



In [19]:
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print(embedding_matrix)

100%|██████████| 26598/26598 [00:00<00:00, 874315.99it/s]

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.23294701 -0.039866    0.22511899 ... -0.115015   -0.213442
  -0.409242  ]
 [ 0.07615     0.39768001  0.49289101 ... -0.345256    0.13596199
  -0.325124  ]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]





In [20]:
# # A simple LSTM with glove embeddings and two dense layers
# model = Sequential()
# model.add(Embedding(len(word_index) + 1,
#                      300,
#                      weights=[embedding_matrix],
#                      input_length=max_len,
#                      trainable=False))
# model.add(SpatialDropout1D(0.3))
# model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))

# model.add(Dense(1024, activation='relu'))
# model.add(Dropout(0.8))

# model.add(Dense(1024, activation='relu'))
# model.add(Dropout(0.8))

# model.add(Dense(3))
# model.add(Activation('softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam')

# model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=40, verbose=1, validation_data=(xvalid_pad, yvalid_enc))

In [21]:
# model = Sequential()
# model.add(Embedding(len(word_index) + 1,
#                      300,
#                      weights=[embedding_matrix],
#                      input_length=max_len,
#                      trainable=False))
# model.add(SpatialDropout1D(0.3))
# model.add(Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)))

# model.add(Dense(1024, activation='relu'))
# model.add(Dropout(0.8))

# model.add(Dense(1024, activation='relu'))
# model.add(Dropout(0.8))

# model.add(Dense(3))
# model.add(Activation('softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam')

# # Fit the model with early stopping callback
# earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
# model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=50, 
#           verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])

In [23]:
# # GRU with glove embeddings and two dense layers
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Fit the model with early stopping callback
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=20, 
          verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])

Epoch 1/20




[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 314ms/step - loss: 1.1148 - val_loss: 0.9985
Epoch 2/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 290ms/step - loss: 1.0114 - val_loss: 0.9383
Epoch 3/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 291ms/step - loss: 0.9593 - val_loss: 0.8956
Epoch 4/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 294ms/step - loss: 0.9366 - val_loss: 0.8783
Epoch 5/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 289ms/step - loss: 0.9173 - val_loss: 0.8698
Epoch 6/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 288ms/step - loss: 0.9021 - val_loss: 0.8491
Epoch 7/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 290ms/step - loss: 0.8911 - val_loss: 0.8527
Epoch 8/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 289ms/step - loss: 0.8741 - val_loss: 0.8425
Epoch 9/20
[1m49/49[0m [32m━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7ff004d04dd0>

In [24]:
model.save('tweet_sentiment_model.keras')

In [25]:
import tensorflow as tf
new_model = tf.keras.models.load_model('tweet_sentiment_model.keras')
new_model.summary()