In [13]:
import re
def tokenize(text):
    # Split text into words using whitespace and punctuation as delimiters
    words = re.findall(r'\b\w+\b', text)
    return words

tokens = tokenize("i am ayoub, from ensia the national higher shchool of ai, I created this for learning purposes and help students when doing ai tasks")
tokens

['i',
 'am',
 'ayoub',
 'from',
 'ensia',
 'the',
 'national',
 'higher',
 'shchool',
 'of',
 'ai',
 'I',
 'created',
 'this',
 'for',
 'learning',
 'purposes',
 'and',
 'help',
 'students',
 'when',
 'doing',
 'ai',
 'tasks']

In [None]:
# Import the necessary functions
from torchtext.data.utils import get_tokenizer

text = "In the city of Dataville, a data analyst named Alex explores hidden insights within vast data. With determination, Alex uncovers patterns, cleanses the data, and unlocks innovation. Join this adventure to unleash the power of data-driven decisions."
tokenizer = get_tokenizer("basic_english")
tokens = tokenizer(text)


In [14]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [15]:
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
filtered_tokens

['ayoub',
 'ensia',
 'national',
 'higher',
 'shchool',
 'ai',
 'created',
 'learning',
 'purposes',
 'help',
 'students',
 'ai',
 'tasks']

In [16]:
# stemming 
from nltk.stem import PorterStemmer
stemmer = PorterStemmer() # doesnt work for slangs like darija ...
stemmed_tokens = [stemmer.stem(token) for token in tokens]
stemmed_tokens

['i',
 'am',
 'ayoub',
 'from',
 'ensia',
 'the',
 'nation',
 'higher',
 'shchool',
 'of',
 'ai',
 'i',
 'creat',
 'thi',
 'for',
 'learn',
 'purpos',
 'and',
 'help',
 'student',
 'when',
 'do',
 'ai',
 'task']

In [17]:
# remove rare words not always needed tho 
from nltk.probability import FreqDist
freq_dist = FreqDist(stemmed_tokens)
threshold = 2
common_tokens = [token for token in stemmed_tokens if freq_dist[token]>= threshold]
common_tokens

['i', 'ai', 'i', 'ai']

# Encoding 

In [None]:
import torch
genres = ['Fiction','Non-fiction','Biography', 'Children','Mystery']

# Define the size of the vocabulary
vocab_size = len(genres)

# Create one-hot vectors
one_hot_vectors = torch.eye(vocab_size)

# Create a dictionary mapping genres to their one-hot vectors
one_hot_dict = {genre: one_hot_vectors[i] for i, genre in enumerate(genres)}

for genre, vector in one_hot_dict.items():
    print(f'{genre}: {vector.numpy()}')

In [21]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

CountVect = CountVectorizer()
TfidfVect = TfidfVectorizer()
corpus = [
    "The quick brown fox jumps over the lazy dog.",
    "The sky is blue and the sun is shining brightly.",
    "I enjoy reading books and listening to music in my free time.",
    "Learning new things every day keeps life interesting.",
    "Coffee is my favorite drink, especially in the morning."
]

bagOfwords = CountVect.fit_transform(corpus)
vectorized_x = TfidfVect.fit_transform(corpus)
print(CountVect.get_feature_names_out()[:5])
print(bagOfwords.toarray()[0, :5])


array([[0.        , 0.        , 0.        , 0.        , 0.33721386,
        0.        , 0.        , 0.33721386, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.33721386, 0.        ,
        0.        , 0.        , 0.        , 0.33721386, 0.        ,
        0.33721386, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.33721386, 0.33721386,
        0.        , 0.        , 0.        , 0.        , 0.4516721 ,
        0.        , 0.        , 0.        ],
       [0.25451241, 0.31546157, 0.        , 0.31546157, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.50902482, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.31546157, 0.31546157, 0.31546157, 0.42253658,
   

# Preprocessing Pipeline:

## Helper functions 

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist

def encode_sentences(sentences):
    print("Preprocessed sentences:", sentences)
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(sentences)
    encoded_sentences = X.toarray()
    return encoded_sentences, vectorizer

def extract_sentences(data):
    sentences = re.findall(r'[A-Z][^.!?]*[.!?]', data)
    return sentences

def preprocess_sentences(sentences):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    processed_sentences = []
    for sentence in sentences:
        sentence = sentence.lower()
        tokens = word_tokenize(sentence)
        tokens = [token for token in tokens if token.lower() not in stop_words]
        tokens = [stemmer.stem(token) for token in tokens]
        # freq_dist = FreqDist(tokens)
        # threshold = 2
        # tokens = [token for token in tokens if freq_dist[token] > threshold]
        processed_sentences.append(' '.join(tokens))
    return processed_sentences


In [31]:
from torch.utils.data import Dataset, DataLoader
import torch

# Define a custom PyTorch dataset class
class TextDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def text_processing_pipeline(text):
    # Preprocess the text
    tokens = preprocess_sentences(text)
    
    # Encode the preprocessed sentences
    encoded_sentences, vectorizer = encode_sentences(tokens)
    
    # Create a PyTorch dataset
    dataset = TextDataset(encoded_sentences)
    
    # Create a dataloader
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
    
    return dataloader, vectorizer


In [32]:
text_data = "This is the first text data. And here is another one."
sentences = extract_sentences(text_data)
dataloader, vectorizer = text_processing_pipeline(sentences)
print(next(iter(dataloader))[0, :10])

Preprocessed sentences: ['first text data .', 'anoth one .']
tensor([0, 1, 1, 0, 1])


# CNN for Text

In [2]:
import torch
from torch import nn
words = ['the','cat','sat','on','the','mat']
word_to_idx = {word: idx for idx, word in enumerate((words))}
inputs = torch.LongTensor([word_to_idx[w] for w in words])

embedding = nn.Embedding(num_embeddings =len(words), embedding_dim=10)
output = embedding(inputs)

print(output)

tensor([[-0.2718, -1.0707,  0.4272,  0.7276, -0.1287,  0.6916,  1.3477,  1.9173,
         -0.6723, -0.1657],
        [ 0.0402,  1.5819, -0.9921, -1.3865,  0.4599, -1.2141,  0.0219, -1.0111,
          0.1189, -0.6732],
        [ 1.5348,  0.2376,  0.8647,  0.4961,  1.2731,  0.0022, -1.0568,  1.2796,
         -0.8611,  1.5722],
        [-0.5900, -0.0346,  0.0956, -0.6735, -1.2555, -0.5936,  1.4562,  0.5631,
          0.6267,  0.4761],
        [-0.2718, -1.0707,  0.4272,  0.7276, -0.1287,  0.6916,  1.3477,  1.9173,
         -0.6723, -0.1657],
        [-0.6252, -0.7574,  1.2452,  0.5137,  1.0739, -0.1121, -1.1330,  0.3104,
          1.2396,  0.6651]], grad_fn=<EmbeddingBackward0>)


in cnns a **filter** is a small matrix we slide over the input 
and **stride** is the number of positions the filter moves 

In [6]:
import torch.nn.functional as F
class SentimentAnalysisCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=1, padding=1)
        self.fc =  nn.Linear(embed_dim, 2)

    def forward(self, text):
        embedded = self.embedding(text).permute(0, 2, 1)
        conved = F.relu(self.conv(embedded))
        conved = conved.mean(dim=2)
        return self.fc(conved)


In [30]:
import torch.optim as optim
vocab = ["i","slow", "love","shallow","predictable", "this", "book", "do", "not","adored","absolutely","breathtaking", "like"]
word_to_idx = {word: i for i, word in enumerate(vocab)}
vocab_size = len(word_to_idx)
embed_dim = 10

book_samples = [
    ("The story was captivating and kept me hooked until the end.love".split(), 1),
    ("I found the characters shallow and the plot predictable.".split(), 0)
]

model = SentimentAnalysisCNN(vocab_size, embed_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)


In [31]:
# Sample sentences with labels
data = [
    ("The story was captivating and kept me hooked until the end.love".split(), 1),
    ("I found the characters shallow and the plot predictable.".split(), 0),
    ("This movie is a masterpiece of storytelling.".split(), 1),
    ("The acting was terrible and the dialogue felt forced.".split(), 0),
    ("I couldn't stop laughing throughout the entire film.love".split(), 1),
    ("The special effects were impressive, but the story lacked depth.".split(), 0),
    ("The soundtrack perfectly complemented the mood of the film.".split(), 1),
    ("The pacing was too slow and made the movie drag on.".split(), 0),
    ("I absolutely adored this book. It's a must-read!".split(), 1),
    ("The ending of the movie left me in tears.".split(), 1),
    ("The plot twists kept me on the edge of my seat.".split(), 1),
    ("The writing style was too verbose for my liking.".split(), 0),
    ("The cinematography in this film is breathtaking.".split(), 1),
    ("The dialogue felt stilted and unnatural.".split(), 0),
    ("The protagonist was relatable and well-developed.".split(), 1),
    ("The pacing of the story felt rushed and disjointed.".split(), 0),
    ("I couldn't put this book down until I finished it.".split(), 1),
    ("The special effects were unconvincing and cheesy.".split(), 0),
    ("The character development was shallow and one-dimensional.".split(), 0),
    ("The soundtrack added depth and emotion to the scenes.".split(), 1),
    ("I was disappointed by the lackluster ending.".split(), 0),
    ("The acting performances were stellar across the board.".split(), 1),
    ("The story had me guessing until the very end.".split(), 1),
    ("The dialogue was witty and engaging.".split(), 1),
    ("The pacing of the film was too slow for my taste.".split(), 0),
    ("The setting was vividly described and immersive.".split(), 1),
    ("The plot was predictable and clichéd.".split(), 0),
    ("The emotional depth of the characters resonated with me.".split(), 1)

]

# Printing the data
for sentence, label in data:
    print(sentence, label)

for epoch in range(10):
    for sentence, label in data:
        model.zero_grad()
        sentence = torch.LongTensor([word_to_idx.get(w, 0) for w in sentence]).unsqueeze(0)
        outputs = model(sentence)
        label = torch.LongTensor([int(label)])
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()


['The', 'story', 'was', 'captivating', 'and', 'kept', 'me', 'hooked', 'until', 'the', 'end.love'] 1
['I', 'found', 'the', 'characters', 'shallow', 'and', 'the', 'plot', 'predictable.'] 0
['This', 'movie', 'is', 'a', 'masterpiece', 'of', 'storytelling.'] 1
['The', 'acting', 'was', 'terrible', 'and', 'the', 'dialogue', 'felt', 'forced.'] 0
['I', "couldn't", 'stop', 'laughing', 'throughout', 'the', 'entire', 'film.love'] 1
['The', 'special', 'effects', 'were', 'impressive,', 'but', 'the', 'story', 'lacked', 'depth.'] 0
['The', 'soundtrack', 'perfectly', 'complemented', 'the', 'mood', 'of', 'the', 'film.'] 1
['The', 'pacing', 'was', 'too', 'slow', 'and', 'made', 'the', 'movie', 'drag', 'on.'] 0
['I', 'absolutely', 'adored', 'this', 'book.', "It's", 'a', 'must-read!'] 1
['The', 'ending', 'of', 'the', 'movie', 'left', 'me', 'in', 'tears.'] 1
['The', 'plot', 'twists', 'kept', 'me', 'on', 'the', 'edge', 'of', 'my', 'seat.'] 1
['The', 'writing', 'style', 'was', 'too', 'verbose', 'for', 'my', 'l

In [32]:
for sample in book_samples:
    input_tensor = torch.tensor([word_to_idx[w] if w in word_to_idx else 0 for w in sample[0]], dtype=torch.long).unsqueeze(0)

    outputs = model(input_tensor)
    _, predicted_label = torch.max(outputs.data, 1)
    print(print("the predicted label is ", predicted_label))
    sentiment = "Positive" if predicted_label.item() == 1 else "Negative"
    print(f"Book Review: {' '.join(sample[0])}")
    print(f"Sentiment: {sentiment}\n")


the predicted label is  tensor([1])
None
Book Review: The story was captivating and kept me hooked until the end.love
Sentiment: Positive

the predicted label is  tensor([0])
None
Book Review: I found the characters shallow and the plot predictable.
Sentiment: Negative



# RNNs for text

rnns are suitable of text classification they can read text as humans one word at a time allowing to capture context and order of words

sometimes the text may contain complex sentences with diffrent sentiments, lstms excels in them

GRU can quickly recognize spammy patterns without needing the full text 

In [33]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, text):
        self.text = text
        
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, idx):
        return self.text[idx]


In [34]:
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        _, (hidden, _) = self.lstm(x) 
        # the first represent the output of the lstm at each step
        # used in seq2seq and the second is the cell state
        output = self.fc(hidden.squeeze(0))
        return output


In [35]:
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUModel, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        _, hidden = self.gru(x)
        output = self.fc(hidden.squeeze(0))
        return output


In [None]:
lstm_model = LSTMModel(input_size, hidden_size, num_layers, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.01)

# Train the model by passing the correct parameters and zeroing the gradient
for epoch in range(10): 
    optimizer.zero_grad()
    outputs = lstm_model(X_train_seq)
    loss = criterion(outputs, y_train_seq)
    loss.backward()
    optimizer.step()
    print(f'Epoch: {epoch+1}, Loss: {loss.item()}')

In [None]:
outputs = rnn_model(X_test_seq)
_, predicted = torch.max(outputs, 1)

# Calculate the metrics
accuracy_score = accuracy(outputs, y_test_seq)
precision_score = precision(outputs, y_test_seq)
recall_score = recall(outputs, y_test_seq)
f1_score = f1(outputs, y_test_seq)
print("RNN Model - Accuracy: {}, Precision: {}, Recall: {}, F1 Score: {}".format(accuracy_score, precision_score, recall_score, f1_score))

In [None]:
# Create an instance of the metrics
accuracy = Accuracy(task="multiclass", num_classes=3)
precision = Precision(task="multiclass", num_classes=3)
recall = Recall(task="multiclass", num_classes=3)
f1 = F1Score(task="multiclass", num_classes=3)

# Calculate metrics for the LSTM model
accuracy_1 = accuracy(y_test, y_pred_lstm)
precision_1 = precision(y_test, y_pred_lstm)
recall_1 = recall(y_test, y_pred_lstm)
f1_1 = f1(y_test, y_pred_lstm)
print("LSTM Model - Accuracy: {}, Precision: {}, Recall: {}, F1 Score: {}".format(accuracy_1, precision_1, recall_1, f1_1))

# Calculate metrics for the GRU model
accuracy_2 = accuracy(y_test, y_pred_gru)
precision_2 = precision(y_test, y_pred_gru)
recall_2 = recall(y_test, y_pred_gru)
f1_2 = f1(y_test, y_pred_gru)
print("GRU Model - Accuracy: {}, Precision: {}, Recall: {}, F1 Score: {}".format(accuracy_2, precision_2, recall_2, f1_2))