In [13]:
import re
def tokenize(text):
    """Return the word tokens found in ``text``, in order of appearance.

    A token is a maximal run of regex word characters; whitespace and
    punctuation only act as separators and never appear in the result.

    Args:
        text: The string to split.

    Returns:
        A list of token strings (empty when ``text`` contains no word characters).
    """
    word_pattern = re.compile(r'\b\w+\b')
    return [match.group() for match in word_pattern.finditer(text)]

# Run the regex tokenizer on a sample sentence; the bare name on the last
# line makes the notebook display the resulting list.
tokens = tokenize(
    "i am ayoub, from ensia the national higher shchool of ai, I created this for learning purposes and help students when doing ai tasks"
)
tokens

['i',
 'am',
 'ayoub',
 'from',
 'ensia',
 'the',
 'national',
 'higher',
 'shchool',
 'of',
 'ai',
 'I',
 'created',
 'this',
 'for',
 'learning',
 'purposes',
 'and',
 'help',
 'students',
 'when',
 'doing',
 'ai',
 'tasks']

In [None]:
# Import the necessary functions
from torchtext.data.utils import get_tokenizer

# Tokenize a second sample with torchtext's "basic_english" tokenizer.
# The result is stored under its own name so that running this cell does not
# overwrite `tokens` from the regex-tokenizer cell, which the stop-word,
# stemming and frequency cells further down read.
text = "In the city of Dataville, a data analyst named Alex explores hidden insights within vast data. With determination, Alex uncovers patterns, cleanses the data, and unlocks innovation. Join this adventure to unleash the power of data-driven decisions."
tokenizer = get_tokenizer("basic_english")
torchtext_tokens = tokenizer(text)


In [14]:
from nltk.corpus import stopwords
# Collect NLTK's English stop-word list into a set for membership tests,
# then display it.
stop_words = {word for word in stopwords.words('english')}
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [15]:
# Keep only the tokens whose lowercase form is not an English stop word.
filtered_tokens = list(filter(lambda tok: tok.lower() not in stop_words, tokens))
filtered_tokens

['ayoub',
 'ensia',
 'national',
 'higher',
 'shchool',
 'ai',
 'created',
 'learning',
 'purposes',
 'help',
 'students',
 'ai',
 'tasks']

In [16]:
# Stemming: reduce each token to its stem
from nltk.stem import PorterStemmer
# Author's note: the Porter stemmer does not work for slang or dialects such as Darija.
stemmer = PorterStemmer()
# This stems the unfiltered `tokens`, so stop words are still present in the result.
stemmed_tokens = list(map(stemmer.stem, tokens))
stemmed_tokens

['i',
 'am',
 'ayoub',
 'from',
 'ensia',
 'the',
 'nation',
 'higher',
 'shchool',
 'of',
 'ai',
 'i',
 'creat',
 'thi',
 'for',
 'learn',
 'purpos',
 'and',
 'help',
 'student',
 'when',
 'do',
 'ai',
 'task']

In [17]:
# Remove rare words (optional; this step is not always needed)
from nltk.probability import FreqDist
# Count how often each stem occurs, then keep every occurrence of the stems
# that appear at least `threshold` times.
freq_dist = FreqDist(stemmed_tokens)
threshold = 2
common_tokens = list(filter(lambda stem: freq_dist[stem] >= threshold, stemmed_tokens))
common_tokens

['i', 'ai', 'i', 'ai']

# Encoding 

In [None]:
import torch
# Categories to encode.
genres = ['Fiction','Non-fiction','Biography', 'Children','Mystery']

# One vector dimension per genre.
vocab_size = len(genres)

# Row i of the identity matrix is the one-hot vector for genres[i].
one_hot_vectors = torch.eye(vocab_size)

# Pair each genre with its row (iterating a 2-D tensor yields its rows in order).
one_hot_dict = dict(zip(genres, one_hot_vectors))

for genre, vector in one_hot_dict.items():
    print(f'{genre}: {vector.numpy()}')

In [21]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Small demo corpus: one document per string.
corpus = [
    "The quick brown fox jumps over the lazy dog.",
    "The sky is blue and the sun is shining brightly.",
    "I enjoy reading books and listening to music in my free time.",
    "Learning new things every day keeps life interesting.",
    "Coffee is my favorite drink, especially in the morning."
]

# Bag-of-words counts for the corpus.
CountVect = CountVectorizer()
bagOfwords = CountVect.fit_transform(corpus)

# TF-IDF weights for the same corpus.
TfidfVect = TfidfVectorizer()
vectorized_x = TfidfVect.fit_transform(corpus)

# Show the first five vocabulary terms and their counts in the first document.
print(CountVect.get_feature_names_out()[:5])
print(bagOfwords.toarray()[0, :5])


array([[0.        , 0.        , 0.        , 0.        , 0.33721386,
        0.        , 0.        , 0.33721386, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.33721386, 0.        ,
        0.        , 0.        , 0.        , 0.33721386, 0.        ,
        0.33721386, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.33721386, 0.33721386,
        0.        , 0.        , 0.        , 0.        , 0.4516721 ,
        0.        , 0.        , 0.        ],
       [0.25451241, 0.31546157, 0.        , 0.31546157, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.50902482, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.31546157, 0.31546157, 0.31546157, 0.42253658,
   

# Preprocessing Pipeline:

## Helper functions 

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist

def encode_sentences(sentences):
    """Encode sentences as a dense bag-of-words count matrix.

    Prints the sentences it receives, fits a fresh ``CountVectorizer`` on
    them and converts the fitted result with ``toarray()``.

    Args:
        sentences: The strings to vectorize.

    Returns:
        A ``(encoded_sentences, vectorizer)`` tuple: the dense array and the
        fitted vectorizer that produced it.
    """
    print("Preprocessed sentences:", sentences)
    bow_vectorizer = CountVectorizer()
    counts = bow_vectorizer.fit_transform(sentences).toarray()
    return counts, bow_vectorizer

def extract_sentences(data):
    """Split ``data`` into sentences.

    A sentence starts at the first character that is neither whitespace nor a
    terminator and runs up to and including the next run of ``.``, ``!`` or
    ``?`` (or to the end of the text when no terminator follows).

    Unlike a pattern anchored on an uppercase first letter, this keeps
    sentences that start in lowercase and a trailing sentence that lacks a
    terminator, instead of silently dropping or truncating them.

    Args:
        data: The raw text; ``None`` or an empty string yields ``[]``.

    Returns:
        A list of sentence strings without leading whitespace.
    """
    if not data:
        return []
    # Strip first so an unterminated last sentence does not carry trailing whitespace.
    return re.findall(r'[^.!?\s][^.!?]*(?:[.!?]+|$)', data.strip())

def preprocess_sentences(sentences):
    """Lowercase, tokenize, remove English stop words and stem each sentence.

    Args:
        sentences: The sentence strings to clean.

    Returns:
        A list with one string per input sentence: the remaining stems
        joined by single spaces.
    """
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    def _clean(sentence):
        # Tokenize the lowercased sentence, then filter and stem in one pass.
        words = word_tokenize(sentence.lower())
        stems = [porter.stem(word) for word in words if word.lower() not in english_stop_words]
        return ' '.join(stems)

    # Rare-word filtering (FreqDist threshold) is not applied in this pipeline.
    return [_clean(sentence) for sentence in sentences]


In [31]:
from torch.utils.data import Dataset, DataLoader
import torch

# Define a custom PyTorch dataset class
class TextDataset(Dataset):
    """Minimal map-style dataset that wraps an indexable collection.

    Items are returned exactly as stored; no conversion is applied here.
    """

    def __init__(self, data):
        """Store ``data``, the indexable collection of samples."""
        self.data = data

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.data[idx]

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

def text_processing_pipeline(text, batch_size=2, shuffle=True):
    """Preprocess sentences, encode them and wrap the result in a DataLoader.

    Args:
        text: The sentences to process. Despite the name, this is passed
            straight to ``preprocess_sentences``, which iterates over it
            sentence by sentence.
        batch_size: Number of encoded sentences per batch. Defaults to 2,
            the value previously hard-coded.
        shuffle: Whether the DataLoader shuffles the samples. Defaults to
            True, the value previously hard-coded.

    Returns:
        A ``(dataloader, vectorizer)`` tuple: the DataLoader over the encoded
        sentences and the vectorizer returned by ``encode_sentences``.
    """
    # Clean each sentence (lowercase, stop-word removal, stemming).
    processed_sentences = preprocess_sentences(text)

    # Turn the cleaned sentences into count vectors.
    encoded_sentences, vectorizer = encode_sentences(processed_sentences)

    # Wrap the encoded rows so they can be batched.
    dataset = TextDataset(encoded_sentences)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    return dataloader, vectorizer


In [32]:
# Seed PyTorch's global RNG so the shuffled DataLoader yields the same first
# batch on every run of this cell.
torch.manual_seed(0)

text_data = "This is the first text data. And here is another one."
sentences = extract_sentences(text_data)
dataloader, vectorizer = text_processing_pipeline(sentences)
# First row of the first batch, limited to at most 10 vocabulary columns.
print(next(iter(dataloader))[0, :10])

Preprocessed sentences: ['first text data .', 'anoth one .']
tensor([0, 1, 1, 0, 1])


['This is the first text data.', 'And here is another one.']