<a href="https://colab.research.google.com/github/Hadeel-77/LLM/blob/main/Learning_From_The_Scratch_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install numpy==1.26.4


In [None]:
import spacy as sc
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from spacy.util import minibatch
from spacy.training.example import Example
import random



In [None]:
!pip install -U spacy
!pip install -U spacy[transformers]


In [None]:
# Download the transformer-based English pipeline (en_core_web_trf).
# NOTE(review): the pipeline built later in this notebook starts from
# spacy.blank("en"), so this download looks unused here — confirm before keeping it.
!python -m spacy download en_core_web_trf

In [None]:
# Force-restart the runtime by sending signal 9 to the kernel's own process,
# presumably so that the packages installed above are picked up on the next import.
# Everything imported or defined before this cell is lost when the process dies
# and has to be imported/defined again afterwards.
import os
os.kill(os.getpid(), 9)


In [None]:
# The runtime is killed and restarted in the previous cell, which wipes every
# earlier import. The modules used from here on are therefore imported again so
# that running the notebook top to bottom does not fail with NameError.
import random

import pandas as pd
from sklearn.model_selection import train_test_split
from spacy.training.example import Example
from spacy.util import minibatch

from google.colab import drive

# Mount Google Drive (Colab only) and load the airline-tweets CSV stored on it.
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Tweets.csv')

In [None]:
# Preview the first rows to confirm the CSV loaded with the expected columns.
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


# Prepare The Data

In [None]:
# Keep only the positive and negative tweets; neutral ones are out of scope.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below is unambiguous and does not raise SettingWithCopyWarning.
df = df[df["airline_sentiment"] != "neutral"].copy()

# Encode the string labels as integers (negative -> 0, positive -> 1) so they
# can easily be turned into spaCy's True/False category dictionary later on.
df["airline_sentiment"] = df["airline_sentiment"].map({"negative": 0, "positive": 1})

In [None]:
# List the distinct label values after the mapping: only 0 and 1 should appear.
# A NaN here would mean some label was not covered by the mapping dictionary.
df["airline_sentiment"].unique()

array([1, 0])

In [None]:
# Rows per class: the classes are imbalanced (far more negative than positive tweets).
df["airline_sentiment"].value_counts()


Unnamed: 0_level_0,count
airline_sentiment,Unnamed: 1_level_1
0,9178
1,2363


In [None]:
from sklearn.utils import resample

# The negative class is roughly four times larger than the positive one, so it
# is randomly undersampled down to the positive class size to balance the data.
df_positive = df.loc[df["airline_sentiment"].eq(1)]
df_negative = df.loc[df["airline_sentiment"].eq(0)]

# Draw, without replacement, exactly as many negative rows as there are positive
# rows; the fixed seed makes the draw repeatable.
df_negative_downsampled = resample(
    df_negative,
    n_samples=df_positive.shape[0],
    replace=False,
    random_state=42,
)

# Positive rows first, followed by the sampled negative rows.
df_balanced = pd.concat([df_positive, df_negative_downsampled], axis=0)


In [None]:
# Hold out 20% of the balanced tweets for testing; the fixed seed keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["text"],
    df_balanced["airline_sentiment"],
    test_size=0.2,
    random_state=42,
)

# Learning From The Scratch  

1. Customize the configuration of your model, which will contain all the pipeline components (embedding initialization & randomly assigned weights)
2.   Create a new pipeline - no pretrained model
3.   Convert the training data into the format spaCy expects

*  bool(1) → True
*  not bool(1)→ False
4. Start the learning process using the optimizer spaCy creates by default (Adam; it is passed to `nlp.update` through the argument named `sgd`)
5. Test & Evaluate




In [None]:
# Settings for the "textcat" component. The dictionary is passed as `config=`
# when the component is added to the pipeline.
config = {
    # Score cutoff of 0.5 for treating a category as predicted.
    # NOTE(review): the two classes here are mutually exclusive, so the label with
    # the larger score wins anyway — confirm whether this setting has any effect
    # for a single-label text classifier.
    "threshold": 0.5,

    # TextCatEnsemble.v2 stacks two models: 1- a linear bag-of-words model,
    # 2- a neural network built on top of the tok2vec layer.
    "model": {
        "@architectures": "spacy.TextCatEnsemble.v2",

        # Linear n-gram bag-of-words model: it scores a text from the n-grams that
        # occur in it, ignoring word order. It works on its own sparse n-gram
        # features and does not share the embedding defined inside tok2vec.
        "linear_model": {
            "@architectures": "spacy.TextCatBOW.v2",

            # Either positive or negative: exactly one label is true per input.
            "exclusive_classes": True,

            # Only use unigrams (single words).
            "ngram_size": 1,

            # False => the model adds its own output layer; otherwise we would
            # have to supply a custom one.
            "no_output_layer": False,
        },

        # Trainable neural feature extractor: embeds each token, then encodes it
        # together with its context.
        "tok2vec": {
            "@architectures": "spacy.Tok2Vec.v2",
            "embed": {
                "@architectures": "spacy.MultiHashEmbed.v2",

                # Embedding vector size.
                "width": 96,

                # Number of hash-table rows per attribute, in the same order as
                # "attrs"; each attribute is embedded independently in its own table.
                "rows": [2000, 2000, 100, 100, 100],

                # Token attributes that get embedded (tables start from random weights):
                # NORM, LOWER: normalized & lowercase text
                # PREFIX, SUFFIX: subword patterns
                # SHAPE: capitalization, punctuation patterns
                "attrs": ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"],

                # Required by MultiHashEmbed.v2. False => no pretrained word vectors
                # are mixed in, which is what learning from scratch needs. Stated
                # explicitly so the config does not depend on being merged with the
                # component's default settings.
                "include_static_vectors": False,
            },

            # After embedding each token on its own, encode it in its context.
            "encode": {
                "@architectures": "spacy.MaxoutWindowEncoder.v2",

                # Must match the embedding width above.
                "width": 96,
                # Number of neighbouring tokens looked at on each side.
                "window_size": 4,
                # Maxout activation: takes the maximum over this many linear pieces.
                "maxout_pieces": 3,
                # Number of stacked encoder layers.
                "depth": 4,
            },
        },
    },
}

In [None]:
import spacy

# Start from a blank English pipeline (no pretrained model) and attach a text
# classifier configured by the `config` dictionary defined earlier.
nlp = spacy.blank("en") # blank English pipeline for our customized text classification model
textcat = nlp.add_pipe("textcat", config=config)
# Register the two categories; the "cats" dictionaries in the training data use
# these same names as keys.
textcat.add_label("positive")
textcat.add_label("negative")

1

In [None]:
# Pair every training text with the annotation dictionary spaCy expects:
# {"cats": {...}} holding one True/False entry per label. The numeric label is
# mapped back to booleans: 1 -> positive True / negative False, 0 -> the reverse.
train_data = [
    (text, {"cats": {"positive": bool(label), "negative": not bool(label)}})
    for text, label in zip(X_train, y_train)
]

# Randomise the order so training does not depend on how the rows were arranged.
random.shuffle(train_data)

# Set up the model’s weights and internal state .

In [None]:
# Initialise the model weights and get the optimizer that nlp.update() uses to
# adjust them. nlp.initialize() is the spaCy v3 name for this step;
# nlp.begin_training() is only kept as a deprecated alias for it.
optimizer = nlp.initialize()

# 10 passes (epochs) over the training data; the weights are refined on each pass.
for epoch in range(10):
    # Accumulates the loss of every batch in this epoch, keyed by component name.
    losses = {}
    # Re-shuffle every epoch so the batches differ from one pass to the next.
    random.shuffle(train_data)
    # Mini-batches of 8 (text, annotation) pairs each.
    for batch in minibatch(train_data, size=8):
        # Wrap each pair in an Example (tokenised text + gold categories), which
        # is the object the model is trained on.
        examples = [
            Example.from_dict(nlp.make_doc(text), annotation)
            for text, annotation in batch
        ]

        # Forward pass -> compute loss -> backpropagation -> weight update,
        # with a dropout rate of 0.2.
        nlp.update(examples, sgd=optimizer, drop=0.2, losses=losses)

    print(f"Epoch {epoch + 1}: Loss = {losses['textcat']}")

Epoch 1: Loss = 72.20071495667798
Epoch 2: Loss = 44.458150856997236
Epoch 3: Loss = 30.610034069625897
Epoch 4: Loss = 22.760255583361722
Epoch 5: Loss = 16.140272661297754
Epoch 6: Loss = 12.053385207010683
Epoch 7: Loss = 9.982947745847525
Epoch 8: Loss = 7.8311275366980215
Epoch 9: Loss = 7.09843571543607
Epoch 10: Loss = 8.40094200764756


In [None]:
# Quick check on a new negative sentence: doc.cats holds the score of each label.
doc = nlp("The staff didn't do their job properly")
print(doc.cats)

{'positive': 1.5877827763688401e-06, 'negative': 0.9999984502792358}


In [None]:
# Quick check on a new positive sentence.
# NOTE(review): the recorded output scores this sentence as negative (~0.99999),
# so the model does not generalise to this input — evaluate on X_test / y_test
# before relying on its predictions.
doc = nlp("i liked the service")
print(doc.cats)

{'positive': 8.344089110323694e-06, 'negative': 0.9999916553497314}
