<a href="https://colab.research.google.com/github/Hadeel-77/LLM/blob/main/Full_Fine_Tunning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers datasets tensorflow scikit-learn


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Lambda





In [None]:
from google.colab import drive

# Mount Google Drive so files stored there are visible under /content/drive.
drive.mount('/content/drive')

# Read the tweets CSV from the mounted drive into a DataFrame.
tweets_csv_path = '/content/drive/MyDrive/Colab Notebooks/Tweets.csv'
df = pd.read_csv(tweets_csv_path)

# Step 1: Data Preprocessing

In [None]:


# Keep only the rows whose sentiment is positive or negative (every other
# sentiment value is dropped) and renumber the index from 0.
df = df[df['airline_sentiment'].isin(['positive', 'negative'])].reset_index(drop=True)

# Encode the sentiment strings as integers. LabelEncoder assigns codes in
# sorted class order, so negative=0 and positive=1.
le = LabelEncoder()
df['label'] = le.fit_transform(df['airline_sentiment'])

# Hold out 20% of the rows for testing. `stratify` keeps the
# negative/positive ratio the same in the train and test splits, so the test
# metrics are not skewed by an unlucky draw on an imbalanced dataset.
train_texts, test_texts, y_train, y_test = train_test_split(
    df['text'].tolist(),
    df['label'].values,
    test_size=0.2,
    random_state=42,
    stratify=df['label'].values,
)


# Step 2: Text Tokenization

In [None]:
# Load the tokenizer that matches the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are tokenized with exactly the same settings:
#   * padding='max_length' - texts shorter than max_length get [PAD] tokens
#     appended so every sequence has the same length;
#   * truncation=True      - texts longer than max_length are cut off at
#     max_length tokens;
#   * return_tensors='tf'  - the encodings come back as TensorFlow tensors.
encode_kwargs = dict(
    truncation=True,
    padding='max_length',
    max_length=128,
    return_tensors='tf',
)

train_encodings = tokenizer(train_texts, **encode_kwargs)
test_encodings = tokenizer(test_texts, **encode_kwargs)

# Step 3: Load the Model, Feed It the Inputs & Extract the [CLS] Token Embedding

In [None]:
# Load the pre-trained Hugging Face BERT base (uncased) model. TFBertModel has
# no classification head; it is used here as a feature extractor, and our own
# classifier is added on top of its output.

bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Helper that extracts the [CLS] embedding from BERT's output

def bert_encode(inputs):
    """Run BERT and return the hidden state of the first token of each sequence.

    Args:
        inputs: A pair ``(input_ids, attention_mask)`` that is forwarded to the
            module-level ``bert_model``.

    Returns:
        ``last_hidden_state[:, 0, :]`` - the vector at position 0 (the [CLS]
        token) of every sequence, used as a summary embedding of the sentence.
    """
    token_ids, mask = inputs
    hidden_states = bert_model(input_ids=token_ids, attention_mask=mask).last_hidden_state
    return hidden_states[:, 0, :]

# Symbolic Keras inputs: one row of 128 int32 token ids and one matching
# attention mask per example.
input_ids = Input(name='input_ids', shape=(128,), dtype=tf.int32)
attention_mask = Input(name='attention_mask', shape=(128,), dtype=tf.int32)

# Wrap the BERT call in a Lambda layer that yields a 768-dimensional vector
# per example.
# NOTE(review): Keras Lambda layers are meant for stateless computation, so
# bert_model's weights are presumably NOT tracked as trainable weights of the
# final model (only the head would be trained) - confirm with model.summary()
# if full fine-tuning is intended.
cls_layer = Lambda(bert_encode, output_shape=(768,))
cls_token = cls_layer([input_ids, attention_mask])

# Step 4: Add Our Own Classification Layers to Perform the Specialized Task

In [None]:
# Classification head stacked on top of the [CLS] embedding.
# Dropout (30%) on the embedding to reduce overfitting.
x = Dropout(rate=0.3)(cls_token)
# Hidden layer with 64 ReLU neurons to learn task-specific features.
x = Dense(units=64, activation='relu')(x)
# Dropout (20%) on the intermediate layer, again to reduce overfitting.
x = Dropout(rate=0.2)(x)
# Single sigmoid neuron: outputs a value between 0 and 1 for the binary label.
output = Dense(units=1, activation='sigmoid')(x)

# Step 5 : Build, Compile & Train The Model

In [None]:
# Assemble the end-to-end model: token ids + attention mask in, probability out.
model = Model(inputs=[input_ids, attention_mask], outputs=output)

# Binary cross-entropy matches the single sigmoid output; accuracy is reported
# during training and evaluation.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    metrics=['accuracy'],
)

# Feed the tokenized training split by input name.
train_inputs = {
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
}

# Train for 3 epochs in batches of 32, holding out 10% of the training data
# for validation.
model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=32,
    epochs=3,
    validation_split=0.1,
)

# Step 6 : Evaluate The Model

In [None]:
# Score the trained model on the held-out test split.
test_inputs = {
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
}
loss, accuracy = model.evaluate(x=test_inputs, y=y_test)

print(f"\n✅ Test Accuracy: {accuracy:.4f}")



# Step 7 : Build Prediction Function

In [None]:
def predict_sentiment(text, tokenizer, model, max_length=128, threshold=0.5):
    """Classify one piece of text as "positive" or "negative".

    Args:
        text: The raw text to classify.
        tokenizer: Callable tokenizer. It is called with ``text`` plus the
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``
            keyword arguments and must return a mapping that contains
            ``'input_ids'`` and ``'attention_mask'``.
        model: Object whose ``predict`` method accepts a dict with the keys
            ``'input_ids'`` and ``'attention_mask'`` and returns an array-like
            whose ``[0][0]`` element is the predicted probability.
        max_length: Length every input is padded/truncated to. Defaults to
            128; it has to match the sequence length the model expects.
        threshold: Probabilities greater than or equal to this value are
            labelled "positive". Defaults to 0.5.

    Returns:
        A ``(label, prob)`` tuple: the label string and the probability as a
        Python float.
    """
    # Tokenize with the same padding/truncation scheme used for training.
    encoding = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )

    # Run the model on the single encoded example.
    prediction = model.predict({
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask']
    })

    # The first row, first column holds the probability for this text.
    prob = float(prediction[0][0])

    # Turn the probability into a label using the decision threshold.
    label = "positive" if prob >= threshold else "negative"
    return label, prob


In [None]:
example_1 = "I love flying with this airline. Always a great experience!"
example_2 = "Terrible service, I will never book with them again."

# Each call prints a (label, probability) tuple; example_1 is written as a
# positive review and example_2 as a negative one.
for sample_text in (example_1, example_2):
    print(predict_sentiment(sample_text, tokenizer, model))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step
('positive', 0.5715762376785278)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
('negative', 0.15400893986225128)
