<a href="https://colab.research.google.com/github/MainKlain/myFirstModelinPython/blob/main/HuggingFaceModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Science with Python


In [None]:
from google.colab import drive
drive.mount('/content/drive/')

import pandas as pd
import tensorflow as tf
from tensorflow import keras
activations = keras.activations
losses = keras.losses
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import pickle

# Load the SMS spam corpus: a tab-separated file without a header row,
# whose two columns are named 'label' and 'message' here
df = pd.read_csv(
    '/content/drive/MyDrive/Python/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
)

# Report (rows, columns) as a quick sanity check on the load
print(df.shape)

# Replace the textual labels with integer class ids: 'ham' -> 0, 'spam' -> 1,
# since the classifier is trained on numeric targets
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Show the first rows to confirm the labels were converted
print(df.head())

# Model inputs (message texts) and targets (class ids) as plain Python lists
x, y = list(df['message']), list(df['label'])

# Checkpoint to fine-tune, and the maximum number of tokens kept per message
MODEL_NAME = 'distilbert-base-uncased'
MAX_LEN = 20

# Tokenizer that matches the chosen DistilBERT checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Tokenize a single sample (the first message), truncating it to MAX_LEN tokens,
# to illustrate what the tokenizer produces
message = x[0]
inputs = tokenizer(message, max_length=MAX_LEN, truncation=True, padding=True)

# Show the raw text next to its token ids and attention mask
print(f'message: \'{message}\'')
print(f'input ids: {inputs["input_ids"]}')
print(f'attention mask: {inputs["attention_mask"]}')

def construct_encodings(x, tkzr, max_len, truncation=True, padding=True):
    """
    Encode input sequences by calling the supplied tokenizer.

    Args:
    - x: Input sequences to be encoded.
    - tkzr: Tokenizer object; it is called as
      `tkzr(x, max_length=..., truncation=..., padding=...)`.
    - max_len: Value forwarded to the tokenizer as `max_length`.
    - truncation: Value forwarded to the tokenizer as `truncation`.
    - padding: Value forwarded to the tokenizer as `padding`.

    Returns:
    - Whatever the tokenizer returns for `x` (the encodings).

    """
    # Collect the tokenizer options first, then make a single call
    tokenizer_options = {
        'max_length': max_len,
        'truncation': truncation,
        'padding': padding,
    }
    return tkzr(x, **tokenizer_options)

# Tokenize every message in the dataset with the shared tokenizer, capping
# each one at MAX_LEN tokens, and print the result for inspection
encodings = construct_encodings(x, tokenizer, MAX_LEN)
print("This are the encodings:", encodings)

# Define a function to construct a TensorFlow Dataset from token encodings
# and labels (if provided)
def construct_tfdataset(encodings, y=None):
    """
    Build a `tf.data.Dataset` from token encodings and, optionally, labels.

    Args:
    - encodings: Mapping-like tokenizer output; it is converted with `dict()`
      before being sliced.
    - y: Labels aligned with the encodings, or None when no labels are
      available (e.g. when predicting on unseen samples).

    Returns:
    - A dataset of `(features, label)` pairs when `y` is given, otherwise a
      dataset of features only.

    """
    features = dict(encodings)

    # Compare against None explicitly instead of relying on truthiness:
    # `if y:` raises for array-like labels (ambiguous truth value) and would
    # silently drop the labels when an empty sequence is passed.
    if y is None:
        # Inference/prediction: features only
        return tf.data.Dataset.from_tensor_slices(features)

    # Training/evaluation: pair every feature dict with its label
    return tf.data.Dataset.from_tensor_slices((features, y))

# Build the labelled dataset from the token encodings and the class ids
tfdataset = construct_tfdataset(encodings, y)
print("This are the Tensorflow dataset:", tfdataset)

# Define the ratio of the dataset to be used for testing
TEST_SPLIT = 0.2

# Define the batch size for training and testing data
BATCH_SIZE = 32

# Calculate the size of the training set based on the test split ratio
train_size = int(len(x) * (1 - TEST_SPLIT))

# Shuffle the dataset ONCE with a buffer covering the whole dataset.
# By default shuffle() draws a new order every time the dataset is iterated,
# so the take()/skip() split below would pick different examples on every
# epoch and again at evaluation time, leaking test examples into training.
# reshuffle_each_iteration=False keeps the same order on every pass so the
# train and test sets stay disjoint.
tfdataset = tfdataset.shuffle(len(x), reshuffle_each_iteration=False)

# Split the dataset into training and testing sets based on the calculated size
tfdataset_train = tfdataset.take(train_size)  # First `train_size` samples for training
tfdataset_test = tfdataset.skip(train_size)   # Remaining samples for testing

# Batch the training and testing datasets using the specified batch size
tfdataset_train = tfdataset_train.batch(BATCH_SIZE)
tfdataset_test = tfdataset_test.batch(BATCH_SIZE)

# Define the number of epochs for training
N_EPOCHS = 5

# Load the pre-trained DistilBERT model for sequence classification
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)

# The model's outputs are consumed as raw logits, so the loss is told to
# apply the softmax itself (from_logits=True); labels are integer class ids
loss = losses.SparseCategoricalCrossentropy(from_logits=True)

# Compile the model with the specified loss function and evaluation metric
# NOTE(review): no optimizer is passed, so the compile() default is used;
# consider an explicit optimizer with a small learning rate for fine-tuning.
model.compile(loss=loss, metrics=['accuracy'])

# Train the model on the training dataset.
# The dataset is already batched with BATCH_SIZE, so `batch_size` is not
# passed here: Keras documents that it must not be specified for dataset input.
model.fit(tfdataset_train, epochs=N_EPOCHS)

# Evaluate the trained model on the held-out test dataset.
# return_dict=True returns the metrics as a {name: value} dictionary.
# As with fit(), batching comes from the dataset itself, so no `batch_size`
# is passed.
benchmarks = model.evaluate(tfdataset_test, return_dict=True)

# Print the evaluation results
print(benchmarks)

# Define a function to create a predictor for text classification
def create_predictor(model, model_name, max_len):
    """
    Create a function that scores a single text with the given model.

    Args:
    - model: Sequence-classification model whose `predict()` output exposes
      `.logits`.
    - model_name: Name or path used to load the matching DistilBERT tokenizer.
    - max_len: Maximum number of tokens per input, forwarded to the tokenizer.

    Returns:
    - A function `predict_proba(text)` returning the predicted probability of
      class 1, i.e. the positive class ('spam' under the {'ham': 0, 'spam': 1}
      label mapping used in this file).

    """
    # Load the tokenizer once; the nested function reuses it on every call
    tkzr = DistilBertTokenizer.from_pretrained(model_name)

    def predict_proba(text):
        """Return the probability that `text` belongs to class 1."""
        # Wrap the single text in a list so it is encoded as a batch of one
        encodings = construct_encodings([text], tkzr, max_len=max_len)

        # Unlabelled dataset, batched with a batch size of 1
        tfdataset = construct_tfdataset(encodings).batch(1)

        # Raw, unnormalized class scores for the input
        logits = model.predict(tfdataset).logits

        # Softmax turns the logits into per-class probabilities
        probs = activations.softmax(tf.convert_to_tensor(logits)).numpy()

        # Row 0 is the only sample; column 1 is the positive class.
        # (Column 0 would be the probability of class 0, the negative class.)
        return probs[0][1]

    # Return the nested prediction function
    return predict_proba

# Wrap the freshly trained model in a predictor that tokenizes with the same
# checkpoint name and maximum sequence length used for training
clf = create_predictor(model, MODEL_NAME, MAX_LEN)

# Score one sample text and print the predicted probability
sample_text = 'New Job opportunity for you in Dubai'
print(clf(sample_text))

# Save the trained model to the specified directory
model.save_pretrained('./model/clf')

# Save the metadata needed to rebuild the predictor (model name and maximum
# sequence length) next to the model, as a pickled tuple
with open('./model/info.pkl', 'wb') as f:
    pickle.dump((MODEL_NAME, MAX_LEN), f)

# Load the fine-tuned DistilBERT model back from the directory written above
new_model = TFDistilBertForSequenceClassification.from_pretrained('./model/clf')

# Load the model name and maximum sequence length from the pickled file.
# A `with` block is used so the file handle is closed deterministically.
# NOTE: pickle must only be used on trusted files; this one was written by
# this same script just above.
with open('./model/info.pkl', 'rb') as f:
    model_name, max_len = pickle.load(f)


# Create a predictor function using the loaded model, model name,
# and maximum sequence length
clf = create_predictor(new_model, model_name, max_len)

# Test the predictor function with a sample text input
print(clf('NEw Job opportunity for you in Dubai'))




Mounted at /content/drive/
(5572, 2)
   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

message: 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
input ids: [101, 2175, 2127, 18414, 17583, 2391, 1010, 4689, 1012, 1012, 2800, 2069, 1999, 11829, 2483, 1050, 2307, 2088, 2474, 102]
attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
This are the encodings: {'input_ids': [[101, 2175, 2127, 18414, 17583, 2391, 1010, 4689, 1012, 1012, 2800, 2069, 1999, 11829, 2483, 1050, 2307, 2088, 2474, 102], [101, 7929, 2474, 2099, 1012, 1012, 1012, 16644, 15536, 2546, 1057, 2006, 2072, 1012, 1012, 1012, 102, 0, 0, 0], [101, 2489, 4443, 1999, 1016, 1037, 1059, 2243, 2135, 4012, 2361, 2000, 2663, 6904, 2452, 2345, 1056, 25509, 2015, 102], [101, 1057, 24654, 2360, 2061, 2220, 7570, 2099, 1012, 1012, 1012, 1057, 1039, 2525, 2059, 2360, 1012, 1012, 1012, 102], [101, 20976, 1045, 2123, 1005, 1056, 2228, 2002, 3632, 2000, 2149, 2546, 1010, 2002, 3268, 2105, 2182, 2295, 102, 0], [101, 2489, 5244, 2290, 4931, 20

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
{'loss': 0.3970646858215332, 'accuracy': 0.8717488646507263}
0.8090471


Some layers from the model checkpoint at ./model/clf were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at ./model/clf and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.8090471
