<a href="https://colab.research.google.com/github/GtmAadarshaOmega/Facial-Expression-Recognition-FER-/blob/main/Lekhooa_David_Montoeli_Chatbot_gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Necessary Libraries

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re

from nltk.stem import PorterStemmer
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords

# Fetch the NLTK resources that the preprocessing cells below rely on.
nltk.download('wordnet')    # used by the `wn.synsets(...)` synonym lookup
nltk.download('stopwords')  # used by `stopwords.words('english')`
nltk.download('punkt')      # tokenizer models; presumably what `word_tokenize` needs

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Data Preprocessing:

Removing Numbers and Special Characters: You can further clean the text by removing numbers and special characters that might not be relevant to the intent recognition task.

Stemming or Lemmatization: Depending on your use case, you can apply stemming or lemmatization to reduce words to their base forms.

Handling Linguistic Flags:
Depending on the linguistic flags, you can adapt responses in more complex ways. For example, for flag 'L' (Lexical variation - synonyms), you can replace words with synonyms using a thesaurus library like NLTK's WordNet.

Saving Preprocessed Data in JSON Format: Instead of saving the preprocessed data as a CSV, you can save it in JSON format, which is more flexible for handling nested data structures if needed.

In [None]:
# Load the raw intent-recognition CSV (point the path at your own dataset).
dataset_path = '/Intentrecognition.csv'
data = pd.read_csv(dataset_path)


# Data Cleaning
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, read only once."""
    return frozenset(stopwords.words('english'))


def clean_text(text):
    """Normalise one utterance for intent recognition.

    Steps, in order: strip ASCII punctuation, lowercase, remove digit runs,
    tokenise, drop English stop words, Porter-stem each remaining token and
    join the result with single spaces.

    Args:
        text: The raw utterance. Non-string values (for example the float
            NaN pandas produces for an empty CSV cell) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        The cleaned utterance; ``''`` when the input is not a string.
    """
    if not isinstance(text, str):
        # Missing cells are not strings and have no ``translate`` method.
        return ''

    # Remove punctuation, lowercase, then drop numbers.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = re.sub(r'\d+', '', text)

    # The stop-word set is cached: this function runs once per dataset row,
    # and re-reading the corpus file every time is wasted work.
    stop_words = _english_stop_words()
    stemmer = PorterStemmer()
    stems = [
        stemmer.stem(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(stems)

# Clean every utterance once, then keep a token-list view of the cleaned text.
cleaned_utterances = data['utterance'].map(clean_text)
data['utterance_cleaned'] = cleaned_utterances
data['tokens'] = cleaned_utterances.map(word_tokenize)

# Linguistic Flags and Response Adaptation
def adapt_response(text, flags):
    """Adapt a response according to the row's linguistic flags.

    * ``'P'`` replaces the text with a thank-you message.
    * ``'Q'`` replaces the text with an "ask away" message (wins over 'P').
    * ``'L'`` rewrites each non-stop-word token of the resulting text with
      the first lemma of its first WordNet synset, when one exists.

    Args:
        text: The utterance (or response) to adapt.
        flags: A container of flag letters, normally a string such as
            ``"BL"``. ``None`` and float NaN (an empty CSV cell) are
            treated as "no flags" instead of raising ``TypeError``.

    Returns:
        The adapted text.
    """
    if flags is None or isinstance(flags, float):
        flags = ''

    if 'P' in flags:
        text = "Thank you for your question."
    if 'Q' in flags:
        text = "Sure thing, ask away!"

    if 'L' in flags:
        # Build the stop-word set once per call; the previous code rebuilt
        # the whole list for every token.
        stop_words = frozenset(stopwords.words('english'))

        def synonym_replacement(tokens):
            """Swap each content word for its first WordNet lemma."""
            new_tokens = []
            for token in tokens:
                # The stop-word list is lowercase, so compare
                # case-insensitively; otherwise capitalised stop words
                # ("I", "It", ...) reach WordNet and get rewritten.
                if token.lower() in stop_words:
                    new_tokens.append(token)
                    continue
                synonyms = wn.synsets(token)
                if synonyms:
                    new_tokens.append(synonyms[0].lemmas()[0].name())
                else:
                    new_tokens.append(token)
            return new_tokens

        text = ' '.join(synonym_replacement(word_tokenize(text)))

    return text

# Compute the adapted reply for every row from its utterance and flags.
def _row_response(row):
    return adapt_response(row['utterance'], row['flags'])


data['bot_response'] = data.apply(_row_response, axis=1)

# Persist the preprocessed frame as JSON Lines (one record per line).
output_path = 'preprocessed_dataset.json'
data.to_json(output_path, orient='records', lines=True)


Code Summary:

Data Loading: The code begins by loading the preprocessed dataset in JSON format using pd.read_json.

Tokenizer Definition: A tokenizer specific to the transformer-based model (e.g., GPT-2) is defined using transformers.GPT2Tokenizer.from_pretrained.

Data Splitting: The dataset is split into training, validation, and testing sets using train_test_split from sklearn.model_selection.

Text Encoding: An encode_text function is defined to tokenize and encode text using the transformer model's tokenizer. It handles padding, truncation, and other necessary transformations.

Encoding Application: Tokenization and encoding are applied to the training, validation, and testing data by adding a new column with input IDs.

Label Encoding: Intent labels are converted into numerical format using label encoding. A single LabelEncoder instance is used for both training and validation labels.

Data Saving: The encoded input-ID arrays for each split (training, validation, testing) are written as datasets to a single HDF5 file (data.h5) using h5py.

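Model Artifacts Saving: The fitted LabelEncoder is saved using joblib, which is an efficient method for saving and loading Python objects. (The tokenizer is not saved by this cell; it is re-created from the pretrained "gpt2" checkpoint when needed.)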



In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.6/7.6 MB 48.2 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 295.0/295.0 kB 25.1 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 83.3 MB/s eta 0:00:00
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 61.7 MB/s eta 0:00:0

In [None]:
pip install --upgrade pandas

Collecting pandas
  Downloading pandas-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.3/12.3 MB 22.1 MB/s eta 0:00:00
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 341.8/341.8 kB 17.2 MB/s eta 0:00:00
Installing collected packages: tzdata, pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.5.3
    Uninstalling pandas-1.5.3:
      Successfully uninstalled pandas-1.5.3
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==1.5.3, but you have pandas 2.1.1 which is incompatible.
Successfully installed pandas-2.1.1 tzdata-2023.3


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer  # Assuming you're using a transformer-based model
from sklearn.preprocessing import LabelEncoder
import joblib
import h5py

# Model-specific tokenizer. A padding token is registered explicitly
# ('[PAD]' here; any suitable token can be used for padding).
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Read back the JSON Lines file written by the preprocessing step.
data = pd.read_json('preprocessed_dataset.json', orient='records', lines=True)

# 80% goes to training; the remaining 20% is halved into validation and test.
train_data, holdout_data = train_test_split(data, test_size=0.2, random_state=42)
valid_data, test_data = train_test_split(holdout_data, test_size=0.5, random_state=42)

# Tokenize and encode the text data
def encode_text(text):
    """Encode one utterance into a fixed-length array of token ids.

    Uses the module-level ``tokenizer``; the text is truncated or padded to
    ``max_length=128``.

    Returns:
        The first row of the NumPy array returned by ``tokenizer.encode``.
    """
    encode_options = {
        'add_special_tokens': True,
        'padding': 'max_length',
        'max_length': 128,
        'truncation': True,
        'return_tensors': 'np',  # ask for a NumPy array
    }
    batch_of_ids = tokenizer.encode(text, **encode_options)
    return batch_of_ids[0]  # convert to a 1D array

# Work on independent copies so that adding the 'input_ids' column below
# never writes into a slice of `data` (avoids SettingWithCopyWarning).
train_data = train_data.copy()
valid_data = valid_data.copy()
test_data = test_data.copy()

# Apply tokenization and encoding to training, validation, and testing data
for split_frame in (train_data, valid_data, test_data):
    split_frame['input_ids'] = split_frame['utterance_cleaned'].apply(encode_text)

# Define target labels (intents) for training and validation
train_labels = train_data['intent']
valid_labels = valid_data['intent']

# Use LabelEncoder to convert labels to numerical format. The encoder is
# fitted on every intent in the dataset: fitting on the training split alone
# makes `transform` raise for any intent that only occurs in the validation
# or test split. LabelEncoder sorts its classes, so the codes are the same
# as before whenever the training split already contains every intent.
label_encoder = LabelEncoder()
label_encoder.fit(data['intent'])
train_labels_encoded = label_encoder.transform(train_labels)
valid_labels_encoded = label_encoder.transform(valid_labels)
test_labels_encoded = label_encoder.transform(test_data['intent'])

# Save the label encoder for future use
joblib.dump(label_encoder, 'label_encoder.pkl')

# Stack the per-row id vectors into one 2-D array per split
train_data_array = np.vstack(train_data['input_ids'].to_numpy())
valid_data_array = np.vstack(valid_data['input_ids'].to_numpy())
test_data_array = np.vstack(test_data['input_ids'].to_numpy())




import numpy as np

# Assuming train_data, valid_data, and test_data are your data arrays

# Function to check if a string is numeric
def is_numeric(s):
    """Return True when ``float(s)`` succeeds, False otherwise.

    Args:
        s: Usually a string, but any object is accepted.

    Returns:
        ``False`` for values ``float`` rejects with ``ValueError`` (for
        example ``"abc"`` or ``""``), with ``TypeError`` (for example
        ``None`` or a list) or with ``OverflowError`` (an integer too large
        for a float); ``True`` otherwise.
    """
    try:
        float(s)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Vectorize the function to work with NumPy arrays. `otypes` is given so the
# call also works on an empty array (np.vectorize cannot infer the output
# type from zero elements).
is_numeric_vectorized = np.vectorize(is_numeric, otypes=[bool])


def _to_float32(values):
    """Return `values` as a float32 array, mapping non-numeric entries to 0.

    Numeric arrays are cast directly. Anything else goes through the string
    round-trip: every element is stringified, entries `is_numeric` rejects
    are replaced by '0', and the result is parsed as float32.
    """
    array = np.asarray(values)
    if array.dtype.kind in 'biuf':
        return array.astype(np.float32)
    as_text = array.astype(str)
    as_text[~is_numeric_vectorized(as_text)] = '0'
    return as_text.astype(np.float32)


# Convert the *encoded* matrices. Iterating a DataFrame (`for item in
# train_data`) yields its column labels, not its rows, so the previous code
# replaced every split with a short vector of zeros (one per column).
# float32 is kept as the target dtype; GPT-2 token ids are small enough to
# be represented exactly.
train_data = _to_float32(train_data_array)
valid_data = _to_float32(valid_data_array)
test_data = _to_float32(test_data_array)

# Create an HDF5 file and save the NumPy arrays as datasets (written once;
# mode 'w' truncates the file, so a second identical write added nothing).
with h5py.File('data.h5', 'w') as hf:
    for dataset_name, matrix in (
        ('train_data', train_data),
        ('valid_data', valid_data),
        ('test_data', test_data),
    ):
        hf.create_dataset(dataset_name, data=matrix)

# New Section

Code Summary:

This code prepares and saves text data for training a machine learning model using a GPT-2 tokenizer and HDF5 file format.

The key steps include:

Loading a preprocessed dataset from a JSON file.

Defining a GPT-2 tokenizer and adding a padding token.

Splitting the dataset into training, validation, and testing sets.

Tokenizing and encoding text data, returning it as NumPy arrays.

Encoding target labels using LabelEncoder.

Saving the label encoder for future use.

Converting Pandas DataFrames to NumPy arrays.

Creating an HDF5 file and saving the NumPy arrays as datasets.

This code streamlines data preprocessing and storage for model training.






In [None]:
pip install transformers




In [None]:
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

import h5py

In [None]:

# Inspect data.h5 in read mode and show the names stored at its top level.
with h5py.File('data.h5', 'r') as h5_file:
    top_level_keys = list(h5_file.keys())
    print("Keys: ", top_level_keys)



Keys:  []


In [None]:
# Scratch/setup portion of the training cell.
# NOTE(review): several statements below use names that are only defined
# further down in this cell or not at all in the visible notebook; the
# recorded output of this cell ends in a KeyError. Each one is flagged inline.

# Define hyperparameters
batch_size = 16  # You can set this to your desired batch size
epochs = 3
learning_rate = 2e-5

# Assuming you have a DataFrame 'data' with a column 'intent' that contains intent labels
num_classes = len(data['intent'].unique())

# ... (rest of your code)

# Create datasets
# NOTE(review): `IntentDataset` and `train_data_list` / `valid_data_list` are
# defined later in this cell, so on a fresh kernel these two lines raise
# NameError. The same four statements are repeated after those definitions.
train_dataset = IntentDataset(train_data_list, train_labels_encoded.tolist())
valid_dataset = IntentDataset(valid_data_list, valid_labels_encoded.tolist())

# Define batch size and create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size)

# Assuming valid_data is a list of dictionaries or numpy arrays
# NOTE(review): this rebinds `valid_labels`, which an earlier cell set to
# valid_data['intent'].
valid_input_ids = []
valid_labels = []

# Collect (input_ids, label) pairs from `valid_data`. Accepted element shapes:
# a dict with both 'valid_input_ids' and 'valid_labels', or an ndarray of
# shape (2,). Anything else is only printed.
# NOTE(review): the recorded output shows every element (0.0) taking the
# "invalid" branch, so both lists stay empty.
for item in valid_data:
    if isinstance(item, dict) and 'valid_input_ids' in item and 'valid_labels' in item:
        valid_input_ids.append(item['valid_input_ids'])
        valid_labels.append(item['valid_labels'])
    elif isinstance(item, np.ndarray) and item.shape == (2,):  # Assuming the shape of valid_data elements is (2,)
        valid_input_ids.append(item[0])
        valid_labels.append(item[1])
    else:
        # Handle the case where valid_data element does not have the expected structure
        print("Invalid element found in valid_data:", item)

# Now valid_input_ids and valid_labels contain the data if the keys are present in the dictionaries or the shape is as expected

import numpy as np
import os
print(os.getcwd())  # shows the working directory that relative paths resolve against


import pandas as pd

# Load CSV using Pandas
df = pd.read_csv('/Intentrecognition.csv', delimiter=',', encoding='utf-8-sig')

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
# NOTE(review): this rebinds `label_encoder` to a new, unfitted instance,
# replacing the encoder an earlier cell fitted and saved.
label_encoder = LabelEncoder()
# NOTE(review): bare column lookup with no effect besides raising if the
# column is missing; possibly the source of the recorded KeyError — confirm.
df['input_ids']

# Encode the non-numeric column
# NOTE(review): 'column_name' looks like a template placeholder — confirm the
# intended column.
df['column_name'] = label_encoder.fit_transform(df['column_name'])



# Verify the shape of the loaded data (it should be 2D)
print(test_data.shape)

# Manually create test_data as a list of tuples (input_ids, labels)
# NOTE(review): `input_ids_1`, `labels_1`, ... are not defined anywhere in the
# visible notebook, and the trailing `...` is a literal Ellipsis element that
# the `item[0]` indexing below cannot handle.
test_data = [(input_ids_1, labels_1), (input_ids_2, labels_2), ...]

# Access input_ids and labels separately
test_input_ids = [item[0] for item in test_data]
test_labels = [item[1] for item in test_data]

class IntentDataset(Dataset):
    """Pair encoded utterances with integer intent labels for a DataLoader.

    Each element of ``data`` may be either

    * a dict holding the token ids under ``'input_ids'`` or under a
      split-prefixed key ending in ``'_input_ids'`` (for example
      ``'train_input_ids'`` / ``'valid_input_ids'``), or
    * the token-id sequence itself (list, tuple, NumPy array or tensor).

    ``labels`` is an index-aligned sequence of integer class ids.
    """

    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _token_ids(item):
        """Return the token-id sequence stored in one element of ``data``.

        Raises:
            KeyError: If ``item`` is a dict without any ``*input_ids`` key.
        """
        if not isinstance(item, dict):
            return item
        if 'input_ids' in item:
            return item['input_ids']
        for key, value in item.items():
            if isinstance(key, str) and key.endswith('_input_ids'):
                return value
        # Substituting a length-1 zero tensor here (as before) hid key
        # mismatches and fed the model meaningless input, so fail loudly.
        raise KeyError(
            f"no 'input_ids' entry found; available keys: {sorted(map(str, item))}"
        )

    def __getitem__(self, idx):
        """Return ``{'input_ids': LongTensor, 'labels': LongTensor}`` for ``idx``."""
        input_ids = torch.as_tensor(self._token_ids(self.data[idx]), dtype=torch.long)
        labels = torch.tensor(self.labels[idx], dtype=torch.long)
        return {
            'input_ids': input_ids,
            'labels': labels
        }




# Define hyperparameters (set before anything below reads them)
batch_size = 16
epochs = 3
learning_rate = 2e-5

# Assuming you have a DataFrame 'data' with a column 'intent' that contains intent labels
num_classes = len(data['intent'].unique())

# Train on the GPU when one is available (`device` was previously undefined).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One dict per example. The ids come from the encoded matrices built in the
# preprocessing cell, and the key is 'input_ids' because that is what
# IntentDataset.__getitem__ looks up.
train_data_list = [
    {'input_ids': input_ids, 'labels': label}
    for input_ids, label in zip(train_data_array, train_labels_encoded)
]
valid_data_list = [
    {'input_ids': input_ids, 'labels': label}
    for input_ids, label in zip(valid_data_array, valid_labels_encoded)
]

# Create datasets
train_dataset = IntentDataset(train_data_list, train_labels_encoded.tolist())
valid_dataset = IntentDataset(valid_data_list, valid_labels_encoded.tolist())

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size)

# Calculate class weights for handling imbalanced classes. Weights are
# computed on the encoded labels and scattered into a vector of length
# `num_classes`, so the loss still gets one weight per logit when an intent
# is missing from the training split (such classes keep weight 1.0).
present_classes = np.unique(train_labels_encoded)
class_weights = np.ones(num_classes, dtype=np.float32)
class_weights[present_classes] = compute_class_weight(
    'balanced', classes=present_classes, y=train_labels_encoded
)

# Initialize the GPT-2 tokenizer and model. The inputs were padded with an
# added '[PAD]' token, so the model needs (a) an embedding row for that new
# id and (b) `pad_token_id`, which GPT2ForSequenceClassification requires to
# locate the last real token when the batch size is above 1.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=num_classes)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
model.to(device)

# Define the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * epochs,
)

# Define the loss function with class weights (on the same device as the logits)
loss_fn = nn.CrossEntropyLoss(
    weight=torch.tensor(class_weights, dtype=torch.float32).to(device)
)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        attention_mask = (input_ids != tokenizer.pad_token_id).long()
        outputs = model(input_ids, attention_mask=attention_mask)
        # Use the class-weighted loss; `outputs.loss` would be unweighted
        # and leave `loss_fn` unused.
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / max(len(train_dataloader), 1)
    print(f"Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss:.4f}")

# Validation loop
model.eval()
total_valid_loss = 0.0
correct_predictions = 0
total_predictions = 0
with torch.no_grad():
    for batch in valid_dataloader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        attention_mask = (input_ids != tokenizer.pad_token_id).long()
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        total_valid_loss += loss_fn(logits, labels).item()
        predictions = torch.argmax(logits, dim=1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

# max(..., 1) keeps an empty validation split from raising ZeroDivisionError.
avg_valid_loss = total_valid_loss / max(len(valid_dataloader), 1)
accuracy = correct_predictions / max(total_predictions, 1)

print(f"Validation Loss: {avg_valid_loss:.4f}, Accuracy: {accuracy * 100:.2f}%")

# Save the fine-tuned model, plus the tokenizer that carries the added
# '[PAD]' token the resized embedding matrix corresponds to.
model.save_pretrained("fine_tuned_gpt2_intent_model")
tokenizer.save_pretrained("fine_tuned_gpt2_intent_model")

Invalid element found in valid_data: 0.0
Invalid element found in valid_data: 0.0
Invalid element found in valid_data: 0.0
Invalid element found in valid_data: 0.0
Invalid element found in valid_data: 0.0
Invalid element found in valid_data: 0.0
Invalid element found in valid_data: 0.0
Invalid element found in valid_data: 0.0
/content


KeyError: ignored

In [None]:



# Data Cleaning
def clean_text(text):
    """Strip punctuation, lowercase, and drop English stop words.

    Returns:
        The surviving tokens joined by single spaces.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    normalised = text.translate(punctuation_table).lower()
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(normalised):
        if token not in english_stops:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Cleaned copy of each utterance
raw_utterances = data['utterance']
data['utterance_cleaned'] = raw_utterances.map(clean_text)

# Token list for each cleaned utterance
data['tokens'] = data['utterance_cleaned'].map(word_tokenize)

# Linguistic Flags
def adapt_response(text, flags):
    """Return a canned reply for flagged utterances, else the text itself.

    'P' selects the thank-you reply and 'Q' the "ask away" reply; when both
    flags are present the 'Q' reply wins because it is applied last.
    """
    canned_replies = (
        ('P', "Thank you for your question."),
        ('Q', "Sure thing, ask away!"),
    )
    response = text
    for flag, reply in canned_replies:
        if flag in flags:
            response = reply
    return response

# Row-wise reply built from each record's utterance and flags.
responses = data.apply(
    lambda record: adapt_response(record['utterance'], record['flags']),
    axis=1,
)
data['bot_response'] = responses

# Write the preprocessed frame to CSV without the index column.
data.to_csv('preprocessed_dataset.csv', index=False)


In [None]:
import re

def clean_text(text):
    """Clean an utterance: no punctuation, no digits, lowercase, no stop words.

    Returns:
        The remaining tokens joined by single spaces.
    """
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    without_digits = re.sub(r'\d+', '', without_punctuation)
    lowered = without_digits.lower()
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(lowered)
    return ' '.join(filter(lambda token: token not in stop_words, tokens))


In [None]:
from nltk.stem import PorterStemmer

def clean_text(text):
    """Clean and stem one utterance.

    This is the complete pipeline rather than an excerpt. The excerpt form
    read ``words`` before assigning it and returned ``None``, and because it
    is the last ``clean_text`` in the notebook, running it replaced the
    working definition with a broken one.

    Steps, in order: strip ASCII punctuation, lowercase, remove digit runs,
    tokenise, drop English stop words, Porter-stem, join with single spaces.

    Args:
        text: The raw utterance; non-string values (such as the NaN of an
            empty CSV cell) are treated as empty text.

    Returns:
        The cleaned, stemmed utterance as a space-separated string.
    """
    if not isinstance(text, str):
        return ''
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in word_tokenize(text) if word not in stop_words]
    # Apply stemming
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)


In [None]:
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords

def synonym_replacement(tokens):
    """Replace content words with their first WordNet lemma.

    Args:
        tokens: An iterable of word tokens.

    Returns:
        A new list of the same length. English stop words and tokens
        without any WordNet synset are kept unchanged; every other token is
        replaced by the name of the first lemma of its first synset.
    """
    # Build the stop-word set once; the previous code rebuilt the whole
    # list for every token.
    stop_words = frozenset(stopwords.words('english'))
    new_tokens = []
    for token in tokens:
        # The stop-word list is lowercase, so compare case-insensitively;
        # otherwise capitalised stop words such as "I" or "It" reach WordNet
        # and get rewritten.
        synsets = [] if token.lower() in stop_words else wn.synsets(token)
        new_tokens.append(synsets[0].lemmas()[0].name() if synsets else token)
    return new_tokens

def adapt_response(text, flags):
    """Adapt a response according to the row's linguistic flags.

    Handles every flag the notebook's first ``adapt_response`` handles, so
    running this cell no longer drops the 'P' and 'Q' behaviour:

    * ``'P'`` replaces the text with a thank-you message.
    * ``'Q'`` replaces the text with an "ask away" message (wins over 'P').
    * ``'L'`` passes the tokenised text through ``synonym_replacement``.

    Args:
        text: The utterance (or response) to adapt.
        flags: A container of flag letters, normally a string. ``None`` and
            float NaN (an empty CSV cell) are treated as "no flags".

    Returns:
        The adapted text.
    """
    if flags is None or isinstance(flags, float):
        flags = ''
    if 'P' in flags:
        text = "Thank you for your question."
    if 'Q' in flags:
        text = "Sure thing, ask away!"
    if 'L' in flags:
        tokens = word_tokenize(text)
        new_tokens = synonym_replacement(tokens)
        text = ' '.join(new_tokens)
    return text


In [None]:
# Write the preprocessed frame as JSON Lines: one record per line.
json_output_path = 'preprocessed_dataset.json'
data.to_json(json_output_path, orient='records', lines=True)
