In [None]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem so the dataset under MyDrive is readable.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Loading

In [None]:
import pandas as pd


In [None]:
# Read the raw chat export from the mounted Drive into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/topical_chat.csv")

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

# Data Preprocessing

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Build the stop-word set and the stemmer once; both are reused for every message.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


def preprocess_message(text):
    """Lowercase, tokenize, filter and stem a single message.

    Parameters
    ----------
    text : str, list, or missing value
        Raw message text. A list is treated as an already-preprocessed token
        list and returned unchanged, so re-running this cell on the same
        DataFrame neither fails nor stems the tokens a second time. Any other
        non-string value (for example NaN) yields an empty list.

    Returns
    -------
    list of str
        Stemmed tokens; only alphanumeric tokens that are not English stop
        words are kept.
    """
    if isinstance(text, list):
        return text
    if not isinstance(text, str):
        return []
    tokens = nltk.word_tokenize(text.lower())
    return [stemmer.stem(word) for word in tokens if word.isalnum() and word not in stop_words]


# Lowercasing, tokenization, punctuation/stop-word removal and stemming in one pass
df['message'] = df['message'].apply(preprocess_message)

# Encode Sentiments into Numerical Values
label_encoder = LabelEncoder()
df['sentiment_encoded'] = label_encoder.fit_transform(df['sentiment'])

# Save the preprocessed dataset to a new CSV file if needed.
# NOTE: each token list is written to the CSV as its Python string
# representation (e.g. "['fan', 'googl']"), so cells that reload this file
# get strings, not lists.
df.to_csv('preprocessed_alexa_dataset.csv', index=False)

# Now, 'df' contains the preprocessed data with sentiments encoded numerically
# You can use this DataFrame for further analysis or modeling


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Text Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Load the preprocessed dataset (replace 'preprocessed_alexa_dataset.csv' with your dataset).
# The relative path matches the one the preprocessing cell writes to and the
# splitting cell reads from, so the notebook does not depend on the working
# directory being /content.
df = pd.read_csv('preprocessed_alexa_dataset.csv')

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,  # Adjust the number of features as needed
    ngram_range=(1, 2),  # Use unigrams and bigrams
    stop_words='english',  # Remove English stop words
)

# Fit and transform the vectorizer on the preprocessed text data.
# 'message' holds the stringified token lists from the CSV.
# NOTE(review): presumably the vectorizer's default token pattern drops the
# brackets and quotes and keeps only the word tokens - confirm.
X_tfidf = tfidf_vectorizer.fit_transform(df['message'])

# Now, 'X_tfidf' contains the TF-IDF vectors of your preprocessed text data
# You can use these vectors for modeling or analysis


# Data Splitting

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the preprocessed dataset (replace 'preprocessed_alexa_dataset.csv' with your dataset)
df = pd.read_csv('preprocessed_alexa_dataset.csv')

# Fixed seed so the train/validation/test assignment is identical on every run
SPLIT_SEED = 42

# Split ratios (adjust as needed); the test set receives whatever remains
# after the training and validation slices are taken.
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"

# Identify unique conversation IDs (as a plain list, which is what random's shuffle expects)
unique_conversation_ids = df['conversation_id'].unique().tolist()

# Shuffle the conversation IDs with a dedicated, seeded generator
import random
random.Random(SPLIT_SEED).shuffle(unique_conversation_ids)

# Split the conversation IDs into training, validation, and test sets
num_conversations = len(unique_conversation_ids)
num_train = int(train_ratio * num_conversations)
num_val = int(val_ratio * num_conversations)

train_conversations = unique_conversation_ids[:num_train]
val_conversations = unique_conversation_ids[num_train:num_train + num_val]
test_conversations = unique_conversation_ids[num_train + num_val:]

# Extract data samples for each set based on the conversation IDs.
# reset_index returns a new frame, so no filtered slice is modified in place.
train_data = df[df['conversation_id'].isin(train_conversations)].reset_index(drop=True)
val_data = df[df['conversation_id'].isin(val_conversations)].reset_index(drop=True)
test_data = df[df['conversation_id'].isin(test_conversations)].reset_index(drop=True)

# Sanity check: every row landed in exactly one of the three sets
assert len(train_data) + len(val_data) + len(test_data) == len(df), "split lost or duplicated rows"

# Now you have three separate datasets: train_data, val_data, and test_data,
# where conversations are not split across sets


# Model Training

In [None]:
pip install transformers


Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60.6 MB/s[0m eta [36m0:00:0

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset


In [None]:
# Preview the first five rows of the reloaded, preprocessed data.
df.head(n=5)

Unnamed: 0,conversation_id,message,sentiment,sentiment_encoded
0,1,"['fan', 'googl', 'microsoft']",Curious to dive deeper,1
1,1,"['excel', 'technolog', 'help', 'mani', 'way', ...",Curious to dive deeper,1
2,1,"['huge', 'fan', 'googl', 'use', 'lot', 'think'...",Curious to dive deeper,1
3,1,"['googl', 'provid', 'onlin', 'relat', 'servic'...",Curious to dive deeper,1
4,1,"['yeah', 'servic', 'good', 'fan', 'intrus', 'p...",Curious to dive deeper,1


In [None]:
import ast  # stdlib; only needed to parse the stringified token lists below

# Mini-batch size for the DataLoaders built at the end of this cell;
# lower it if the GPU runs out of memory.
BATCH_SIZE = 32


def _message_to_text(message):
    """Return one message as plain, space-separated text for the BERT tokenizer.

    The preprocessed CSV stores each message as the string form of a Python
    list (e.g. "['fan', 'googl']"). Passing that string to BERT directly makes
    it tokenize the brackets, quotes and commas too, which wastes most of the
    `max_length` budget on punctuation. Lists and list-like strings are joined
    into plain text; any other string is returned unchanged; missing values
    become an empty string.
    """
    if isinstance(message, str) and message.startswith('['):
        try:
            message = ast.literal_eval(message)
        except (ValueError, SyntaxError):
            return message  # not a list literal after all; keep the raw text
    if isinstance(message, (list, tuple)):
        return ' '.join(str(token) for token in message)
    if isinstance(message, str):
        return message
    return '' if pd.isna(message) else str(message)


# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize the text and create attention masks
texts = df['message'].apply(_message_to_text).tolist()
encoded_data = tokenizer.batch_encode_plus(
    texts,
    add_special_tokens=True,
    padding=True,
    truncation=True,
    max_length=64,  # Adjust this based on your dataset and hardware capabilities
    return_attention_mask=True,
    return_tensors='pt'
)

input_ids = encoded_data['input_ids']
attention_mask = encoded_data['attention_mask']
labels = torch.tensor(df['sentiment_encoded'].values)

# Build the loaders consumed by the training loop (`train_loader`) and the
# evaluation cell (`val_loader`). Rows are assigned by conversation ID so the
# conversation-level split from the Data Splitting cell is respected.
# NOTE(review): assumes `df` is the frame loaded in the Data Splitting cell and
# that `train_conversations` / `val_conversations` come from it - confirm.
train_rows = torch.tensor(df['conversation_id'].isin(train_conversations).to_numpy())
val_rows = torch.tensor(df['conversation_id'].isin(val_conversations).to_numpy())

train_loader = DataLoader(
    TensorDataset(input_ids[train_rows], attention_mask[train_rows], labels[train_rows]),
    batch_size=BATCH_SIZE,
    shuffle=True,  # reshuffle training batches every epoch
)
val_loader = DataLoader(
    TensorDataset(input_ids[val_rows], attention_mask[val_rows], labels[val_rows]),
    batch_size=BATCH_SIZE,
    shuffle=False,
)


In [None]:
import tensorflow as tf

# NOTE(review): this cell is an unfilled Keras template. The model has no
# layers and nothing in this notebook defines the arrays named below. The
# original cell contained this block twice; one copy is kept.

# Define your model
model = tf.keras.Sequential([
    # Add layers here
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define your training and testing data (e.g., training_padded, training_labels, testing_padded, testing_labels)

# Train the model - only when the template inputs exist, so that
# "Restart & Run All" does not stop here with a NameError.
num_epochs = 30
_required_inputs = ('training_padded', 'training_labels', 'testing_padded', 'testing_labels')
_missing_inputs = [name for name in _required_inputs if name not in globals()]
if _missing_inputs:
    print(f"Skipping Keras training; undefined inputs: {', '.join(_missing_inputs)}")
else:
    history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=2)


In [None]:
# Number of output classes, derived from the data instead of hard-coded.
# LabelEncoder assigns labels 0..k-1, so the largest label plus one is k.
num_classes = int(df['sentiment_encoded'].max()) + 1

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device);  # trailing ';' keeps the full module repr out of the cell output


In [None]:
# Use PyTorch's AdamW: the AdamW class exported by `transformers` is deprecated
# in favour of this one. eps and weight_decay are set explicitly to the
# defaults of the transformers implementation so the update rule is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)




In [None]:
# Multi-class cross-entropy, averaged over the batch (the default reduction).
# The training loop in this notebook reads the loss from `outputs.loss`, so
# this object is only needed if a loss is computed manually from logits.
criterion = torch.nn.CrossEntropyLoss(reduction='mean')


In [None]:
num_epochs = 5  # Adjust this based on your dataset size and training goals

for epoch in range(num_epochs):
    model.train()  # enable dropout etc. for every training epoch
    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        input_ids_batch, attention_mask_batch, labels_batch = (
            tensor.to(device) for tensor in batch
        )

        # Passing labels makes the model compute the classification loss itself.
        outputs = model(
            input_ids_batch,
            attention_mask=attention_mask_batch,
            labels=labels_batch,
        )
        loss = outputs.loss

        # Backpropagate, apply the update, then clear gradients for the next batch.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


In [None]:
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, data_loader, device):
    """Run `model` over `data_loader` and report classification metrics.

    Parameters
    ----------
    model : callable
        Called as ``model(input_ids, attention_mask=...)``; its result must
        expose ``.logits``, and the class is taken as the argmax over dim 1.
    data_loader : iterable
        Yields ``(input_ids, attention_mask, labels)`` tensor triples.
    device : torch.device
        Device the inputs are moved to before the forward pass.

    Returns
    -------
    dict
        Keys ``'Accuracy'``, ``'Precision'``, ``'Recall'``, ``'F1 Score'``.
        Precision, recall and F1 use weighted averaging across classes.
    """
    model.eval()  # Set the model to evaluation mode
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            batch_input_ids, batch_attention_mask, batch_labels = batch

            outputs = model(
                batch_input_ids.to(device),
                attention_mask=batch_attention_mask.to(device),
            )

            # Compute predicted labels
            predicted_labels = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            # Labels are only compared on the host, so they never need to visit the device.
            true_labels.extend(batch_labels.cpu().numpy())
            predictions.extend(predicted_labels)

    # zero_division=0 scores a class with no predicted/true samples as 0 - the
    # same value scikit-learn uses by default - without emitting the
    # undefined-metric warning.
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, average='weighted', zero_division=0)
    recall = recall_score(true_labels, predictions, average='weighted', zero_division=0)
    f1 = f1_score(true_labels, predictions, average='weighted', zero_division=0)

    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

# Score the model on the validation loader and print each metric to four decimals.
evaluation_results = evaluate_model(model, val_loader, device)

print("Evaluation Results:")
for metric in evaluation_results:
    value = evaluation_results[metric]
    print(f"{metric}: {value:.4f}")


Evaluation Results:
Accuracy: 0.4629
Precision: 0.4149
Recall: 0.4629
F1 Score: 0.3956


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Tokenizer and bare BERT encoder (no task head), both from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')
encoder = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')

# Define a function to encode user questions
def encode_user_query(user_query):
    """Encode a user query into a single BERT sentence vector.

    The query is tokenized with the notebook-level `tokenizer` and passed
    through the notebook-level `encoder`. Calling the tokenizer directly (not
    `tokenize` + `convert_tokens_to_ids`) adds the [CLS] and [SEP] special
    tokens, so position 0 of the output really is the [CLS] representation,
    and an empty query still yields a valid two-token input.

    Parameters
    ----------
    user_query : str
        Text to encode. Input longer than the model limit is truncated.

    Returns
    -------
    torch.Tensor
        Last-layer hidden state at position 0, shape [1, hidden_dim].
    """
    # Tokenize, add special tokens and build the attention mask in one call
    encoded = tokenizer(user_query, return_tensors='pt', truncation=True)

    # Encode the user query using the BERT model
    with torch.no_grad():
        encoder_output = encoder(**encoded)

    # Extract the last-layer hidden state of the [CLS] token
    cls_embedding = encoder_output.last_hidden_state[:, 0, :]  # Shape: [1, hidden_dim]

    return cls_embedding

# Example usage: encode one sample question
user_query = "How are you?"
encoded_query = encode_user_query(user_query=user_query)


In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Checkpoint shared by the tokenizer and the masked-language-model head
model_name = "bert-base-uncased"

# Pick the GPU when one is visible, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model, placing the model on the selected device.
# NOTE(review): this rebinds `model`, replacing the sequence-classification
# model assigned earlier in the notebook - confirm that is intended.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name).to(device)

# Function to generate responses
def generate_response(input_text, max_length=50):
    """Generate a reply for `input_text` with the notebook-level model.

    Encodes the text with the global `tokenizer`, moves the ids to the global
    `device`, asks the global `model` for one sequence of at most `max_length`
    tokens, and returns it decoded with special tokens removed.
    """
    prompt_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    generated = model.generate(prompt_ids, max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Example usage: generate and show a reply for one sample input
user_input = "How are you?"
response = generate_response(input_text=user_input)
print("Chatbot:", response)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Chatbot: how are you? hey.... hey......................................


In [None]:
def generate_response(user_input, model=None, tokenizer=None, max_length=50):
    """Generate a reply for `user_input`.

    Parameters
    ----------
    user_input : str
        Text to respond to.
    model : optional
        Object providing ``eval()``, ``device`` and ``generate(...)``.
        Defaults to the notebook-level `model`.
    tokenizer : optional
        Object providing ``encode(...)`` and ``decode(...)``. Defaults to the
        notebook-level `tokenizer`.
    max_length : int, default 50
        Passed to ``model.generate`` as the maximum sequence length.

    Returns
    -------
    str
        The first generated sequence, decoded with special tokens removed.
    """
    # Fall back to the notebook-level objects so a one-argument call keeps
    # working, while callers may also pass the model and tokenizer explicitly.
    if model is None:
        model = globals()['model']
    if tokenizer is None:
        tokenizer = globals()['tokenizer']

    model.eval()
    # The ids must live on the same device as the model's weights.
    input_ids = tokenizer.encode(user_input, return_tensors="pt").to(model.device)
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response


In [None]:
# Demo: run a few sample prompts through the chatbot.
# Requires `model`, `tokenizer` and `generate_response` from the cells above.
user_inputs = ["Hi there!", "How does this work?", "Tell me a joke."]

for user_input in user_inputs:
    response = generate_response(user_input, model, tokenizer)
    print("User:", user_input)
    # end="\n\n" prints the reply followed by one blank separator line
    print("Chatbot:", response, end="\n\n")
