In [79]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextGenerationPipeline, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch
import os

In [3]:
def preprocess_chat_data(chat_data):

    preprocessed_data = []
    for message in chat_data:
        cleaned_message = clean_text(message)    
        tokens = tokenize_message(cleaned_message)
        preprocessed_data.append(tokens)
    
    return preprocessed_data

In [4]:
def tokenize_message(message):
    tokens = word_tokenize(message)
    return tokens

In [5]:
def clean_text(message):
    cleaned_message = re.sub(r'http\S+|www\S+|https\S+', '', message)  # Remove URLs
    cleaned_message = re.sub(r'[^\w\s]|_', '', cleaned_message)  # Remove special characters except spaces and underscores
    cleaned_message = re.sub(r'\s+', ' ', cleaned_message)  # Remove extra whitespaces
    cleaned_message = re.sub(r'\d{6} \d{4}', '', cleaned_message)  # Remove timestamp (e.g., 040221 1003)
    cleaned_message = re.sub(r'\d{6}', '', cleaned_message)  # Remove timestamp without user identifier (e.g., 040221)
    cleaned_message = re.sub(r'<USER_IDENTIFIER>', '', cleaned_message)  # Remove user identifier
    

    return cleaned_message

In [6]:
file_path = "C:\\Users\\HP\\Desktop\\chatting.txt" 
with open(file_path, 'r', encoding='utf-8') as file:
    chat_data = file.readlines()

In [7]:
preprocessed_data = preprocess_chat_data(chat_data)

In [8]:
for message_tokens in preprocessed_data:
    print(message_tokens)

[]
['Manveen', 'Bro']
['Manveen', 'Manveen', 'here']
['Manveen', 'Bro']
['Manveen', 'Bro']
['Manveen', 'Bro']
['Manveen', 'Gayab', 'ho', 'gayi']
['Manveen', 'TEXT', 'KAR', 'DIYA', 'KARO', 'KABHI', 'KABHI']
['Raju', 'Yooo']
['Raju', 'Gayab', 'nahi', 'hui', 'main']
['Raju', 'Idhar', 'ich', 'hoon']
['Raju', 'Tuh', 'bata', 'kaisa', 'chalra', 'prep']
['Manveen', 'Broooo']
['Manveen', 'Kya', 'batau', 'yaar']
['Manveen', 'I', 'got', 'into', 'a', 'college', 'bro']
['Manveen', 'GRIET']
['Raju', 'Wtf']
['Raju', 'Hain']
['Manveen', 'Eamcet', 'bro']
['Raju', 'Batayi', 'nahi', 'tuh']
['Manveen', 'Arre', 'baap']
['Manveen', 'Ghar', 'ka', 'chutiya']
['Raju', 'Toh', 'kaisa', 'hain', 'college']
['Manveen', 'Theek', 'thaak', 'lol']
['Manveen', 'Aaj', 'call']
['Manveen', 'Sham', 'ko']
['Raju', 'haan', 'tonight']
['Raju', 'Yeah', 'yeah']
['Manveen', 'Sure', 'ok', 'cool']
['Raju', 'Main', 'toh', 'mar', 'rahi', 'hoon']
['Raju']
['Manveen', 'Broo']
['Manveen', 'Kaisi', 'hai', 'bro']
['Manveen', '2', 'months'

In [9]:
user1 = 'Manveen'
user2 = 'Raju'

user1_messages = []
user2_messages = []

current_user = None
current_messages = []

for chat in preprocessed_data:
    if len(chat) >= 2:
        user = chat[0]
        message = chat[1:]
        if user == user1:
            if current_user != user1:
                if current_messages:
                    user2_messages.append(current_messages)
                    current_messages = []
                current_user = user1
        elif user == user2:
            if current_user != user2:
                if current_messages:
                    user1_messages.append(current_messages)
                    current_messages = []
                current_user = user2

        current_messages.extend(message)

# Append the last set of messages
if current_user == user1 and current_messages:
    user1_messages.append(current_messages)
elif current_user == user2 and current_messages:
    user2_messages.append(current_messages)

# Print the messages
print("User 1 messages:")
for messages in user1_messages:
    print(messages)

print("User 2 messages:")
for messages in user2_messages:
    print(messages)

User 1 messages:
['Bro', 'Manveen', 'here', 'Bro', 'Bro', 'Bro', 'Gayab', 'ho', 'gayi', 'TEXT', 'KAR', 'DIYA', 'KARO', 'KABHI', 'KABHI']
['Broooo', 'Kya', 'batau', 'yaar', 'I', 'got', 'into', 'a', 'college', 'bro', 'GRIET']
['Eamcet', 'bro']
['Arre', 'baap', 'Ghar', 'ka', 'chutiya']
['Theek', 'thaak', 'lol', 'Aaj', 'call', 'Sham', 'ko']
['Sure', 'ok', 'cool']
['Broo', 'Kaisi', 'hai', 'bro', '2', 'months', 'ka', 'update', 'de']
['Uff', 'yaar']
['Hows', 'your', 'boyfriend']
['Ohhh', 'PHONE', 'PE']
['Gand', 'marane', 'Lol', 'Koi', 'baat', 'nai', 'kaara']
['Hehe', 'Wait', 'Kisko']
['Brooo', 'She', 'doesnt', 'tell', 'me', 'shit']
['Bro', 'Lamba', 'story', 'Oh', 'fuck', 'Class', 'started']
['No', 're', 'lol']
['I', 'WILL', 'Can', 'I', 'call', 'at', '5', 'ish']
['I', 'miss', 'u', 'so', 'much', 'omg']
['WhatsApp', 'call', 'balance', 'nahi', 'hai', 'bhai']
['Sorry', 'broooo', 'I', 'got', 'busy', 'with', 'assignments', 'Ill', 'call', 'tomorrow', '100', 'PERCET', 'Percent']
['Love', 'you', 'Achha

In [10]:
combined_messages = []
for i in range(len(user1_messages)):
    user1_message = user1_messages[i]
    user2_message = user2_messages[i]
    combined_messages.append((user1_message, user2_message))



In [11]:
for pair in combined_messages:
    user1_message = ' '.join(pair[0])
    user2_message = ' '.join(pair[1])
    print(f"User 1: {user1_message}")
    print(f"User 2: {user2_message}")
    print()

User 1: Bro Manveen here Bro Bro Bro Gayab ho gayi TEXT KAR DIYA KARO KABHI KABHI
User 2: Yooo Gayab nahi hui main Idhar ich hoon Tuh bata kaisa chalra prep

User 1: Broooo Kya batau yaar I got into a college bro GRIET
User 2: Wtf Hain

User 1: Eamcet bro
User 2: Batayi nahi tuh

User 1: Arre baap Ghar ka chutiya
User 2: Toh kaisa hain college

User 1: Theek thaak lol Aaj call Sham ko
User 2: haan tonight Yeah yeah

User 1: Sure ok cool
User 2: Main toh mar rahi hoon

User 1: Broo Kaisi hai bro 2 months ka update de
User 2: Padhai padhai padhai literally

User 1: Uff yaar
User 2: Bro mock mein mere marks ache aane lage so I got full happy so I stopped studying for a week Its like one roller coaster bro Then I joined the gym

User 1: Hows your boyfriend
User 2: Tera kaisa scene hain We met once in Jan thats all

User 1: Ohhh PHONE PE
User 2: Aur sab log kaise hain Hamare friends Asita Lallu

User 1: Gand marane Lol Koi baat nai kaara
User 2: Same bro Isiliye new years I texted

User 1: 

In [12]:
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [13]:
train_data = []
for user1_messages, user2_messages in zip(user1_messages, user2_messages):
    conversation = " ".join(user1_messages) + " " + " ".join(user2_messages)
    train_data.append(conversation)

In [14]:
input_ids = []
attention_masks = []
for conversation in train_data:
    encoded_inputs = tokenizer.encode_plus(conversation, add_special_tokens=True, truncation=True)
    input_ids.append(encoded_inputs["input_ids"])
    attention_masks.append(encoded_inputs["attention_mask"])

In [15]:
max_length = max(len(ids) for ids in input_ids)
input_ids = [ids + [tokenizer.pad_token_id] * (max_length - len(ids)) for ids in input_ids]
attention_masks = [masks + [0] * (max_length - len(masks)) for masks in attention_masks]

In [16]:
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [None]:
batch_size = 4
num_epochs = 3
learning_rate = 1e-4

# Set up the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [40]:
from transformers import TextDataset, DataCollatorForLanguageModeling

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=file_path,
    block_size=128  # Adjust the block size as needed
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    learning_rate=learning_rate,
    logging_steps=500,
    save_steps=1000
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

trainer.train()




Step,Training Loss
500,1.7398
1000,1.6145
1500,1.4982
2000,1.4156
2500,1.3942
3000,1.2973
3500,1.2947


TrainOutput(global_step=3591, training_loss=1.46061345363055, metrics={'train_runtime': 14469.2314, 'train_samples_per_second': 0.993, 'train_steps_per_second': 0.248, 'total_flos': 938103717888000.0, 'train_loss': 1.46061345363055, 'epoch': 3.0})

In [41]:
output_dir = os.path.join(os.path.expanduser("~"), "Desktop", "trained_model")
os.makedirs(output_dir, exist_ok=True)

# Save the trained model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('C:\\Users\\HP\\Desktop\\trained_model\\tokenizer_config.json',
 'C:\\Users\\HP\\Desktop\\trained_model\\special_tokens_map.json',
 'C:\\Users\\HP\\Desktop\\trained_model\\vocab.json',
 'C:\\Users\\HP\\Desktop\\trained_model\\merges.txt',
 'C:\\Users\\HP\\Desktop\\trained_model\\added_tokens.json')

In [42]:
# Load the trained model
model = GPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

# Set the model to evaluation mode
model.eval()





GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [77]:
def generate_response(user_input):
    input_ids = tokenizer.encode(user_input, add_special_tokens=True, return_tensors="pt").to(device)
    attention_mask = torch.ones_like(input_ids).to(device)

    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=30,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        temperature=0.7,  # Adjust the temperature value
        num_beams=30,  # Adjust the number of beams for beam search
    )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

In [55]:
# Generate a response given a user input
user_input = "Hello, how are you?"
input_ids = tokenizer.encode(user_input, add_special_tokens=True, return_tensors="pt").to(device)
attention_mask = torch.ones_like(input_ids).to(device)

# Generate the output with attention mask
output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=23, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

response = tokenizer.decode(output[0], skip_special_tokens=True)
print("Model response:", response)

Model response: Hello, how are you?
02/10/22, 01:04 - Raju: I'm fine


In [78]:
print("Model initialized. Enter 'exit' to quit.")
while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        break

    response = generate_response(user_input)
    print("Model: " + response)

Model initialized. Enter 'exit' to quit.
You: exit
