In [None]:
import os
import random
import string

def preprocess_query(query):
    query = query.lower().strip()  # Convert to lowercase and remove leading/trailing spaces
    query = " ".join(query.split())  # Replace multiple adjacent spaces with a single space
    return query

def read_aol_data(directory):
    queries = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            filepath = os.path.join(directory, filename)
            with open(filepath, "r", encoding="latin-1") as file:
                for line in file:
                    line = line.strip()
                    if line:
                        parts = line.split("\t")
                        if len(parts) >= 2:
                            query = preprocess_query(parts[1])
                            queries.append(query)
    return queries

def save_queries_to_file(queries, filename):
    with open(filename, "w", encoding="utf-8") as file:
        for query in queries:
            file.write(query + "\n")

def split_dataset(queries, train_ratio, val_ratio):
    random.shuffle(queries)
    total_size = len(queries)
    train_size = int(train_ratio * total_size)
    val_size = int(val_ratio * total_size)
    train_queries = queries[:train_size]
    val_queries = queries[train_size:train_size+val_size]
    test_queries = queries[train_size+val_size:]
    return train_queries, val_queries, test_queries

In [None]:
import torch
import torch.nn as nn

class QueryAutoCompletionModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super(QueryAutoCompletionModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.lstm(embedded)
        output = self.fc(output)
        return output

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from data import read_aol_data, save_queries_to_file, split_dataset
from model import QueryAutoCompletionModel

# Set random seed for reproducibility
torch.manual_seed(42)

# Define training parameters
data_directory = "/work/aol_data"
train_file = "train.txt"
val_file = "val.txt"
test_file = "test.txt"
embedding_size = 100
hidden_size = 200
batch_size = 64
num_epochs = 10
learning_rate = 0.001

# Read AOL data and preprocess queries
queries = read_aol_data(data_directory)

# Split dataset into train, validation, and test sets
train_queries, val_queries, test_queries = split_dataset(queries, 0.7, 0.15)

# Save queries to files
save_queries_to_file(train_queries, train_file)
save_queries_to_file(val_queries, val_file)
save_queries_to_file(test_queries, test_file)

# Create a dictionary of tokens
all_queries = train_queries + val_queries + test_queries
vocab = sorted(set(" ".join(all_queries).split()))
word_to_idx = {word: idx+1 for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Convert queries to numerical sequences
train_sequences = [[word_to_idx[word] for word in query.split()] for query in train_queries]
val_sequences = [[word_to_idx[word] for word in query.split()] for query in val_queries]
test_sequences = [[word_to_idx[word] for word in query.split()] for query in test_queries]

# Pad sequences to have the same length
max_seq_length = max(max(len(seq) for seq in train_sequences),
                     max(len(seq) for seq in val_sequences),
                     max(len(seq) for seq in test_sequences))
train_data = [seq + [0] * (max_seq_length - len(seq)) for seq in train_sequences]
val_data = [seq + [0] * (max_seq_length - len(seq)) for seq in val_sequences]
test_data = [seq + [0] * (max_seq_length - len(seq)) for seq in test_sequences]

# Convert data to PyTorch tensors
train_tensor = torch.LongTensor(train_data)
val_tensor = torch.LongTensor(val_data)
test_tensor = torch.LongTensor(test_data)

# Create data loaders
train_loader = DataLoader(train_tensor, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_tensor, batch_size=batch_size)
test_loader = DataLoader(test_tensor, batch_size=batch_size)

# Initialize the model
vocab_size = len(vocab) + 1  # +1 for padding token
model = QueryAutoCompletionModel(vocab_size, embedding_size, hidden_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        optimizer.zero_grad()

        # Forward pass
        output = model(batch)
        loss = criterion(output[:, :-1].reshape(-1, vocab_size), batch[:, 1:].reshape(-1))
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}")

# Evaluation
model.eval()
total_loss = 0.0

for batch in val_loader:
    with torch.no_grad():
        output = model(batch)
        loss = criterion(output[:, :-1].reshape(-1, vocab_size), batch[:, 1:].reshape(-1))
        total_loss += loss.item()

average_loss = total_loss / len(val_loader)
print(f"Validation Loss: {average_loss:.4f}")

# Save the best model
best_model_file = "best_model.pt"
torch.save(model.state_dict(), best_model_file)

# Load the best model
best_model = QueryAutoCompletionModel(vocab_size, embedding_size, hidden_size)
best_model.load_state_dict(torch.load(best_model_file))

# Evaluation on test set
best_model.eval()
total_loss = 0.0

for batch in test_loader:
    with torch.no_grad():
        output = best_model(batch)
        loss = criterion(output[:, :-1].reshape(-1, vocab_size), batch[:, 1:].reshape(-1))
        total_loss += loss.item()

average_loss = total_loss / len(test_loader)
perplexity = torch.exp(torch.tensor(average_loss))
print(f"Test Loss: {average_loss:.4f}, Perplexity: {perplexity:.4f}")

In [6]:
filename = "/work/aol_data/user-ct-test-collection-01.txt"

if filename.endswith(".txt"):
    with open(filename, "r") as file:
        content = file.read()
        print("The file is a text file.")
        print("File content:")
        print(content)
else:
    print("The file is not a text file.")


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b5db289b-3e20-4ae8-859e-975fa94ed9a8' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>