# Domain Suggestion LLM - Experimentation

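This notebook contains experiments and analysis for the domain suggestion LLM project. It starts with an exploratory look at the training and evaluation datasets: the number of domain suggestions per business description, the distribution of domain extensions, the confidence scores, and a few sample records.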

In [None]:
# Import required libraries
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Load the data
# Both files are read as UTF-8 explicitly. Without an encoding argument, open()
# falls back to the platform's locale encoding, so the same JSON file could
# decode differently (or fail) from one machine to another.
# The later cells iterate over train_data / eval_data and read
# item['suggestions'], so each file is presumably a JSON array of records.
with open('../data/train_data.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('../data/eval_data.json', 'r', encoding='utf-8') as f:
    eval_data = json.load(f)

# Quick sanity check on how much data was loaded for each split.
print(f"Training samples: {len(train_data)}")
print(f"Evaluation samples: {len(eval_data)}")

In [None]:
# Analyze the data

# Distribution of number of suggestions per business description
num_suggestions_train = [len(item['suggestions']) for item in train_data]
num_suggestions_eval = [len(item['suggestions']) for item in eval_data]

# Build half-integer bin edges (k - 0.5, k + 0.5) shared by both panels, so every
# integer count gets its own bar centred on its tick. Integer edges such as
# range(1, 7) put 5 and 6 into the same bar, because the last histogram bin is
# closed on both sides, and silently drop any count outside 1-6.
observed_counts = num_suggestions_train + num_suggestions_eval
lowest_count = min(observed_counts, default=1)
highest_count = max(observed_counts, default=1)
suggestion_bins = [edge - 0.5 for edge in range(lowest_count, highest_count + 2)]
suggestion_ticks = list(range(lowest_count, highest_count + 1))

plt.figure(figsize=(12, 5))

# One panel per split: (subplot position, counts, bar colour, title).
suggestion_panels = (
    (1, num_suggestions_train, 'blue', 'Training Data: Number of Suggestions per Business'),
    (2, num_suggestions_eval, 'green', 'Evaluation Data: Number of Suggestions per Business'),
)
for position, counts, color, title in suggestion_panels:
    plt.subplot(1, 2, position)
    plt.hist(counts, bins=suggestion_bins, alpha=0.7, color=color)
    plt.title(title)
    plt.xlabel('Number of Suggestions')
    plt.ylabel('Frequency')
    plt.xticks(suggestion_ticks)

plt.tight_layout()
plt.show()

# Guard the division so an empty split reports "n/a" instead of raising
# ZeroDivisionError.
for split_name, counts in (('train', num_suggestions_train), ('eval', num_suggestions_eval)):
    if counts:
        print(f"Average suggestions ({split_name}): {sum(counts) / len(counts):.2f}")
    else:
        print(f"Average suggestions ({split_name}): n/a (no samples)")

In [None]:
# Analyze domain extensions
def extract_extension(domain):
    """Return the last dot-separated label of ``domain``, prefixed with a dot.

    The input is stripped of surrounding whitespace and lower-cased first, so
    "Shop.COM" and "shop.com" are counted as the same extension. Only the
    final label is returned: "shop.co.uk" yields ".uk", not ".co.uk".

    Args:
        domain: Domain name string, e.g. "example.com".

    Returns:
        The extension including its leading dot (e.g. ".com"), or the string
        'no_extension' when the name contains no dot or nothing follows the
        last dot (e.g. "example.").
    """
    # rpartition splits on the LAST dot; `dot` is empty when there is none.
    _, dot, suffix = domain.strip().lower().rpartition('.')
    if dot and suffix:
        return '.' + suffix
    return 'no_extension'

# Collect all domains
all_domains_train = [suggestion['domain'] for item in train_data for suggestion in item['suggestions']]
all_domains_eval = [suggestion['domain'] for item in eval_data for suggestion in item['suggestions']]

# Extract extensions
extensions_train = [extract_extension(domain) for domain in all_domains_train]
extensions_eval = [extract_extension(domain) for domain in all_domains_eval]

# Count extensions
extension_counts_train = Counter(extensions_train)
extension_counts_eval = Counter(extensions_eval)

# Use a single x-axis order for both panels, most frequent extension first
# (ranked on the combined train + eval counts). Plotting each Counter in its
# own first-seen order made the two panels hard to compare, because the same
# extension could land in a different position in each.
combined_counts = extension_counts_train + extension_counts_eval
extension_order = [extension for extension, _count in combined_counts.most_common()]

plt.figure(figsize=(12, 5))

# One panel per split: (subplot position, counts, bar colour, title).
extension_panels = (
    (1, extension_counts_train, 'blue', 'Training Data: Domain Extensions'),
    (2, extension_counts_eval, 'green', 'Evaluation Data: Domain Extensions'),
)
for position, counts, color, title in extension_panels:
    plt.subplot(1, 2, position)
    # A Counter returns 0 for a missing key, so an extension absent from this
    # split is drawn as an empty bar instead of shifting the other bars.
    heights = [counts[extension] for extension in extension_order]
    plt.bar(extension_order, heights, color=color, alpha=0.7)
    plt.title(title)
    plt.xlabel('Extension')
    plt.ylabel('Count')
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Analyze confidence scores
# Flatten every suggestion's 'confidence' value into one list per split.
confidence_scores_train = [
    entry['confidence']
    for record in train_data
    for entry in record['suggestions']
]
confidence_scores_eval = [
    entry['confidence']
    for record in eval_data
    for entry in record['suggestions']
]

plt.figure(figsize=(12, 5))

# One histogram panel per split: (subplot position, scores, colour, title).
confidence_panels = (
    (1, confidence_scores_train, 'blue', 'Training Data: Confidence Score Distribution'),
    (2, confidence_scores_eval, 'green', 'Evaluation Data: Confidence Score Distribution'),
)
for slot, scores, shade, heading in confidence_panels:
    plt.subplot(1, 2, slot)
    plt.hist(scores, bins=20, alpha=0.7, color=shade)
    plt.title(heading)
    plt.xlabel('Confidence Score')
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

# Arithmetic mean of the scores in each split, shown to two decimals.
mean_confidence_train = sum(confidence_scores_train) / len(confidence_scores_train)
print(f"Average confidence (train): {mean_confidence_train:.2f}")
mean_confidence_eval = sum(confidence_scores_eval) / len(confidence_scores_eval)
print(f"Average confidence (eval): {mean_confidence_eval:.2f}")

In [None]:
# Sample some business descriptions
print("Sample Business Descriptions from Training Data:")
# Slice rather than index with range(5): a training set with fewer than five
# records then prints what is available instead of raising IndexError.
for i, item in enumerate(train_data[:5], start=1):
    print(f"{i}. {item['business_description']}")
    # List each suggested domain with its confidence, numbered from 1.
    for j, suggestion in enumerate(item['suggestions'], start=1):
        print(f"   {j}. {suggestion['domain']} (confidence: {suggestion['confidence']})")
    print()