In [1]:
import os
import pandas as pd
import numpy as np
import spacy
from nltk.corpus import stopwords
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [33]:
#word tokenization, excluding punctuation
def spacy_word_tokenize(text):
    doc = nlp(text)
    return [token.text for token in doc if not token.is_punct]

def spacy_sent_tokenize(text):
    doc = nlp(text)
    return [sent.text for sent in doc.sents]

def calculate_metrics(df):
    # Total turn count
    total_turn_count = len(df)

    # Tokenize the text and calculate sentence count
    df['sent_count'] = df['userInput'].apply(lambda x: len(spacy_sent_tokenize(x['text'])))

    # Total sentence count
    total_sent_count = df['sent_count'].sum()

    # Mean sentence count per turn
    mean_sent_count = round(df['sent_count'].mean(), 3)

    # Standard deviation of sentence count per turn
    std_sent_count = round(df['sent_count'].std(), 3)

    # Tokenize the text and calculate word count per turn
    df['word_count'] = df['userInput'].apply(lambda x: len(spacy_word_tokenize(x['text'])))
    #get sum of words in the whole dataframe
    total_word_count = df['word_count'].sum()
    # Calculate mean word count as average number of words (tokens) per turn
    mean_word_count = round(df['word_count'].mean(), 3)

    # Calculate standard deviation of word count per turn
    std_word_count = round(df['word_count'].std(), 3)

    # Calculate vocabulary size (unique words)
    vocab_list = []
    for text in df['userInput']:
        vocab_list.extend(spacy_word_tokenize(text['text']))
    vocab_size = len(set(vocab_list))

    # Calculate vocabulary size excluding stopwords
    stop_words = set(stopwords.words('english'))
    vocab_size_no_stopwords = len(set(vocab_list) - stop_words)

    return {
        'total_turn_count': total_turn_count,
        'total_sent_count': total_sent_count,
        'mean_sent_count': mean_sent_count,
        'std_sent_count': std_sent_count,
        'total_word_count': total_word_count,
        'mean_word_count': mean_word_count,
        'std_word_count': std_word_count,
        'vocab_size': vocab_size,
        'vocab_size_no_stopwords': vocab_size_no_stopwords
    }


In [34]:
restaurants = 'C:/Users/kleop/Documents/repos/Exercises/My_chatbot/hw1/span_extraction/restaurant8k/train_0.json'
buses = 'C:/Users/kleop/Documents/repos/Exercises/My_chatbot/hw1/span_extraction/dstc8/Buses_1/train_0.json'
events = 'C:/Users/kleop/Documents/repos/Exercises/My_chatbot/hw1/span_extraction/dstc8/Events_1/train_0.json'
homes = 'C:/Users/kleop/Documents/repos/Exercises/My_chatbot/hw1/span_extraction/dstc8/Homes_1/train_0.json'
car_rentals = 'C:/Users/kleop/Documents/repos/Exercises/My_chatbot/hw1/span_extraction/dstc8/RentalCars_1/train_0.json'
directories = [restaurants, buses, events, homes, car_rentals]

dataframes = {}
for directory in directories:
    if os.path.exists(directory):
        df = pd.read_json(directory)

        category = os.path.basename(os.path.dirname(directory))

        dataframes[category] = df
    else:
        print(f"{directory} does not exist.")

def extract_metrics(dataframes):
    metrics = {}
    for category, df in dataframes.items():
        metrics[category] = calculate_metrics(df)
    return metrics

metrics = extract_metrics(dataframes)

In [35]:
metrics

{'restaurant8k': {'total_turn_count': 8198,
  'total_sent_count': 8783,
  'mean_sent_count': 1.071,
  'std_sent_count': 0.283,
  'total_word_count': 62330,
  'mean_word_count': 7.603,
  'std_word_count': 4.738,
  'vocab_size': 4426,
  'vocab_size_no_stopwords': 4314},
 'Buses_1': {'total_turn_count': 1133,
  'total_sent_count': 1430,
  'mean_sent_count': 1.262,
  'std_sent_count': 0.491,
  'total_word_count': 9694,
  'mean_word_count': 8.556,
  'std_word_count': 4.459,
  'vocab_size': 501,
  'vocab_size_no_stopwords': 429},
 'Events_1': {'total_turn_count': 1498,
  'total_sent_count': 1911,
  'mean_sent_count': 1.276,
  'std_sent_count': 0.51,
  'total_word_count': 12209,
  'mean_word_count': 8.15,
  'std_word_count': 4.645,
  'vocab_size': 773,
  'vocab_size_no_stopwords': 692},
 'Homes_1': {'total_turn_count': 2064,
  'total_sent_count': 2621,
  'mean_sent_count': 1.27,
  'std_sent_count': 0.505,
  'total_word_count': 16701,
  'mean_word_count': 8.092,
  'std_word_count': 4.362,
  'v

In [16]:
dfs = []
for directory in directories:
    if os.path.exists(directory):
        df = pd.read_json(directory)
        dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)
combined_metrics = calculate_metrics(combined_df)

In [17]:
combined_metrics

{'total_turn_count': 13767,
 'total_sent_count': 15848,
 'mean_sent_count': 1.151,
 'std_sent_count': 0.398,
 'total_word_count': 108577,
 'mean_word_count': 7.887,
 'std_word_count': 4.74,
 'vocab_size': 5202,
 'vocab_size_no_stopwords': 5082}

In [57]:
unique_slot_info = {}
total_unique_slots = set()

for category, df in dataframes.items():
    if 'labels' in df.columns:
        # Extract unique slot values from the 'labels' field
        unique_values = df['labels'].apply(lambda x: [label['slot'] for label in x if 'slot' in label] if isinstance(x, list) else []).explode()
        unique_values = unique_values.dropna()  # Drop NaN values
        unique_slots = unique_values.unique()
        unique_slot_counts = len(unique_slots)

        unique_slot_info[category] = {'unique_slots': unique_slots, 'count': unique_slot_counts}
        total_unique_slots.update(unique_slots)

# Print unique slots and their counts for each category
for category, info in unique_slot_info.items():
    print(f"Unique slots and their counts in {category} dataset:")
    print("Unique Slots:", info['unique_slots'])
    print("Count:", info['count'])

# Total number of unique slots across all datasets
total_unique_slots_count = len(total_unique_slots)
print(f"Total number of unique slots across all datasets: {total_unique_slots_count}")


Unique slots and their counts in restaurant8k dataset:
Unique Slots: ['people' 'date' 'time' 'first_name' 'last_name']
Count: 5
Unique slots and their counts in Buses_1 dataset:
Unique Slots: ['to_location' 'leaving_date' 'from_location']
Count: 3
Unique slots and their counts in Events_1 dataset:
Unique Slots: ['city_of_event' 'subcategory' 'date' 'event_name']
Count: 4
Unique slots and their counts in Homes_1 dataset:
Unique Slots: ['area' 'visit_date']
Count: 2
Unique slots and their counts in RentalCars_1 dataset:
Unique Slots: ['dropoff_date' 'pickup_time' 'pickup_city' 'pickup_date']
Count: 4
Total number of unique slots across all datasets: 17
