In [None]:
import os
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

In [None]:
restaurants = '/content/drive/MyDrive/διαλογθε_συστεμσ/span_extraction/restaurant8k/train_0.json'
buses = '/content/drive/MyDrive/διαλογθε_συστεμσ/span_extraction/dstc8/Buses_1/train_0.json'
events = '/content/drive/MyDrive/διαλογθε_συστεμσ/span_extraction/dstc8/Events_1/train_0.json'
homes = '/content/drive/MyDrive/διαλογθε_συστεμσ/span_extraction/dstc8/Homes_1/train_0.json'
car_rentals = '/content/drive/MyDrive/διαλογθε_συστεμσ/span_extraction/dstc8/RentalCars_1/train_0.json'
directories = [restaurants, buses, events, homes, car_rentals]

dataframes = {}

def calculate_metrics(df):
    # Total turn count
    total_turn_count = len(df)

    # Tokenize the text and calculate turn length
    df['turn_length'] = df['userInput'].apply(lambda x: len(x['text'].split()))

    # Tokenize the text and calculate sentence count
    df['sent_count'] = df['userInput'].apply(lambda x: len(nltk.sent_tokenize(x['text'])))

    # Total sentence count
    total_sent_count = df['sent_count'].sum()

    # Mean sentence count
    mean_sent_count = round(df['sent_count'].mean(), 3)

    # Standard deviation of sentence count
    std_sent_count = round(df['sent_count'].std(), 3)

    # Tokenize the text and calculate word count
    df['word_count'] = df['userInput'].apply(lambda x: len(nltk.word_tokenize(x['text'])))

    # Total word count
    total_word_count = df['word_count'].sum()

    # Calculate mean word length as average number of words (tokens) per sentence
    mean_word_length = round(total_word_count / total_sent_count, 3)

    # Calculate standard deviation of word length
    word_lengths = df['userInput'].apply(lambda x: [len(word) for word in nltk.word_tokenize(x['text'])])
    all_word_lengths = [length for sublist in word_lengths for length in sublist]
    std_word_length = round(np.std(all_word_lengths), 3)

    # Calculate vocabulary size (unique words)
    vocab_set = set()
    for text in df['userInput']:
        vocab_set.update(set(nltk.word_tokenize(text['text'])))
    vocab_size = len(vocab_set)

    # Calculate vocabulary size excluding stopwords
    stop_words = set(stopwords.words('english'))
    vocab_size_no_stopwords = len(vocab_set - stop_words)

    return {
        'total_turn_count': total_turn_count,
        'total_sent_count': total_sent_count,
        'mean_sent_count': mean_sent_count,
        'std_sent_count': std_sent_count,
        'total_word_count': total_word_count,
        'mean_word_length': mean_word_length,
        'std_word_length': std_word_length,
        'vocab_size': vocab_size,
        'vocab_size_no_stopwords': vocab_size_no_stopwords
    }

for directory in directories:
    if os.path.exists(directory):
        df = pd.read_json(directory)

        category = os.path.basename(os.path.dirname(directory))

        dataframes[category] = df
    else:
        print(f"{directory} does not exist.")

def extract_metrics(dataframes):
    metrics = {}
    for category, df in dataframes.items():
        metrics[category] = calculate_metrics(df)
    return metrics

metrics = extract_metrics(dataframes)

In [15]:
metrics

{'restaurant8k': {'total_turn_count': 8198,
  'total_sent_count': 8673,
  'mean_sent_count': 1.058,
  'std_sent_count': 0.254,
  'total_word_count': 68637,
  'mean_word_length': 7.914,
  'std_word_length': 2.354,
  'vocab_size': 4484,
  'vocab_size_no_stopwords': 4373},
 'Buses_1': {'total_turn_count': 1133,
  'total_sent_count': 1434,
  'mean_sent_count': 1.266,
  'std_sent_count': 0.495,
  'total_word_count': 11377,
  'mean_word_length': 7.934,
  'std_word_length': 2.074,
  'vocab_size': 513,
  'vocab_size_no_stopwords': 443},
 'Events_1': {'total_turn_count': 1498,
  'total_sent_count': 1906,
  'mean_sent_count': 1.272,
  'std_sent_count': 0.502,
  'total_word_count': 14562,
  'mean_word_length': 7.64,
  'std_word_length': 2.174,
  'vocab_size': 786,
  'vocab_size_no_stopwords': 706},
 'Homes_1': {'total_turn_count': 2064,
  'total_sent_count': 2636,
  'mean_sent_count': 1.277,
  'std_sent_count': 0.509,
  'total_word_count': 19733,
  'mean_word_length': 7.486,
  'std_word_length': 

In [16]:
dfs = []
for directory in directories:
    if os.path.exists(directory):
        df = pd.read_json(directory)
        dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)
combined_metrics = calculate_metrics(combined_df)

In [18]:
combined_metrics

{'total_turn_count': 13767,
 'total_sent_count': 15744,
 'mean_sent_count': 1.144,
 'std_sent_count': 0.387,
 'total_word_count': 123182,
 'mean_word_length': 7.824,
 'std_word_length': 2.281,
 'vocab_size': 5281,
 'vocab_size_no_stopwords': 5161}