In [10]:
import os
import json
from transformers import GPT2Tokenizer

# Module-level GPT-2 tokenizer, created once when this cell runs.
# calculate_review_lengths() reads this global to count tokens per review.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def calculate_review_lengths(directory, token_threshold):
    """Collect token-length statistics for the reviews stored under ``directory``.

    Every immediate sub-folder of ``directory`` that contains a ``gen.jsonl``
    file is scanned line by line. Each line is parsed as JSON; when the parsed
    value is an object with a ``"review"`` key, that review is encoded with the
    module-level ``tokenizer`` and its token count is added to the statistics.

    Lines that are not valid JSON are reported on stdout and skipped. Lines
    that are valid JSON but not an object (a bare number, string, ``null``,
    list, ...) or that have no ``"review"`` key are skipped silently.

    Args:
        directory: Path of the folder whose sub-folders hold ``gen.jsonl`` files.
        token_threshold: A review counts as "long" when its token count is
            strictly greater than this value.

    Returns:
        ``None`` when no review was found. Otherwise a dict with the keys
        ``max_length``, ``mean_tokens``, ``total_reviews``,
        ``long_token_reviews_count``, ``max_length_file`` and
        ``max_length_line_number`` (1-based line of the longest review).

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be read.
    """
    max_length = 0
    max_length_file = ""
    max_length_line_number = 0
    long_token_reviews_count = 0
    # Running totals are enough for the mean; no need to keep every length.
    total_tokens = 0
    total_reviews = 0

    # os.listdir() returns entries in arbitrary order. Sorting makes the run
    # order, and therefore which file wins a tie for the longest review,
    # deterministic across platforms.
    for dataset_folder in sorted(os.listdir(directory)):
        dataset_path = os.path.join(directory, dataset_folder)
        gen_file_path = os.path.join(dataset_path, "gen.jsonl")
        if not (os.path.isdir(dataset_path) and os.path.isfile(gen_file_path)):
            continue

        with open(gen_file_path, 'r', encoding='utf-8') as file:
            print("Processing file: ", gen_file_path)
            for line_number, line in enumerate(file, start=1):
                if line_number % 10000 == 0:
                    print("line_number: ", line_number)

                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Error decoding line {line_number} in {gen_file_path}")
                    continue

                # A line can be valid JSON without being an object; without
                # this guard `"review" in data` / `data["review"]` would raise
                # TypeError and abort the whole scan.
                if not isinstance(data, dict) or "review" not in data:
                    continue

                token_length = len(tokenizer.encode(data["review"]))
                total_tokens += token_length
                total_reviews += 1

                # Strict comparison: the first review reaching the maximum
                # keeps the "longest" slot.
                if token_length > max_length:
                    max_length = token_length
                    max_length_file = gen_file_path
                    max_length_line_number = line_number

                if token_length > token_threshold:
                    long_token_reviews_count += 1

    if total_reviews == 0:
        return None

    return {
        "max_length": max_length,
        "mean_tokens": total_tokens / total_reviews,
        "total_reviews": total_reviews,
        "long_token_reviews_count": long_token_reviews_count,
        "max_length_file": max_length_file,
        "max_length_line_number": max_length_line_number
    }




In [11]:
# Path to the main data folder.
# NOTE: this is an absolute, machine-specific path; adjust it to your checkout.
data_directory = 'C:/Users/Hector Auvinen/Documents/GitHub/CG4MCTG/data'

# Threshold for long reviews, in GPT-2 tokens (not characters): a review is
# counted as long when its token count is strictly greater than this value.
length_threshold = 256

# Call the function and print results
result = calculate_review_lengths(data_directory, length_threshold)

if result:
    print(f"Maximum review token length: {result['max_length']} GPT-2 tokens")
    print(f"Mean review token length: {result['mean_tokens']} GPT-2 tokens")
    print(f"Total number of reviews: {result['total_reviews']}")
    print(f"Number of reviews longer than {length_threshold} GPT-2 tokens: {result['long_token_reviews_count']}")
    print(f"Max review found in: {result['max_length_file']} at line {result['max_length_line_number']}")
else:
    # The function returns None when no review was found; say so explicitly
    # instead of leaving the cell output empty.
    print(f"No reviews found under {data_directory}")

Processing file:  C:/Users/Hector Auvinen/Documents/GitHub/CG4MCTG/data\Amazon\gen.jsonl


Token indices sequence length is longer than the specified maximum sequence length for this model (1964 > 1024). Running this sequence through the model will result in indexing errors


line_number:  1000
line_number:  2000
line_number:  3000
line_number:  4000
line_number:  5000
line_number:  6000
line_number:  7000
line_number:  8000
line_number:  9000
line_number:  10000
line_number:  11000
line_number:  12000
line_number:  13000
line_number:  14000
line_number:  15000
line_number:  16000
line_number:  17000
line_number:  18000
line_number:  19000
line_number:  20000
line_number:  21000
line_number:  22000
line_number:  23000
line_number:  24000
line_number:  25000
line_number:  26000
line_number:  27000
line_number:  28000
line_number:  29000
line_number:  30000
line_number:  31000
line_number:  32000
line_number:  33000
line_number:  34000
line_number:  35000
line_number:  36000
line_number:  37000
line_number:  38000
line_number:  39000
line_number:  40000
line_number:  41000
line_number:  42000
line_number:  43000
line_number:  44000
line_number:  45000
line_number:  46000
line_number:  47000
line_number:  48000
line_number:  49000
line_number:  50000
line_numb