In [21]:
import json
from statistics import mean, median
from collections import Counter


def analyze_json_strings(json_file_path):
    """Print length and character statistics for the values of a JSON object.

    The file is parsed with ``json.load`` and is expected to hold a JSON
    object whose values are strings. Each value is stripped of surrounding
    whitespace before it is measured.

    Printed statistics: minimum, maximum, median and mean stripped length,
    the mean number of newline characters per stripped value, and the five
    most common first and last characters, each as a percentage of the
    total number of values.

    Args:
        json_file_path: Path of the JSON file to analyze.

    Returns:
        None. The report is written to stdout.

    Raises:
        ValueError: If the JSON object contains no values.
    """
    # JSON text is UTF-8 by specification; do not rely on the platform default.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Strip each value once and reuse the result for every statistic.
    stripped = [v.strip() for v in data.values()]
    if not stripped:
        raise ValueError(f"No values to analyze in {json_file_path!r}")

    lengths = [len(s) for s in stripped]
    newlines = [s.count('\n') for s in stripped]

    # Values that are empty *after* stripping have no first/last character.
    # Filtering on the stripped text (rather than the raw value) avoids an
    # IndexError for whitespace-only values.
    non_empty = [s for s in stripped if s]
    first_char_counts = Counter(s[0] for s in non_empty).most_common(5)
    last_char_counts = Counter(s[-1] for s in non_empty).most_common(5)

    # Percentages are relative to all values, including whitespace-only ones.
    total_values = len(stripped)

    # Print results
    print("String Analysis Results:")
    print(f"Minimum length: {min(lengths)}")
    print(f"Maximum length: {max(lengths)}")
    print(f"Median length: {median(lengths):.2f}")
    print(f"Mean length: {mean(lengths):.2f}")
    print(f"Mean newlines per value: {mean(newlines):.2f}")

    for heading, counts in (
        ("\nTop 5 first characters:", first_char_counts),
        ("\nTop 5 last characters:", last_char_counts),
    ):
        print(heading)
        for char, count in counts:
            percentage = (count / total_values) * 100
            print(f"  '{char}': {percentage:.1f}%")

# Run the string analysis on one responses file.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs on the original author's machine; consider resolving it from a
# configurable data directory instead.
file_path = '/Users/christopherackerman/repos/spar_self_recognition/summaries/xsum_train_human_filteredlen_responses.json'
analyze_json_strings(file_path)


String Analysis Results:
Minimum length: 112
Maximum length: 290
Median length: 152.00
Mean length: 161.86
Mean newlines per value: 0.00

Top 5 first characters:
  'A': 18.8%
  'T': 15.5%
  'F': 7.2%
  'S': 5.8%
  'M': 5.8%

Top 5 last characters:
  '.': 100.0%


In [22]:
import json
from statistics import mean, median
from collections import Counter


def analyze_json_strings(json_file_path):
    """Print length and character statistics for the values of a JSON object.

    NOTE(review): a function with this same name is also defined in an
    earlier cell of this notebook; this definition shadows it once run.

    The file is parsed with ``json.load`` and is expected to hold a JSON
    object whose values are strings. Each value is stripped of surrounding
    whitespace before it is measured.

    Printed statistics: minimum, maximum, median and mean stripped length,
    the mean number of newline characters per stripped value, and the five
    most common first and last characters, each as a percentage of the
    total number of values.

    Args:
        json_file_path: Path of the JSON file to analyze.

    Returns:
        None. The report is written to stdout.

    Raises:
        ValueError: If the JSON object contains no values.
    """
    # JSON text is UTF-8 by specification; do not rely on the platform default.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Strip each value once and reuse the result for every statistic.
    cleaned = [value.strip() for value in data.values()]
    if not cleaned:
        raise ValueError(f"No values to analyze in {json_file_path!r}")

    lengths = [len(text) for text in cleaned]
    newlines = [text.count('\n') for text in cleaned]

    # Only values that still have content after stripping contribute a
    # first/last character; indexing a whitespace-only value would raise
    # IndexError.
    with_content = [text for text in cleaned if text]
    first_char_counts = Counter(text[0] for text in with_content).most_common(5)
    last_char_counts = Counter(text[-1] for text in with_content).most_common(5)

    # Percentages are relative to all values, including whitespace-only ones.
    total_values = len(cleaned)

    # Print results
    print("String Analysis Results:")
    print(f"Minimum length: {min(lengths)}")
    print(f"Maximum length: {max(lengths)}")
    print(f"Median length: {median(lengths):.2f}")
    print(f"Mean length: {mean(lengths):.2f}")
    print(f"Mean newlines per value: {mean(newlines):.2f}")

    for heading, counts in (
        ("\nTop 5 first characters:", first_char_counts),
        ("\nTop 5 last characters:", last_char_counts),
    ):
        print(heading)
        for char, count in counts:
            percentage = (count / total_values) * 100
            print(f"  '{char}': {percentage:.1f}%")

# Run the same string analysis on a second responses file.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs on the original author's machine; consider resolving it from a
# configurable data directory instead.
file_path = '/Users/christopherackerman/repos/spar_self_recognition/summaries/xsum_train_llama3_8bchat_filteredlen_responses.json'
analyze_json_strings(file_path)


String Analysis Results:
Minimum length: 130
Maximum length: 231
Median length: 171.00
Mean length: 173.99
Mean newlines per value: 0.00

Top 5 first characters:
  'A': 19.3%
  'T': 16.4%
  'S': 8.7%
  'B': 6.8%
  'C': 5.8%

Top 5 last characters:
  '.': 100.0%


In [29]:
import json
from statistics import mean, median
from collections import Counter, defaultdict

def analyze_texts(file_path):
    """Print per-source and overall text statistics for a JSON list of records.

    The file is parsed with ``json.load`` and is expected to hold a list of
    objects, each with a ``'source'`` and a ``'text'`` key. Texts are
    grouped by source; one report is printed per source (in order of first
    appearance) followed by a report labelled ``total`` covering every text.

    Each report contains the text count, the minimum, maximum, median and
    mean stripped length, the mean number of newline characters per
    stripped text, and the five most common first and last characters with
    their percentage of the text count and raw count.

    Args:
        file_path: Path of the JSON file to analyze.

    Returns:
        None. The reports are written to stdout.

    Raises:
        ValueError: If the file contains no records.
    """
    # JSON text is UTF-8 by specification; do not rely on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if not data:
        raise ValueError(f"No records to analyze in {file_path!r}")

    # Group data by source
    sources = defaultdict(list)
    for item in data:
        sources[item['source']].append(item['text'])

    def analyze_texts_list(texts):
        """Return the statistics dictionary for one list of texts."""
        # Strip once and reuse the result for every statistic.
        stripped = [t.strip() for t in texts]
        lengths = [len(t) for t in stripped]
        # Texts that are empty after stripping have no first/last character;
        # indexing them would raise IndexError.
        non_empty = [t for t in stripped if t]

        return {
            'count': len(texts),
            'min_length': min(lengths),
            'max_length': max(lengths),
            'median_length': median(lengths),
            'mean_length': mean(lengths),
            'mean_newlines': mean(t.count('\n') for t in stripped),
            'first_chars': Counter(t[0] for t in non_empty).most_common(5),
            'last_chars': Counter(t[-1] for t in non_empty).most_common(5)
        }

    # Keep the reports as (label, stats) pairs rather than a dict, so the
    # overall report cannot overwrite a real source that is named 'total'.
    reports = [(source, analyze_texts_list(texts)) for source, texts in sources.items()]
    reports.append(('total', analyze_texts_list([item['text'] for item in data])))

    # Print results
    for source, stats in reports:
        print(f"\nAnalysis for source: {source.upper()}")
        print(f"Total count: {stats['count']}")
        print(f"Minimum length: {stats['min_length']}")
        print(f"Maximum length: {stats['max_length']}")
        print(f"Median length: {stats['median_length']:.2f}")
        print(f"Mean length: {stats['mean_length']:.2f}")
        print(f"Mean newlines per text: {stats['mean_newlines']:.2f}")

        for heading, key in (
            ("\nTop 5 first characters:", 'first_chars'),
            ("\nTop 5 last characters:", 'last_chars'),
        ):
            print(heading)
            for char, count in stats[key]:
                percentage = (count / stats['count']) * 100
                print(f"  '{char}': {percentage:.1f}% ({count})")

        print("\n" + "="*50)

# Example usage: run the per-source analysis on one completions file.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs on the original author's machine; consider resolving it from a
# configurable data directory instead.
file_path = '/Users/christopherackerman/repos/spar_self_recognition/completions_full/completions_llama3_8bchat_train.json'
analyze_texts(file_path)


Analysis for source: FORUM
Total count: 202
Minimum length: 1
Maximum length: 7346
Median length: 1968.50
Mean length: 2116.76
Mean newlines per text: 12.32

Top 5 first characters:
  '.': 12.4% (25)
  'I': 9.4% (19)
  'A': 8.9% (18)
  't': 6.4% (13)
  'a': 6.4% (13)

Top 5 last characters:
  '.': 86.6% (175)
  '!': 6.9% (14)
  '?': 4.0% (8)
  'a': 1.0% (2)
  's': 0.5% (1)


Analysis for source: ABSTRACTS
Total count: 350
Minimum length: 87
Maximum length: 4746
Median length: 1898.00
Mean length: 1900.70
Mean newlines per text: 6.59

Top 5 first characters:
  '.': 17.1% (60)
  't': 12.3% (43)
  'o': 8.3% (29)
  'a': 7.7% (27)
  ',': 6.0% (21)

Top 5 last characters:
  '.': 100.0% (350)


Analysis for source: WIKIPEDIA
Total count: 93
Minimum length: 293
Maximum length: 3105
Median length: 1362.00
Mean length: 1443.76
Mean newlines per text: 5.04

Top 5 first characters:
  't': 15.1% (14)
  ',': 9.7% (9)
  'a': 9.7% (9)
  'w': 7.5% (7)
  '.': 7.5% (7)

Top 5 last characters:
  '.': 100