Cleaning our `jsonl` file and saving it as a `csv`.

In [5]:
import json
import csv
import os
import re

# Characters that are replaced by a plain ASCII space.
_SPACE_LIKE_CHARS = (
    '\u2028'  # Line Separator
    '\u2029'  # Paragraph Separator
    '\u00A0'  # Non-breaking space
    '\u2000'  # En quad
    '\u2001'  # Em quad
    '\u2002'  # En space
    '\u2003'  # Em space
    '\u2004'  # Three-per-em space
    '\u2005'  # Four-per-em space
    '\u2006'  # Six-per-em space
    '\u2007'  # Figure space
    '\u2008'  # Punctuation space
    '\u2009'  # Thin space
    '\u200A'  # Hair space
)

# Invisible characters that are deleted outright (they are not matched by
# the regex class ``\s``, so the whitespace collapse below would miss them).
_ZERO_WIDTH_CHARS = (
    '\u200B'  # Zero width space
    '\u200C'  # Zero width non-joiner
    '\u200D'  # Zero width joiner
    '\u2060'  # Word joiner
)

# Built once at import time so every call is a single pass over the string
# instead of one full pass per character being replaced.
_CLEAN_TEXT_TABLE = str.maketrans(
    _SPACE_LIKE_CHARS, ' ' * len(_SPACE_LIKE_CHARS), _ZERO_WIDTH_CHARS
)
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_text(text):
    """Normalize unusual separators and whitespace in *text*.

    Unicode line/paragraph separators and exotic space characters are
    replaced with a regular space, zero-width characters are deleted, every
    run of whitespace (including newlines and tabs) is collapsed to a single
    space, and leading/trailing whitespace is stripped.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or *text* unchanged if it is not a ``str``.
    """
    if not isinstance(text, str):
        return text

    text = text.translate(_CLEAN_TEXT_TABLE)
    return _WHITESPACE_RUN.sub(' ', text).strip()

def convert_jsonl_to_csv(input_file="raw_data/Amazon_Fashion.jsonl",
                         output_file="clean_data/amazon_fashion.csv"):
    """Convert a JSONL file of records into a CSV with a fixed column set.

    Each non-blank input line is parsed as a JSON object. The columns
    ``rating``, ``title``, ``text``, ``helpful_vote``, ``verified_purchase``,
    ``asin`` and ``user_id`` are extracted (missing keys become an empty
    string), string values of ``title`` and ``text`` are passed through
    ``clean_text``, and the row is appended to the CSV. Lines that cannot be
    parsed or processed are reported on stdout and skipped.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the CSV file to write. Its parent directory is
            created if it does not exist; an existing file is overwritten.

    Returns:
        None. Progress and a final summary are printed to stdout. If
        *input_file* does not exist an error is printed and nothing is
        written.
    """
    if not os.path.exists(input_file):
        print(f"Error: Input file '{input_file}' not found!")
        return

    # Columns to extract, in output order.
    columns = ['rating', 'title', 'text', 'helpful_vote', 'verified_purchase', 'asin', 'user_id']
    # Free-text columns that get whitespace normalization.
    text_columns = ('title', 'text')

    # Opening the output in 'w' mode fails if the directory is missing, so
    # make sure it exists first.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Number of rows successfully written (excludes the header).
    line_count = 0

    # newline='' plus an explicit lineterminator keeps row endings at '\n'
    # regardless of platform.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, lineterminator='\n')
        writer.writerow(columns)

        with open(input_file, 'r', encoding='utf-8') as jsonlfile:
            # line_number is the real position in the input file, so error
            # messages stay accurate even after earlier lines were skipped.
            for line_number, line in enumerate(jsonlfile, start=1):
                line = line.strip()
                if not line:
                    # Blank lines carry no record; skip them quietly.
                    continue

                try:
                    data = json.loads(line)

                    row = []
                    for col in columns:
                        value = data.get(col, '')
                        if col in text_columns and isinstance(value, str):
                            value = clean_text(value)
                        row.append(value)

                    writer.writerow(row)
                except json.JSONDecodeError as e:
                    # Leading \r overwrites any in-place progress message.
                    print(f"\rError parsing line {line_number}: {e}")
                    continue
                except Exception as e:
                    # Best-effort conversion: report the bad record (e.g. a
                    # JSON value that is not an object) and keep going.
                    print(f"\rError processing line {line_number}: {e}")
                    continue

                line_count += 1

                # Progress every 10000 rows, overwriting the previous message.
                if line_count % 10000 == 0:
                    print(f"\rProcessed {line_count} lines...", end='')

    # Finish the in-place progress line so later prints start on a new line.
    print("\r")
    print(f"Conversion complete! Processed {line_count} lines.")
    print(f"Output saved to: {output_file}")

# Run the conversion: reads the JSONL input, writes the cleaned CSV, and
# prints progress plus a final summary to stdout.
convert_jsonl_to_csv()

Processed 2500000 lines...
Conversion complete! Processed 2500939 lines.
Output saved to: clean_data/amazon_fashion.csv
