In [1]:
import pandas as pd
import csv
import re
from io import StringIO

# Step 8.1: Read the CSV with robust parsing
# Parser options shared by both attempts; on_bad_lines='warn' reports
# malformed rows and skips them instead of aborting the whole read.
read_kwargs = dict(quoting=csv.QUOTE_ALL, escapechar='\\', on_bad_lines='warn')
try:
    data = pd.read_csv('reddit_posts.csv', encoding='utf-8', **read_kwargs)
except (UnicodeDecodeError, pd.errors.ParserError) as e:
    # Only decode/parse failures are retried: a missing or unreadable file
    # would fail the fallback too, so those errors are left to propagate.
    print(f"Error reading CSV: {e}")
    # Fallback: decode leniently (invalid bytes become U+FFFD) and parse with
    # the slower but more tolerant Python engine. Repeating the first call
    # unchanged could never succeed where it had just failed.
    with open('reddit_posts.csv', 'r', encoding='utf-8', errors='replace', newline='') as file:
        raw_data = file.read()
    # No encoding argument here: the buffer already holds decoded text.
    data = pd.read_csv(StringIO(raw_data), engine='python', **read_kwargs)

# Step 8.2: Text Preprocessing for title and selftext
def clean_text(text):
    """Normalize a free-text field (title or selftext) to one clean line.

    Args:
        text: Raw cell value. Anything that is not a ``str`` (NaN, None,
            numbers, list-likes, ...) is treated as missing.

    Returns:
        The cleaned string: code points that cannot be encoded as UTF-8 are
        dropped, every run of whitespace (including newlines and tabs) is
        collapsed to a single space, ``http(s)`` URLs become ``[URL]``,
        Reddit user mentions (``u/name`` or ``/u/name``) become ``[USER]``,
        and surrounding whitespace is stripped. Missing input yields ``""``.
    """
    # NaN/None are not str, so one type check covers missing values too and
    # avoids calling pd.isna on list-like cells (ambiguous truth value).
    if not isinstance(text, str):
        return ""
    # Drops only what UTF-8 cannot encode (lone surrogates); emojis and other
    # valid non-ASCII characters pass through unchanged.
    text = text.encode('utf-8', errors='ignore').decode('utf-8')
    # \s already covers \n, \r and \t, so a single pass normalizes them all.
    text = re.sub(r'\s+', ' ', text)
    # Double quotes are deliberately left alone: the CSV writer escapes them
    # on output, and doubling them here as well would corrupt the data.
    # Simplify URLs first so a "/u/" path segment inside a URL is not
    # mistaken for a user mention.
    text = re.sub(r'https?://\S+', '[URL]', text)
    # The lookbehind keeps ordinary words such as "menu/options" intact, and
    # the explicit character class stops at trailing punctuation.
    text = re.sub(r'(?<![\w/])/?u/[A-Za-z0-9_-]+', '[USER]', text)
    # Trim leading/trailing spaces
    return text.strip()

# Apply cleaning to title and selftext
data['title'] = data['title'].apply(clean_text)
data['selftext'] = data['selftext'].apply(clean_text)

# Step 8.3: Validate and clean other columns
# Convert created_utc to datetime; unparseable values become NaT
# NOTE(review): if this column holds Unix epoch seconds, numeric values are
# read as nanoseconds here and unit='s' would be needed - confirm against the CSV.
data['created_utc'] = pd.to_datetime(data['created_utc'], errors='coerce')

# Drop rows with missing post_id
data = data.dropna(subset=['post_id'])

# Check for duplicates (reported only; duplicate rows are not removed)
print("Duplicate post_ids:", data['post_id'].duplicated().sum())

# Step 8.4: Basic validation
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()
print("\nMissing Values:")
print(data.isnull().sum())
print("\nRow Count:", len(data))

# Step 8.5: Save the corrected CSV
# QUOTE_ALL wraps every field in quotes; embedded double quotes are escaped
# by the writer, so the text columns need no manual quote handling.
data.to_csv('reddit_posts_fixed.csv', index=False, quoting=csv.QUOTE_ALL, escapechar='\\', encoding='utf-8')
print("\nFixed CSV saved to 'reddit_posts_fixed.csv'")

Duplicate post_ids: 0

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   post_id       500 non-null    object        
 1   title         500 non-null    object        
 2   author        496 non-null    object        
 3   score         500 non-null    int64         
 4   created_utc   500 non-null    datetime64[ns]
 5   num_comments  500 non-null    int64         
 6   selftext      500 non-null    object        
 7   url           500 non-null    object        
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 31.4+ KB
None

Missing Values:
post_id         0
title           0
author          4
score           0
created_utc     0
num_comments    0
selftext        0
url             0
dtype: int64

Row Count: 500

Fixed CSV saved to 'reddit_posts_fixed.csv'
