In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import uuid


In [2]:

# Load the raw Reddit comments export from disk.
data = pd.read_csv('reddit_comments.csv')

# Step 2.1: Inspect the data
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
data.info()
print("\nMissing Values:")
print(data.isnull().sum())
print("\nDuplicate Comments (by comment_id):")
print(data['comment_id'].duplicated().sum())

# Step 2.2: Parse created_utc to datetime
data['created_utc'] = pd.to_datetime(data['created_utc'])

# Step 2.3: Basic EDA
# Number of unique posts and authors (nunique() ignores missing authors)
unique_posts = data['post_id'].nunique()
unique_authors = data['author'].nunique()
print(f"\nUnique Posts: {unique_posts}")
print(f"Unique Authors: {unique_authors}")

# Score distribution
print("\nScore Statistics:")
print(data['score'].describe())

# Check for AutoModerator comments (common in Reddit data).
# Summing the boolean mask counts matches without materialising a filtered
# copy of the frame; rows with a missing author compare as False.
automoderator_comments = int((data['author'] == 'AutoModerator').sum())
print(f"\nAutoModerator Comments: {automoderator_comments}")

# Step 2.4: Save the frame (with created_utc parsed) for further analysis
data.to_csv('reddit_comments_cleaned.csv', index=False)
print("\nCleaned data saved to 'reddit_comments_cleaned.csv'")

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7141 entries, 0 to 7140
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   post_id      7141 non-null   object
 1   comment_id   7141 non-null   object
 2   author       7043 non-null   object
 3   score        7141 non-null   int64 
 4   created_utc  7141 non-null   object
 5   body         7140 non-null   object
dtypes: int64(1), object(5)
memory usage: 334.9+ KB
None

Missing Values:
post_id         0
comment_id      0
author         98
score           0
created_utc     0
body            1
dtype: int64

Duplicate Comments (by comment_id):
0

Unique Posts: 500
Unique Authors: 2268

Score Statistics:
count    7141.000000
mean        1.845820
std         4.664144
min       -25.000000
25%         1.000000
50%         1.000000
75%         2.000000
max       132.000000
Name: score, dtype: float64

AutoModerator Comments: 499

Cleaned data saved to 'reddit_comments_cleaned.csv'

In [None]:
import pandas as pd
import csv
import re
from io import StringIO

# Step 1: Read the CSV with robust parsing
# Parser options shared by the primary and the fallback read so the two
# attempts cannot drift apart.
csv_read_options = dict(quoting=csv.QUOTE_ALL, escapechar='\\', on_bad_lines='warn')
try:
    data = pd.read_csv('reddit_comments.csv', encoding='utf-8', **csv_read_options)
except (UnicodeDecodeError, pd.errors.ParserError) as e:
    # Only decoding/tokenising failures are worth retrying; anything else
    # (e.g. a missing file) should surface immediately instead of being
    # masked by a second attempt that fails the same way.
    print(f"Error reading CSV: {e}")
    # Fallback: decode leniently (invalid bytes become U+FFFD) and parse the
    # text with the more tolerant pure-Python engine. Re-running the exact
    # same parse on the same bytes could never succeed where the first failed.
    with open('reddit_comments.csv', 'r', encoding='utf-8', errors='replace') as file:
        raw_data = file.read()
    csv_buffer = StringIO(raw_data)
    data = pd.read_csv(csv_buffer, engine='python', **csv_read_options)

# Step 2: Text Preprocessing for body column (NLP-inspired, no NLTK)
def clean_body_text(text):
    """Normalise a Reddit comment body into a single clean line of text.

    Parameters
    ----------
    text : object
        Raw cell value from the ``body`` column. Anything that is not a
        ``str`` (``None``, ``NaN``, ``pd.NA``, numbers, ...) is treated as
        missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` when the input is missing / not a string.
        Whitespace runs (including newlines and tabs) are collapsed to single
        spaces, URLs become ``[URL]``, user mentions (``u/name`` or
        ``/u/name``) become ``[USER]``, and surrounding whitespace is removed.
    """
    # A single type check covers None/NaN/pd.NA (none of which are str) and
    # avoids calling pd.isna on array-like values, where its result has no
    # unambiguous truth value.
    if not isinstance(text, str):
        return ""
    # Drop code points that cannot be encoded as UTF-8 (lone surrogates);
    # emojis and other valid non-ASCII characters pass through unchanged.
    text = text.encode('utf-8', errors='ignore').decode('utf-8')
    # Collapse every run of whitespace (newlines, carriage returns, tabs,
    # repeated spaces) into one space.
    text = re.sub(r'\s+', ' ', text)
    # NOTE: double quotes are intentionally left untouched. The CSV writer
    # already escapes them on output; doubling them here as well would corrupt
    # the text after a write/read round trip.
    # Handle common Reddit formatting (e.g., URLs, mentions)
    text = re.sub(r'https?://\S+', '[URL]', text)  # Simplify URLs
    # Match a mention only when "u/" starts a token (optionally written
    # "/u/"), and consume just the username characters so that ordinary words
    # such as "menu/items" and trailing punctuation are preserved.
    text = re.sub(r'(?<![\w/])/?u/[\w-]+', '[USER]', text)  # Simplify user mentions
    # Trim leading/trailing spaces
    return text.strip()

# Apply cleaning to body column (missing bodies become empty strings)
data['body'] = data['body'].apply(clean_body_text)

# Step 3: Validate and clean other columns
# Convert created_utc to datetime; values that cannot be parsed become NaT
# (errors='coerce') and are counted in the "Missing Values" report below.
data['created_utc'] = pd.to_datetime(data['created_utc'], errors='coerce')

# Drop rows with missing critical fields
data = data.dropna(subset=['comment_id', 'post_id'])

# Check for duplicates
print("Duplicate comment_ids:", data['comment_id'].duplicated().sum())

# Step 4: Basic validation
print("\nDataset Info:")
# DataFrame.info() prints its own report and returns None; wrapping it in
# print() would add a stray "None" line to the output.
data.info()
print("\nMissing Values:")
print(data.isnull().sum())
print("\nRow Count:", len(data))

# Step 5: Save the corrected CSV (every field quoted; the writer handles
# escaping of embedded quote characters)
data.to_csv('reddit_comments_fixed.csv', index=False, quoting=csv.QUOTE_ALL, escapechar='\\', encoding='utf-8')
print("\nFixed CSV saved to 'reddit_comments_fixed.csv'")

Duplicate comment_ids: 0

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7141 entries, 0 to 7140
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   post_id      7141 non-null   object        
 1   comment_id   7141 non-null   object        
 2   author       7043 non-null   object        
 3   score        7141 non-null   int64         
 4   created_utc  7141 non-null   datetime64[ns]
 5   body         7141 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 334.9+ KB
None

Missing Values:
post_id         0
comment_id      0
author         98
score           0
created_utc     0
body            0
dtype: int64

Row Count: 7141

Fixed CSV saved to 'reddit_comments_fixed.csv'
