# Data Cleaning and Preprocessing

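This notebook validates, de-duplicates, and standardizes raw social media posts from `../data/raw_posts.csv`, derives engagement and time-based fields, and saves the result to `../data/cleaned_posts.csv`.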

In [None]:
import pandas as pd
import numpy as np
import datetime
from datetime import datetime
import warnings
# NOTE(review): with no category/module argument this silences every warning for
# the rest of the session, including any pandas warnings the cleaning cells may
# emit — consider narrowing the filter once those cells run warning-free.
warnings.filterwarnings('ignore')

In [None]:
# Read the raw post export from disk; the validation and de-duplication cells
# below both work from this frame.
df = pd.read_csv(filepath_or_buffer='../data/raw_posts.csv')
print(f"Loaded {len(df)} rows of raw data")
# Show the first five rows as the cell's rich output.
df.head(5)

In [None]:
# Validation report on the raw frame: counts of missing keys, repeated post_ids
# and negative metric values. Purely informational — nothing is modified here.
print(
    "Data validation report:",
    f"- Missing post_ids: {df['post_id'].isnull().sum()}",
    f"- Missing timestamps: {df['timestamp'].isnull().sum()}",
    f"- Duplicate post_ids: {df['post_id'].duplicated().sum()}",
    f"- Negative likes: {(df['likes'] < 0).sum()}",
    f"- Negative comments: {(df['comments'] < 0).sum()}",
    f"- Negative shares: {(df['shares'] < 0).sum()}",
    f"- Negative reach: {(df['reach'] < 0).sum()}",
    sep="\n",  # one line per entry, same layout as separate print calls
)

In [None]:
# Remove duplicates: keep the first row seen for each post_id.
# .copy() makes df_clean an independent frame, so the column assignments in the
# cleaning cell operate on their own data instead of a frame derived from df
# (which is what triggers pandas' SettingWithCopyWarning).
# NOTE(review): pandas' duplicate detection treats missing keys as equal, so rows
# with a null post_id would collapse into a single row here — confirm that is
# the intended handling for posts without an id.
df_clean = df.drop_duplicates(subset=['post_id'], keep='first').copy()
print(f"Removed {len(df) - len(df_clean)} duplicate rows")

In [None]:
# Clean and standardize data
# Work on an explicit copy so the assignments below never write into a frame
# derived from the raw data, and so re-running this cell gives the same result.
df_clean = df_clean.copy()

df_clean['timestamp'] = pd.to_datetime(df_clean['timestamp'])

# Normalize the categorical text columns to lower-case, trimmed values.
for text_col in ('platform', 'post_type'):
    df_clean[text_col] = df_clean[text_col].str.lower().str.strip()
df_clean['caption'] = df_clean['caption'].str.strip()

# Handle negative values: a negative count is invalid, so blank it out (NaN).
# Series.mask returns a new, cleanly up-cast column instead of writing None
# into a numeric column through .loc.
for metric_col in ('likes', 'comments', 'shares', 'reach'):
    df_clean[metric_col] = df_clean[metric_col].mask(df_clean[metric_col] < 0)

# Calculate derived fields
# Missing metric values count as zero interactions.
df_clean['interactions'] = (
    df_clean['likes'].fillna(0)
    + df_clean['comments'].fillna(0)
    + df_clean['shares'].fillna(0)
)

# Engagement rate is only defined for a known, positive reach. Previously a
# missing reach was replaced by 1 (making the "rate" equal to the raw
# interaction count) and a reach of 0 produced inf; both distorted the average.
# Such rows now get NaN, which pandas skips when aggregating.
valid_reach = df_clean['reach'].where(df_clean['reach'] > 0)
df_clean['engagement_rate'] = df_clean['interactions'] / valid_reach

# Calendar features taken from the parsed timestamp.
df_clean['hour'] = df_clean['timestamp'].dt.hour
df_clean['weekday'] = df_clean['timestamp'].dt.weekday
df_clean['day'] = df_clean['timestamp'].dt.date

# Posts without a caption get length 0.
df_clean['caption_length'] = df_clean['caption'].str.len().fillna(0)

print("Data cleaning completed")
df_clean.head()

In [None]:
# Persist the cleaned frame as CSV; index=False keeps the row index out of the file.
df_clean.to_csv(path_or_buf='../data/cleaned_posts.csv', index=False)
print("Cleaned data saved to ../data/cleaned_posts.csv")

In [None]:
# Summary of the cleaned frame: row count, distinct platforms and post types,
# covered date range and the mean engagement rate, printed one entry per line.
print("\n".join([
    "Summary of cleaned data:",
    f"- Total posts: {len(df_clean)}",
    f"- Platforms: {df_clean['platform'].unique()}",
    f"- Post types: {df_clean['post_type'].unique()}",
    f"- Date range: {df_clean['day'].min()} to {df_clean['day'].max()}",
    f"- Average engagement rate: {df_clean['engagement_rate'].mean():.4f}",
]))