# 01 — Coleta & Tratamento

In [None]:
import re, pandas as pd, numpy as np
from pathlib import Path

RAW_PATH = Path('../data/raw/Social Media Engagement Dataset.csv')
PROC_PATH = Path('../data/processed/social_media_clean.csv')

def to_snake(name: str) -> str:
    """Convert a column label to lowercase snake_case.

    Surrounding whitespace is removed, every run of non-word characters is
    replaced by an underscore, repeated underscores are collapsed to one,
    leading/trailing underscores are dropped, and the result is lowercased.
    """
    separated = re.sub(r"[^\w]+", "_", name.strip())
    collapsed = re.sub(r"__+", "_", separated)
    trimmed = collapsed.strip("_")
    return trimmed.lower()

def gerar_base_sintetica(n=2000, seed=42):
    """Build a reproducible synthetic social-media dataset.

    Args:
        n: Number of rows to generate.
        seed: Seed for the NumPy random generator.

    Returns:
        A DataFrame with integer interaction/feature columns, a categorical
        ``media_type`` column, and an ``engagement_score`` computed as
        ``likes + 3*comments + 2*shares``.
    """
    rng = np.random.default_rng(seed)

    # Columns are drawn one after another in this exact order so the random
    # stream (and therefore the data for a given seed) stays stable.
    columns = {}
    for col_name, upper in (
        ('likes', 5000),
        ('comments', 800),
        ('shares', 600),
        ('num_hashtags', 15),
        ('caption_length', 300),
    ):
        columns[col_name] = rng.integers(0, upper, n)
    columns['media_type'] = rng.choice(['image','video','carousel'], n)
    for col_name, upper in (('post_hour', 24), ('day_of_week', 7)):
        columns[col_name] = rng.integers(0, upper, n)

    synthetic = pd.DataFrame(columns)
    weighted_comments = 3*synthetic['comments']
    weighted_shares = 2*synthetic['shares']
    synthetic['engagement_score'] = synthetic['likes'] + weighted_comments + weighted_shares
    return synthetic

# Load the real dataset when the raw CSV is present; otherwise fall back to a
# synthetic base so the remaining steps can still run.
if RAW_PATH.exists():
    df = pd.read_csv(RAW_PATH)
    print('✅ CSV real encontrado.')
    # Normalize headers to snake_case so the alias lookups further down can
    # match them regardless of the original spacing/punctuation/case.
    df.columns = [to_snake(c) for c in df.columns]
else:
    print('⚠️ CSV não encontrado; gerando base sintética.')
    df = gerar_base_sintetica()

def first_col(df, candidates):
    """Return the first name in ``candidates`` that is a column of ``df``.

    Candidates are checked in the order given; ``None`` is returned when
    none of them is present.
    """
    present = (name for name in candidates if name in df.columns)
    return next(present, None)

# Resolve which column (if any) carries each concept. Each lookup returns the
# first alias present in df, or None when no alias matches.
likes_col = first_col(df, ['likes', 'like_count'])
comments_col = first_col(df, ['comments', 'comment_count'])
shares_col = first_col(df, ['shares', 'share_count'])
hashtags_col = first_col(df, ['hashtags', 'hashtag', 'num_hashtags'])
caption_col = first_col(df, ['caption', 'description', 'title', 'text'])
media_col = first_col(df, ['media_type', 'type', 'post_type'])
hour_col = first_col(df, ['post_hour', 'hour', 'posted_hour'])
dow_col = first_col(df, ['day_of_week', 'weekday', 'dow'])

# Expose aliased interaction counts under their canonical names.
if likes_col and likes_col != 'likes':
    df['likes'] = df[likes_col]
if comments_col and comments_col != 'comments':
    df['comments'] = df[comments_col]
if shares_col and shares_col != 'shares':
    df['shares'] = df[shares_col]

# Hashtag count: a numeric column is taken as an already-computed count;
# anything else is treated as text and its '#' occurrences are counted.
# Checking "is numeric" (rather than "is object") keeps non-object text
# dtypes out of the integer cast, where they would raise.
if hashtags_col:
    if pd.api.types.is_numeric_dtype(df[hashtags_col]):
        df['num_hashtags'] = df[hashtags_col].fillna(0).astype(int)
    else:
        df['num_hashtags'] = df[hashtags_col].astype(str).str.count(r'#')
else:
    df['num_hashtags'] = 0

# Caption length: derive it from the caption text when available. Missing
# captions count as length 0 (stringifying NaN would yield 'nan', length 3).
# When there is no text column, an existing caption_length column (e.g. from
# the synthetic base) is kept as-is instead of being overwritten with 0.
if caption_col:
    caption_text = df[caption_col]
    df['caption_length'] = caption_text.astype(str).str.len().where(caption_text.notna(), 0)
elif 'caption_length' not in df.columns:
    df['caption_length'] = 0

df['media_type'] = df[media_col].astype(str).str.lower() if media_col else 'image'
# Non-numeric or missing hour/weekday values are coerced to 0, then clamped
# to the valid range before the integer cast.
df['post_hour'] = pd.to_numeric(df[hour_col], errors='coerce').fillna(0).clip(0,23).astype(int) if hour_col else 12
df['day_of_week'] = pd.to_numeric(df[dow_col], errors='coerce').fillna(0).clip(0,6).astype(int) if dow_col else 0

# Engagement score: weighted interactions when likes exist; otherwise keep an
# existing engagement_score column or fall back to a proxy built from the
# hashtag count and caption length.
if 'likes' not in df.columns:
    df['engagement_score'] = df.get('engagement_score', df['num_hashtags']*10 + df['caption_length']*0.3).astype(float)
else:
    df['engagement_score'] = df['likes'] + 3*df.get('comments', 0) + 2*df.get('shares', 0)

# Bucket caption length; the open-ended last edge keeps very long captions in
# 'muito_longa' instead of leaving them unbinned.
df['caption_bins'] = pd.cut(df['caption_length'], bins=[-1,60,120,180,np.inf],
                            labels=['curta','média','longa','muito_longa'])
# Bucket the posting hour (0-23) into periods of the day.
df['periodo'] = pd.cut(df['post_hour'], bins=[-1,5,11,17,21,24],
                       labels=['madrugada','manhã','tarde','noite','late'])

# Persist the cleaned dataset, creating the output directory if needed.
PROC_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(PROC_PATH, index=False, encoding='utf-8-sig')
print('💾 Salvo em', PROC_PATH)
df.head()
