## 리뷰 수 outlier 그래프 및 데이터프레임 추출

In [None]:
# IPython magics: step up one directory, then into the project folder.
# NOTE(review): presumably this makes the relative 'result/...' paths used
# below resolve against the project root — confirm the notebook's start dir.
%cd ../
%cd O2O-Text-Analysis-project

In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd

### Sentiment Data Load

In [None]:
# Load one CSV per app from the relative 'result/' directory. Each frame is
# later passed to the outlier functions, which read its 'date' column.
# NOTE(review): the file names suggest these are model-scored sentiment
# results; the exact column schema is not visible here — confirm.
df_yogiyo = pd.read_csv('result/yogiyo_sentiment_analyzed_with_model.csv')
# Note: the 'yeogi' file is bound to the name df_ddae.
df_ddae = pd.read_csv('result/yeogi_sentiment_analyzed_with_model.csv')
df_nol = pd.read_csv('result/nol_sentiment_analyzed_with_model.csv')
df_skyscanner = pd.read_csv('result/skyscanner_sentiment_analyzed_with_model.csv')
df_kakao = pd.read_csv('result/kakao_taxi_sentiment_analyzed_with_model.csv')
df_uber = pd.read_csv('result/uber_taxi_sentiment_analyzed_with_model.csv')

### 이상치 그래프 저장

In [None]:
def save_outlier_plots_by_df_dict(df_dict, years=range(2015, 2026), method='iqr', thresh=1.5, base_dir='outlier_plots'):
    """Save one daily-review-count plot per (dataset, year) with outliers marked.

    For every entry in ``df_dict`` a sub-directory ``<base_dir>/<name>`` is
    created, and for every year that has rows a PNG named
    ``<name>_<year>.png`` is written there. Days whose review count falls
    outside the computed bounds are drawn as red dots.

    Args:
        df_dict: Mapping of dataset label to DataFrame. Each frame needs a
            ``date`` column; if it is not already datetime it is converted
            in place, so the caller's frame is modified.
        years: Iterable of calendar years to plot.
        method: ``'iqr'`` uses quartile fences; any other value uses
            mean +/- ``thresh`` * standard deviation.
        thresh: Multiplier applied to the IQR or the standard deviation.
        base_dir: Root directory under which plots are written.

    Returns:
        None. Progress messages are printed.
    """
    for name, df in df_dict.items():
        print(f"\n📊 처리 중: {name}")
        save_dir = os.path.join(base_dir, name)
        os.makedirs(save_dir, exist_ok=True)

        # Parse the date column once, writing it back to the caller's frame.
        date_is_parsed = pd.api.types.is_datetime64_any_dtype(df['date'])
        if not date_is_parsed:
            df['date'] = pd.to_datetime(df['date'])

        for year in years:
            plt.figure(figsize=(18, 5))
            in_year = df['date'].dt.year == year
            rows = df[in_year]

            if rows.empty:
                print(f"⚠️ {name} - {year} 데이터 없음, 스킵")
                plt.close()
                continue

            # Every calendar day of the year as an 'MM-DD' label, so days
            # without any review show up as an explicit zero.
            calendar = pd.date_range(f'{year}-01-01', f'{year}-12-31').strftime('%m-%d')
            per_day = rows['date'].dt.strftime('%m-%d').value_counts().sort_index()
            per_day = per_day.reindex(calendar, fill_value=0)
            counts = per_day.values

            if method == 'iqr':
                as_series = pd.Series(counts)
                low_anchor = as_series.quantile(0.25)
                high_anchor = as_series.quantile(0.75)
                spread = high_anchor - low_anchor
            else:  # std
                low_anchor = high_anchor = counts.mean()
                spread = counts.std()

            floor = low_anchor - thresh * spread
            ceiling = high_anchor + thresh * spread
            flagged = (counts < floor) | (counts > ceiling)

            plt.plot(calendar, counts, label=f'{year}')
            plt.scatter(
                pd.Series(calendar)[flagged],
                counts[flagged],
                color='red',
                s=40,
                zorder=10,
                label=f'{year} 이상치',
            )

            plt.title(f'{name} - {year} 연도별 일별 리뷰 개수 (이상치 표시)')
            plt.xlabel('날짜 (MM-DD)')
            plt.ylabel('리뷰 개수')
            plt.legend()
            # One tick every 15 days keeps the day labels readable.
            plt.xticks(rotation=45, fontsize=8, ticks=range(0, 366, 15))
            plt.tight_layout()
            plt.grid(alpha=0.2)

            filename = os.path.join(save_dir, f'{name}_{year}.png')
            plt.savefig(filename)
            plt.close()
            print(f"✅ 저장됨: {filename}")

### 이상치 Data 추출

In [None]:
def merge_intervals(intervals):
    """Merge overlapping or touching ``(start, end)`` intervals.

    Args:
        intervals: Sequence of two-item ``(start, end)`` pairs, in any order.

    Returns:
        A list of ``[start, end]`` lists sorted by start, in which any
        interval whose start is at or before the previous end has been
        folded into that previous interval. An empty input yields ``[]``.
    """
    if not intervals:
        return []

    ordered = sorted(intervals, key=lambda pair: pair[0])
    first, rest = ordered[0], ordered[1:]
    result = [list(first)]

    for lo, hi in rest:
        _, prev_hi = result[-1]
        if lo > prev_hi:
            # Gap before this interval: it starts a new merged span.
            result.append([lo, hi])
        else:
            # Overlaps or touches the current span: stretch its end.
            result[-1][1] = max(prev_hi, hi)

    return result

def extract_merged_outlier_segments(
    df_dict,
    years=range(2015, 2026),
    method='iqr',
    thresh=1.5,
    window=14,
    output_dir='outlier_segments'
):
    """Export the reviews around each outlier day as per-segment CSV files.

    For every dataset, daily review counts are computed year by year and
    days outside the outlier bounds are collected. Each outlier day is
    widened to ``[day - window, day + window]``, overlapping windows are
    merged with ``merge_intervals``, and the rows falling in each merged
    window are written to
    ``<output_dir>/result/<name>_<start>~<end>.csv``.

    Args:
        df_dict: Mapping of dataset label to DataFrame. Each frame needs a
            ``date`` column; if it is not already datetime it is converted
            in place, so the caller's frame is modified.
        years: Iterable of calendar years to scan for outlier days.
        method: ``'iqr'`` uses quartile fences; any other value uses
            mean +/- ``thresh`` * standard deviation.
        thresh: Multiplier applied to the IQR or the standard deviation.
        window: Number of days kept on each side of an outlier day.
        output_dir: Root directory for the exported segments.

    Returns:
        None. Files are written as a side effect; empty segments are skipped.
    """
    os.makedirs(output_dir, exist_ok=True)
    half_width = pd.Timedelta(days=window)
    one_day = pd.Timedelta(days=1)

    for name, df in df_dict.items():
        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            df['date'] = pd.to_datetime(df['date'])

        outlier_dates = []
        for year in years:
            df_year = df[df['date'].dt.year == year]
            if df_year.empty:
                continue

            # Daily counts over the full calendar year; days with no review
            # are filled with 0 so they take part in the statistics.
            counts = df_year['date'].dt.date.value_counts().sort_index()
            all_days = pd.date_range(f'{year}-01-01', f'{year}-12-31').date
            counts = pd.Series(counts, index=all_days).fillna(0)
            vals = counts.values

            if method == 'iqr':
                q1, q3 = pd.Series(vals).quantile([0.25, 0.75])
                iqr = q3 - q1
                lower, upper = q1 - thresh * iqr, q3 + thresh * iqr
            else:
                mean, std = vals.mean(), vals.std()
                lower, upper = mean - thresh * std, mean + thresh * std

            outliers = counts[(counts < lower) | (counts > upper)].index
            outlier_dates.extend(pd.to_datetime(outliers))

        outlier_dates = sorted(set(outlier_dates))

        raw_windows = [(d - half_width, d + half_width) for d in outlier_dates]
        merged_windows = merge_intervals(raw_windows)

        for start, end in merged_windows:
            # Window bounds are midnight timestamps. Use a half-open upper
            # bound at the next midnight so rows carrying a time-of-day on
            # the last day are kept, matching the inclusive end date that is
            # written into the file name.
            in_window = (df['date'] >= start) & (df['date'] < end + one_day)
            segment_df = df[in_window]
            if segment_df.empty:
                continue

            filename = f"result/{name}_{start.date()}~{end.date()}.csv"
            out_path = os.path.join(output_dir, filename)
            # The file name carries a 'result/' sub-folder, and to_csv does
            # not create missing directories, so create it here.
            os.makedirs(os.path.dirname(out_path), exist_ok=True)
            segment_df.to_csv(out_path, index=False)

### 실행

In [None]:
# Pair each short app label with its loaded frame; the labels become the
# sub-directory and file-name prefixes of the generated outputs.
df_dict = dict(
    zip(
        ("ddae", "nol", "skyscanner", "yogiyo", "kakao", "uber"),
        (df_ddae, df_nol, df_skyscanner, df_yogiyo, df_kakao, df_uber),
    )
)

# Save the yearly plots first, then export the merged outlier segments.
save_outlier_plots_by_df_dict(df_dict, years=range(2015, 2026))
extract_merged_outlier_segments(df_dict)