In [None]:
# notebooks/EDA/eda.ipynb

# --- Environment setup ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- Data loading ---
# NOTE: unpickling can execute arbitrary code, so these files should only ever
# be artifacts written by this project's own preprocessing step.
SESSIONS_PICKLE = '../../data/processed_data/processed_sessions.pkl'
HITS_PICKLE = '../../data/processed_data/processed_hits.pkl'

sessions = pd.read_pickle(SESSIONS_PICKLE)
hits = pd.read_pickle(HITS_PICKLE)

# Quick sanity check on the size of both tables right after loading.
print(f"‚úÖ sessions.shape = {sessions.shape}")
print(f"‚úÖ hits.shape = {hits.shape}")

# --- 1. Initial overview ---
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\n--- Sessions Info ---")
sessions.info()
print("\n--- Hits Info ---")
hits.info()

# Relative frequency of each target value (normalize=True yields fractions).
print("\n--- Sessions Target Distribution ---")
print(sessions['target'].value_counts(normalize=True))

# --- 2. Target variable distribution ---
# Bar chart of the absolute number of sessions per target value.
plt.figure()
target_counts = sessions['target'].value_counts()
target_ax = target_counts.plot(kind='bar', color=['skyblue', 'salmon'])
target_ax.set_title('Target Distribution')
target_ax.set_xlabel('Target (0/1)')
target_ax.set_ylabel('Count')
plt.show()

# --- 3. Numeric features ---
numeric_features = ['visit_number', 'screen_width', 'screen_height']

# Summary statistics for the numeric columns.
print("\n--- Numeric Features Description ---")
numeric_summary = sessions[numeric_features].describe()
print(numeric_summary)

# One histogram (with a KDE overlay) per numeric column; missing values are
# dropped before plotting.
for feature in numeric_features:
    plt.figure()
    feature_values = sessions[feature].dropna()
    hist_ax = sns.histplot(feature_values, kde=True, bins=30)
    hist_ax.set_title(f'Distribution of {feature}')
    hist_ax.set_xlabel(feature)
    hist_ax.set_ylabel('Frequency')
    plt.show()

# --- 4. Categorical features ---
categorical_features = [
    'utm_source', 'utm_medium', 'utm_campaign', 'utm_adcontent', 'utm_keyword',
    'device_category', 'device_os', 'device_brand', 'device_model',
    'device_browser', 'geo_country', 'geo_city',
]

# Horizontal count plot per categorical column, bars ordered by frequency.
for feature in categorical_features:
    # Skip high-cardinality columns so the chart does not get too many bars.
    if sessions[feature].nunique() >= 30:
        continue

    plt.figure()
    frequency_order = sessions[feature].value_counts().index
    count_ax = sns.countplot(data=sessions, y=feature, order=frequency_order)
    count_ax.set_title(f'Distribution of {feature}')
    count_ax.set_xlabel('Count')
    count_ax.set_ylabel(feature)
    plt.show()

# --- 5. Correlation of numeric features with the target ---
# Pairwise correlations between the numeric columns and the target column.
correlation_columns = [*numeric_features, 'target']
corr_matrix = sessions[correlation_columns].corr()

# Annotated heatmap of the correlation matrix.
plt.figure()
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
heatmap_ax.set_title('Correlation Matrix')
plt.show()

# --- 6. User behavior ---
# How many events does each session have?
events_per_session = hits.groupby('session_id').size().rename('n_events')

# Attach the per-session event count to the sessions table.
# Any existing 'n_events' column is dropped first so that re-running this cell
# does not fail on overlapping column names in join().
sessions = (
    sessions
    .drop(columns='n_events', errors='ignore')
    .join(events_per_session, on='session_id')
)

# Sessions with no matching rows in hits get NaN from the left join; they have
# zero events. Only 'n_events' is filled: a frame-wide fillna(0) would also
# overwrite genuine missing values in every other column with 0.
sessions['n_events'] = sessions['n_events'].fillna(0).astype('int64')

# Distribution of the number of events per session.
# The x-axis is limited to 0..100 in the displayed view.
plt.figure()
events_hist_ax = sns.histplot(sessions['n_events'], bins=50, kde=True)
events_hist_ax.set_xlim(0, 100)
events_hist_ax.set_title('Number of Events per Session')
events_hist_ax.set_xlabel('Number of Events')
events_hist_ax.set_ylabel('Frequency')
plt.show()

# Box plot of events per session, split by target value.
# The y-axis is limited to 0..100 in the displayed view.
plt.figure()
events_box_ax = sns.boxplot(x='target', y='n_events', data=sessions)
events_box_ax.set_title('Events per Session by Target')
events_box_ax.set_xlabel('Target')
events_box_ax.set_ylabel('Number of Events')
events_box_ax.set_ylim(0, 100)
plt.show()

# --- 7. Top pages and referers ---
# Each top-10 table is computed once and reused for both the printed output
# and the chart, instead of running value_counts() over the full column twice.
print("\n--- Top Pages ---")
top_pages = hits['hit_page_path'].value_counts().head(10)
print(top_pages)

plt.figure()
top_pages.plot(kind='barh')
plt.title('Top 10 Pages')
plt.xlabel('Hits')
plt.ylabel('Page Path')
plt.show()

print("\n--- Top Referers ---")
top_referers = hits['hit_referer'].value_counts().head(10)
print(top_referers)

plt.figure()
top_referers.plot(kind='barh', color='orange')
plt.title('Top 10 Referers')
plt.xlabel('Hits')
plt.ylabel('Referer')
plt.show()

# --- 8. Analysis of actions (event_action) ---
# The top-10 table is computed once and reused for both the printed output and
# the chart, instead of running value_counts() over the full column twice.
print("\n--- Top Event Actions ---")
top_event_actions = hits['event_action'].value_counts().head(10)
print(top_event_actions)

plt.figure()
top_event_actions.plot(kind='bar')
plt.title('Top 10 Event Actions')
plt.xlabel('Event Action')
plt.ylabel('Count')
plt.show()

# --- 9. Final summary ---
# Headline numbers: row counts of both tables, the mean of the target column
# (the conversion rate when target is 0/1), and the mean events per session.
total_sessions = sessions.shape[0]
total_hits = hits.shape[0]
conversion_rate = sessions['target'].mean()
mean_events = sessions['n_events'].mean()

print("\n--- Summary ---")
print(f"–í—Å–µ–≥–æ —Å–µ—Å—Å–∏–π: {total_sessions}")
print(f"–í—Å–µ–≥–æ —Å–æ–±—ã—Ç–∏–π: {total_hits}")
print(f"–ö–æ–Ω–≤–µ—Ä—Å–∏—è (target=1): {conversion_rate:.4f}")
print(f"–°—Ä–µ–¥–Ω–µ–µ —á–∏—Å–ª–æ —Å–æ–±—ã—Ç–∏–π –Ω–∞ —Å–µ—Å—Å–∏—é: {mean_events:.2f}")

print("\nüìä –ë–∞–∑–æ–≤—ã–π EDA –∑–∞–≤–µ—Ä—à—ë–Ω.")
