In [25]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import logging
from pathlib import Path
import pandas as pd
import numpy as np


def setup_logger():
    """Build the 'data_preprocessing' logger used to report pipeline progress.

    Handlers already attached to that logger are removed first, so calling
    this function repeatedly leaves exactly one stream handler in place.

    Returns:
        The configured logger: INFO level, one ``StreamHandler`` with a
        timestamped ``[time] LEVEL: message`` format.
    """
    log = logging.getLogger('data_preprocessing')
    log.setLevel(logging.INFO)

    # Start from a clean slate so repeated calls do not duplicate output lines.
    log.handlers.clear()

    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(
        logging.Formatter('[%(asctime)s] %(levelname)s: %(message)s')
    )
    log.addHandler(console)
    return log


def find_project_root(start_path: Path = None, marker: str = 'raw_data'):
    """Locate the project root by walking upwards from a starting directory.

    The root is the nearest directory (the start itself or one of its
    ancestors) that contains a ``data/<marker>`` sub-directory.

    Args:
        start_path: Directory to start from; the current working directory is
            used when it is ``None`` (or otherwise falsy).
        marker: Name of the folder expected inside ``data``.

    Returns:
        The resolved ``Path`` of the first matching directory.

    Raises:
        FileNotFoundError: If neither the start directory nor any of its
            ancestors contains ``data/<marker>``.
    """
    start = Path(start_path or Path.cwd()).resolve()
    for candidate in (start, *start.parents):
        if (candidate / 'data' / marker).is_dir():
            return candidate
    # Report the marker that was actually searched for instead of a fixed name;
    # with the default marker the message is unchanged.
    raise FileNotFoundError(
        f'Не удалось найти корень проекта с папкой data/{marker}'
    )


def setup_paths(base_dir: Path = None):
    """Resolve the raw and processed data directories under the project root.

    The root is obtained from ``find_project_root(base_dir)``. The processed
    directory is created when it does not exist yet; the raw directory is only
    referenced, never created here.

    Returns:
        A dict with the keys ``'raw'`` and ``'processed'`` mapping to paths.
    """
    data_dir = find_project_root(base_dir) / 'data'
    locations = {
        'raw': data_dir / 'raw_data',
        'processed': data_dir / 'processed_data',
    }
    locations['processed'].mkdir(parents=True, exist_ok=True)
    return locations


def load_data(paths: dict, logger: logging.Logger):
    """Read the ``ga_sessions`` and ``ga_hits`` tables from ``paths['raw']``.

    For each table a ``.pkl`` file is preferred; when it is absent the ``.csv``
    file with the same stem is read instead.

    Args:
        paths: Mapping that provides the raw data directory under ``'raw'``.
        logger: Logger that receives the load / error messages.

    Returns:
        A ``(sessions, hits)`` tuple of DataFrames.

    Raises:
        FileNotFoundError: If neither the pickle nor the CSV file exists for a
            table.
    """
    raw_dir = paths['raw']

    def _read_table(name):
        # Pickle wins over CSV when both are present.
        pkl = raw_dir / f'{name}.pkl'
        if pkl.exists():
            frame = pd.read_pickle(pkl)
            logger.info(f'✔ Loaded "{name}" from PKL, shape={frame.shape}')
            return frame

        csv = raw_dir / f'{name}.csv'
        if csv.exists():
            frame = pd.read_csv(csv)
            logger.info(f'✔ Loaded "{name}" from CSV, shape={frame.shape}')
            return frame

        msg = f'"{name}" not found in {pkl} or {csv}'
        logger.error(msg)
        raise FileNotFoundError(msg)

    # Sessions are read before hits, matching the order of the log output.
    sessions = _read_table('ga_sessions')
    hits = _read_table('ga_hits')
    return sessions, hits


def inspect_df(df: pd.DataFrame, name: str, logger: logging.Logger):
    """Log basic statistics of a frame: shape, dtypes, missing values, duplicates.

    Args:
        df: Frame to describe; it is not modified.
        name: Label used in the header line of the report.
        logger: Logger that receives every report line at INFO level.
    """
    logger.info(f'--- Inspecting "{name}" ---')
    logger.info(f'Shape: {df.shape}')

    logger.info('Dtypes:')
    for column, dtype in df.dtypes.items():
        logger.info(f'    {column}: {dtype}')

    # Keep only the columns that actually contain missing values.
    null_counts = df.isnull().sum()
    with_nulls = null_counts[null_counts > 0]
    if with_nulls.empty:
        logger.info('No missing values.')
    else:
        logger.info('Missing values:')
        for column, count in with_nulls.items():
            logger.info(f'    {column}: {count}')

    # Full-row duplicates; up to three of them are shown as an example.
    dup_mask = df.duplicated()
    dup_total = dup_mask.sum()
    logger.info(f'Duplicate rows: {dup_total}')
    if dup_total:
        logger.info('Example duplicates:')
        logger.info(df[dup_mask].head(3).to_string())


def process_datetime(df: pd.DataFrame, date_col: str, time_col: str, new_col: str, logger: logging.Logger):
    """Merge a date column and a time column into a single timestamp column.

    Both source columns are converted to stripped strings, joined with a space
    and parsed; values that cannot be parsed become ``NaT`` and their count is
    logged as a warning. The frame is modified in place: ``new_col`` is added
    and the two source columns are dropped.

    Returns:
        The same ``df`` object that was passed in.
    """
    logger.info(f'→ Combining "{date_col}" + "{time_col}" → "{new_col}"')

    date_text = df[date_col].astype(str).str.strip()
    time_text = df[time_col].astype(str).str.strip()
    df[new_col] = pd.to_datetime(date_text + ' ' + time_text, errors='coerce')

    unparsed = df[new_col].isna().sum()
    if unparsed:
        logger.warning(f'    {unparsed} invalid "{new_col}" generated from {date_col}/{time_col}')

    # The source columns are redundant once the combined timestamp exists.
    df.drop(columns=[date_col, time_col], inplace=True)
    return df


def handle_resolution(df: pd.DataFrame, col: str, logger: logging.Logger, max_dim: int = 10000):
    """Split a 'WIDTHxHEIGHT' string column into two numeric columns.

    The frame is modified in place: ``screen_width`` and ``screen_height`` are
    added and ``col`` is dropped. Missing values in ``col`` are treated as
    ``'0x0'`` (so they yield 0/0, not NaN). Parts that are not numeric become
    NaN, and rows where either dimension exceeds ``max_dim`` get NaN in both
    dimensions.

    Args:
        df: Frame containing ``col``.
        col: Name of the column holding the resolution strings.
        logger: Logger that receives progress and warning messages.
        max_dim: Largest width/height considered plausible.

    Returns:
        The same ``df`` object that was passed in.
    """
    logger.info(f'→ Parsing resolution from "{col}"')
    # Missing values are replaced with '0x0' before parsing.
    raw = df[col].fillna('0x0').astype(str)

    # Split on the first 'x' only and force exactly two result columns, so the
    # parse cannot fail when no value contains 'x' (one column) or when some
    # value contains several (extra columns); such parts simply become NaN.
    parts = raw.str.split('x', n=1, expand=True).reindex(columns=[0, 1])
    df['screen_width'] = pd.to_numeric(parts[0], errors='coerce')
    df['screen_height'] = pd.to_numeric(parts[1], errors='coerce')

    # Report how many values could not be parsed.
    nan_w = df['screen_width'].isna().sum()
    nan_h = df['screen_height'].isna().sum()
    logger.info(f'    screen_width NaNs: {nan_w}, screen_height NaNs: {nan_h}')

    # Blank out implausibly large resolutions.
    extreme = (df['screen_width'] > max_dim) | (df['screen_height'] > max_dim)
    n_extreme = extreme.sum()
    if n_extreme:
        logger.warning(f'    {n_extreme} extreme resolutions reset to NaN')
        # Series.mask upcasts integer columns to float as needed, unlike a
        # .loc assignment of NaN into an int64 column.
        for dim in ('screen_width', 'screen_height'):
            df[dim] = df[dim].mask(extreme)

    df.drop(col, axis=1, inplace=True)
    return df


def drop_duplicates(df: pd.DataFrame, subset: list, logger: logging.Logger):
    """Remove rows that repeat the key columns listed in ``subset``.

    The input frame is left untouched. The number of removed rows is logged
    only when at least one row was dropped.

    Returns:
        A new frame without the duplicated rows.
    """
    deduplicated = df.drop_duplicates(subset=subset)
    removed = len(df) - len(deduplicated)
    if removed:
        logger.info(f'    Dropped {removed} duplicates by {subset}')
    return deduplicated


def convert_dtypes(df: pd.DataFrame, logger: logging.Logger):
    """Convert object columns with few distinct values to ``category``.

    A column is converted when its number of unique values is below half of
    the row count. The frame is modified in place and each conversion is
    logged.

    Returns:
        The same ``df`` object that was passed in.
    """
    # The row count does not change during the loop, so compute the limit once.
    limit = df.shape[0] * 0.5
    for column in df.select_dtypes(include='object').columns:
        distinct = df[column].nunique()
        if distinct >= limit:
            continue
        df[column] = df[column].astype('category')
        logger.info(f'    "{column}" → category ({distinct} unique)')
    return df


def create_target(sessions: pd.DataFrame, hits: pd.DataFrame, logger: logging.Logger, actions: list = None):
    """Attach a binary ``target`` column to sessions based on hit events.

    A session gets ``target == 1`` when at least one of its hits has an
    ``event_action`` listed in ``actions``; otherwise, including sessions that
    have no hits at all, it gets ``0``.

    Side effect: a boolean ``is_target`` column is written into ``hits``.

    Args:
        sessions: Frame with a ``session_id`` column.
        hits: Frame with ``session_id`` and ``event_action`` columns.
        logger: Logger that receives progress messages.
        actions: Event actions counted as target events; defaults to
            ``['sub_button_click', 'quiz_show', 'start_chat']``.

    Returns:
        A new sessions frame (result of a left merge) with an integer
        ``target`` column.
    """
    if actions is None:
        actions = ['sub_button_click', 'quiz_show', 'start_chat']
    logger.info(f'→ Creating target for actions: {actions}')

    hits['is_target'] = hits['event_action'].isin(actions)
    dist = hits['is_target'].value_counts(normalize=True).to_dict()
    logger.info(f'    Target ratio in hits: {dist}')

    # One row per session: 1 if any of its hits is a target event.
    tgt = (
        hits
        .groupby('session_id')['is_target']
        .any()
        .astype(int)
        .rename('target')
        .reset_index()
    )
    sessions = sessions.merge(tgt, on='session_id', how='left')

    # Sessions without hits come out of the left merge as NaN. Assign the
    # filled column back explicitly: `sessions['target'].fillna(0, inplace=True)`
    # acts on a temporary under pandas Copy-on-Write and would leave the NaNs.
    # Casting keeps the dtype integer whether or not any NaN was present.
    sessions['target'] = sessions['target'].fillna(0).astype(int)
    return sessions


def main():
    """Run the preprocessing pipeline end to end.

    Steps: load the raw tables, log a summary of each, build timestamp
    columns, parse the screen resolution, drop duplicate keys, derive the
    session target, convert low-cardinality object columns to categories and
    save both frames as pickles in the processed directory.
    """
    logger = setup_logger()
    paths = setup_paths()
    sessions, hits = load_data(paths, logger)

    # 1) Initial inspection of both tables
    for label, frame in (('Sessions', sessions), ('Hits', hits)):
        inspect_df(frame, label, logger)

    # 2) Date + time -> single timestamp column
    sessions = process_datetime(sessions, 'visit_date', 'visit_time', 'visit_ts', logger)
    hits = process_datetime(hits, 'hit_date', 'hit_time', 'hit_ts', logger)

    # 3) Screen resolution -> numeric width / height
    sessions = handle_resolution(sessions, 'device_screen_resolution', logger)

    # 4) Duplicate removal by the natural key of each table
    sessions = drop_duplicates(sessions, ['session_id'], logger)
    hits = drop_duplicates(hits, ['session_id', 'hit_number'], logger)

    # 5) Target variable
    sessions = create_target(sessions, hits, logger)

    # 6) Dtype optimisation
    sessions = convert_dtypes(sessions, logger)
    hits = convert_dtypes(hits, logger)

    # 7) Persist the results
    processed_dir = paths['processed']
    out_s = processed_dir / 'processed_sessions.pkl'
    out_h = processed_dir / 'processed_hits.pkl'
    for frame, destination in ((sessions, out_s), (hits, out_h)):
        frame.to_pickle(destination)
    logger.info(f'✔ Saved processed data: {out_s}, {out_h}')
    logger.info('✅ Preprocessing completed.')


# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main()


[2025-04-27 17:01:20,551] INFO: ✔ Loaded "ga_sessions" from PKL, shape=(1860042, 18)
[2025-04-27 17:03:11,034] INFO: ✔ Loaded "ga_hits" from PKL, shape=(15726470, 11)
[2025-04-27 17:03:11,049] INFO: --- Inspecting "Sessions" ---
[2025-04-27 17:03:11,052] INFO: Shape: (1860042, 18)
[2025-04-27 17:03:11,055] INFO: Dtypes:
[2025-04-27 17:03:11,068] INFO:     session_id: object
[2025-04-27 17:03:11,068] INFO:     client_id: object
[2025-04-27 17:03:11,069] INFO:     visit_date: object
[2025-04-27 17:03:11,070] INFO:     visit_time: object
[2025-04-27 17:03:11,071] INFO:     visit_number: int64
[2025-04-27 17:03:11,072] INFO:     utm_source: object
[2025-04-27 17:03:11,072] INFO:     utm_medium: object
[2025-04-27 17:03:11,072] INFO:     utm_campaign: object
[2025-04-27 17:03:11,072] INFO:     utm_adcontent: object
[2025-04-27 17:03:11,073] INFO:     utm_keyword: object
[2025-04-27 17:03:11,073] INFO:     device_category: object
[2025-04-27 17:03:11,073] INFO:     device_os: object
[2025-04