# S01 — EDA: inDrive Geotracks (Astana)

Этот ноутбук выполняет полный первичный анализ данных геотреков:
- загрузка и базовая проверка качества,
- описательная статистика,
- пространственные визуализации (scatter/hexbin),
- распределения скорости/высоты,
- анализ направлений (азимут).

In [None]:
# --- Imports & setup ---
import os, math
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Matplotlib settings: render figures at a higher resolution.
plt.rcParams["figure.dpi"] = 150

# Location of the raw data file (replace with an absolute path if needed).
DATA_PATH = "../data/raw/geo_locations_astana_hackathon"
print("Using data from:", DATA_PATH)

# Column layout that the rest of the notebook refers to by name.
exp = ["randomized_id", "lat", "lng", "alt", "spd", "azm"]

# Baseline parser options: comma-separated, C engine, explicit dtypes,
# and the first row treated as a header.
read_kwargs = {
    "sep": ",",
    "engine": "c",
    "dtype": {
        "randomized_id": "int64",
        "lat": "float64",
        "lng": "float64",
        "alt": "float64",
        "spd": "float64",
        "azm": "float64",
    },
    "header": 0,
}

# Probe the first few rows to decide whether the file carries a header row.
# Any failure while probing is treated the same as "no usable header".
try:
    sample = pd.read_csv(DATA_PATH, nrows=10, **read_kwargs)
    header_ok = list(sample.columns[:6]) == exp
except Exception:
    header_ok = False

if not header_ok:
    # Fall back to a header-less parse with the expected names applied.
    read_kwargs["header"] = None
    read_kwargs["names"] = exp

# Full load with whichever header setting the probe selected.
df = pd.read_csv(DATA_PATH, **read_kwargs)
df.head(3)

## 1. Shape, memory, missing, duplicates

In [None]:
# Frame dimensions and deep memory footprint (object contents included), in MiB.
n_rows = len(df)
n_cols = len(df.columns)
memory_mb = df.memory_usage(deep=True).sum() / 2**20
print(f"Rows: {n_rows:,}  |  Cols: {n_cols}  |  Memory: {memory_mb:.2f} MB")

# Count of rows that are exact duplicates of an earlier row.
dups = df.duplicated().sum()

# Per-column count of missing values, shown as a one-column table.
missing = df.isnull().sum()
missing.name = "missing"
display(missing.to_frame())
print("Duplicate full rows:", dups)

## 2. Basic ranges and sanity checks

In [None]:
# Short aliases for the columns inspected in this cell.
lat_col = df["lat"]
lng_col = df["lng"]
spd_col = df["spd"]
azm_col = df["azm"]

# Observed coordinate extent.
lat_min = lat_col.min()
lat_max = lat_col.max()
lng_min = lng_col.min()
lng_max = lng_col.max()
print(f"Latitude range: [{lat_min:.6f}, {lat_max:.6f}]")
print(f"Longitude range: [{lng_min:.6f}, {lng_max:.6f}]")

# Row counts violating simple sanity bounds. Comparisons against NaN are
# False, so missing values are never counted as violations.
neg_speed = int(spd_col.lt(0).sum())
high_speed = int(spd_col.gt(60).sum())  # threshold of 60, labelled m/s in the printout below
invalid_lat = int((lat_col.lt(-90) | lat_col.gt(90)).sum())
invalid_lng = int((lng_col.lt(-180) | lng_col.gt(180)).sum())
invalid_azm = int((azm_col.lt(0) | azm_col.ge(360)).sum())

print("Negative speeds:", neg_speed)
print("Speeds > 60 m/s:", high_speed)
print("Invalid lat:", invalid_lat, " | Invalid lng:", invalid_lng, " | Invalid azm:", invalid_azm)

## 3. Descriptive stats (altitude, speed, azimuth)

In [None]:
def describe_series(s):
    """Summarise a numeric Series with extremes, percentiles, mean and std.

    Args:
        s: pandas Series to summarise.

    Returns:
        A pandas Series indexed, in order, by ``min``, ``p01``, ``p05``,
        ``p25``, ``p50``, ``p75``, ``p95``, ``p99``, ``max``, ``mean``
        and ``std``.
    """
    # Label -> quantile level, listed in the order they appear in the result.
    percentile_levels = {
        "p01": 0.01,
        "p05": 0.05,
        "p25": 0.25,
        "p50": 0.5,
        "p75": 0.75,
        "p95": 0.95,
        "p99": 0.99,
    }
    # One quantile call covers every requested level.
    quantiles = s.quantile(list(percentile_levels.values()))

    stats = {"min": s.min()}
    stats.update(
        {label: quantiles.loc[level] for label, level in percentile_levels.items()}
    )
    stats["max"] = s.max()
    stats["mean"] = s.mean()
    stats["std"] = s.std()
    return pd.Series(stats)

# Summary table for the "alt" column.
alt_stats = describe_series(df["alt"])
display(alt_stats.to_frame("alt"))

# Summary table for the "spd" column.
spd_stats = describe_series(df["spd"])
display(spd_stats.to_frame("spd"))

# Summary table for "azm" after wrapping values modulo 360.
azm_stats = describe_series(df["azm"].mod(360))
display(azm_stats.to_frame("azm"))

## 4. Points per randomized_id

In [None]:
# Number of rows recorded for each randomized_id (NaN ids kept as a group).
id_counts = df["randomized_id"].value_counts(dropna=False)

# Percentiles of the per-id row counts, computed in a single call.
count_pcts = id_counts.quantile([0.25, 0.5, 0.75, 0.95])

summary = pd.Series({
    "unique_ids": len(id_counts),
    "min": id_counts.min(),
    "p25": count_pcts.loc[0.25],
    "median": count_pcts.loc[0.5],
    "p75": count_pcts.loc[0.75],
    "p95": count_pcts.loc[0.95],
    "max": id_counts.max(),
})
display(summary.to_frame("points_per_id"))

# First ten entries of the counts (value_counts sorts descending by default).
top_ids = id_counts.head(10)
top_ids.rename_axis("randomized_id").to_frame("points")

## 5. Spatial visuals (scatter & hexbin)

In [None]:
import matplotlib.pyplot as plt

# 5.1 Scatter of a reproducible sample (capped at 1.2M rows)
sample_n = min(len(df), 1200000)
if len(df) > sample_n:
    sample_df = df.sample(n=sample_n, random_state=42)
else:
    # Frame already fits under the cap: plot every row.
    sample_df = df

plt.figure(figsize=(64, 64))
plt.scatter(sample_df["lng"], sample_df["lat"], s=2)
plt.xlabel("Longitude", fontsize=64)
plt.ylabel("Latitude", fontsize=64)
plt.title("Scatter of Coordinates (sample)")
plt.tick_params(axis="both", labelsize=48)
plt.show()

# 5.2 Hexbin density over all rows, with log-scaled bin counts
plt.figure(figsize=(75, 75))
hb = plt.hexbin(
    df["lng"].values,
    df["lat"].values,
    gridsize=1300,
    bins="log",
)
plt.xlabel("Longitude", fontsize=64)
plt.ylabel("Latitude", fontsize=64)
plt.title("Spatial Density (hexbin, log scale)")
plt.tick_params(axis="both", labelsize=48)
plt.show()

## 6. Distributions: speed & altitude (clipped 1–99%)

In [None]:
# Speed: pull values outside the 1st-99th percentile band onto its edges
# so extreme values do not stretch the x-axis.
spd_lo, spd_hi = df["spd"].quantile([0.01, 0.99])
spd_clip = df["spd"].clip(lower=spd_lo, upper=spd_hi)

plt.figure(figsize=(7, 4))
plt.hist(spd_clip, bins=80)
plt.xlabel("Speed (m/s) [clipped 1–99%]")
plt.ylabel("Count")
plt.title("Speed Distribution (clipped)")
plt.show()

# Altitude: same percentile clipping and binning as for speed.
alt_lo, alt_hi = df["alt"].quantile([0.01, 0.99])
alt_clip = df["alt"].clip(lower=alt_lo, upper=alt_hi)

plt.figure(figsize=(7, 4))
plt.hist(alt_clip, bins=80)
plt.xlabel("Altitude (m) [clipped 1–99%]")
plt.ylabel("Count")
plt.title("Altitude Distribution (clipped)")
plt.show()

## 7. Direction analysis (Azimuth rose)

In [None]:
# Wrap azimuths into [0, 360) and convert to radians for the polar axes.
az = (df["azm"] % 360).values
theta = np.deg2rad(az)

# Histogram over the full circle: 36 bins, i.e. 10 degrees per bar.
n_bins = 36
counts, bin_edges = np.histogram(theta, bins=n_bins, range=(0, 2 * np.pi))
width = (2 * np.pi) / n_bins
centers = (bin_edges[:-1] + bin_edges[1:]) / 2.0

plt.figure(figsize=(6, 6))
ax = plt.subplot(111, polar=True)

# Matplotlib polar axes default to 0 at east, increasing counter-clockwise.
# An azimuth is measured clockwise from north, so reorient the axes to put
# 0 at the top and increase clockwise; otherwise the rose is drawn rotated
# and mirrored relative to the map.
# NOTE(review): assumes "azm" is a compass bearing (0 = north, clockwise) —
# confirm against the dataset description.
ax.set_theta_zero_location("N")
ax.set_theta_direction(-1)

ax.bar(centers, counts, width=width, bottom=0.0)
ax.set_title("Azimuth Rose (direction histogram)")
plt.show()