# **Практика 1**

In [2]:
!pip install requests-cache

Collecting requests-cache
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting cattrs>=22.2 (from requests-cache)
  Downloading cattrs-25.2.0-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache)
  Downloading url_normalize-2.2.1-py3-none-any.whl.metadata (5.6 kB)
Downloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cattrs-25.2.0-py3-none-any.whl (70 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.0/70.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading url_normalize-2.2.1-py3-none-any.whl (14 kB)
Installing collected packages: url-normalize, cattrs, requests-cache
Successfully installed cattrs-25.2.0 requests-cache-1.2.1 url-normalize-2.2.1


In [3]:
!pip install retry-requests

Collecting retry-requests
  Downloading retry_requests-2.0.0-py3-none-any.whl.metadata (2.6 kB)
Downloading retry_requests-2.0.0-py3-none-any.whl (15 kB)
Installing collected packages: retry-requests
Successfully installed retry-requests-2.0.0


In [4]:
import requests
import pandas as pd
import numpy as np
import requests_cache
from retry_requests import retry
import logging
from typing import Optional, List, Dict, Tuple
from datetime import datetime, timedelta
import json
from urllib.parse import urlparse, parse_qs
import re
import os

# Временной ряд
# https://open-meteo.com/
# https://open-meteo.com/en/docs?hourly=temperature_2m,wind_speed_10m,wind_speed_80m,wind_speed_120m,wind_speed_180m,wind_direction_10m,wind_direction_80m,wind_direction_180m,wind_gusts_10m,wind_direction_120m,precipitation,relative_humidity_2m,weather_code,snowfall,snow_depth,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,rain,showers,vapour_pressure_deficit,apparent_temperature,dew_point_2m&timezone=Europe%2FMoscow&latitude=55.7522&longitude=#hourly_weather_variables

# Root logging configuration: INFO and above go both to a UTF-8 log file
# ('storm_warning_system.log') and to a stream handler (console output).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('storm_warning_system.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
# Module-level logger shared by every function of the weather pipeline
logger = logging.getLogger(__name__)

# Configure HTTP response caching and automatic retries
cache_session = requests_cache.CachedSession('.cache', expire_after=86400)  # cached entries expire after 1 day (86400 s)
# Wrap the cached session so requests are retried (5 retries, backoff factor 0.3)
retry_session = retry(cache_session, retries=5, backoff_factor=0.3)


def parse_parameters_from_url(url: str) -> Tuple[Optional[float], Optional[float], List[str], Optional[str]]:
    """Extract request parameters from the query string of an Open-Meteo docs URL.

    Args:
        url: URL whose query string may contain ``latitude``, ``longitude``,
            ``hourly`` (comma-separated variable names) and ``timezone``.

    Returns:
        A tuple ``(latitude, longitude, hourly_params, timezone)``:

        * a coordinate is ``None`` when it is absent, blank or not a valid number
          (a malformed coordinate no longer discards the other parsed values);
        * ``hourly_params`` is a list of non-empty variable names (``[]`` if absent);
        * ``timezone`` is the decoded value or ``None``.

        If the URL cannot be parsed at all, ``(None, None, [], None)`` is
        returned; the function does not raise.
    """
    try:
        query = parse_qs(urlparse(url).query)

        def first_value(key: str) -> Optional[str]:
            # parse_qs maps every key to a list and drops blank values by default
            values = query.get(key)
            return values[0] if values else None

        def coordinate(key: str) -> Optional[float]:
            raw = first_value(key)
            if raw is None:
                return None
            try:
                return float(raw)
            except ValueError:
                logger.warning(f"Некорректное значение {key} в URL: {raw!r}")
                return None

        latitude = coordinate('latitude')
        longitude = coordinate('longitude')

        # Drop empty names produced by stray commas ("a,,b") so they never reach the API
        hourly_value = first_value('hourly') or ""
        hourly_params = [name.strip() for name in hourly_value.split(',') if name.strip()]

        # parse_qs has already percent-decoded the value ("Europe%2FMoscow" ->
        # "Europe/Moscow"); decoding it a second time would corrupt values that
        # legitimately contain a '%' sequence.
        timezone = first_value('timezone')

        logger.info(f"Извлечено из URL: lat={latitude}, lon={longitude}, tz={timezone}")
        logger.info(f"Параметры hourly: {hourly_params}")

        return latitude, longitude, hourly_params, timezone

    except Exception as e:
        # Best-effort parser: callers fall back to defaults when nothing is returned
        logger.error(f"Ошибка при парсинге URL: {e}")
        return None, None, [], None


def get_historical_weather(
        latitude: float,
        longitude: float,
        start_date: str,
        end_date: str,
        hourly_params: Optional[List[str]] = None,
        timezone: str = "Europe/Moscow"
) -> Optional[pd.DataFrame]:
    """Download hourly historical weather from the Open-Meteo archive API.

    Args:
        latitude: Latitude sent to the API.
        longitude: Longitude sent to the API.
        start_date: First day of the requested range (passed through as-is,
            e.g. ``'2022-01-01'``).
        end_date: Last day of the requested range (passed through as-is).
        hourly_params: Hourly variable names to request. When ``None`` or empty,
            a built-in default list of variables is used.
        timezone: Timezone name sent to the API.

    Returns:
        A DataFrame indexed by ``time`` (datetime) with one numeric column per
        returned variable (unparseable values become NaN), or ``None`` when the
        request fails, the API reports an error, or no hourly data is returned.
        Failures are logged; the function does not raise.
    """
    url = "https://archive-api.open-meteo.com/v1/archive"

    # Default variable set, used only when the caller supplies no hourly_params
    params_from_url = [
        "temperature_2m", "wind_speed_10m", "wind_speed_80m", "wind_speed_120m",
        "wind_speed_180m", "wind_direction_10m", "wind_direction_80m",
        "wind_direction_180m", "wind_gusts_10m", "wind_direction_120m",
        "precipitation", "relative_humidity_2m", "pressure_msl", "weather_code", "snowfall",
        "snow_depth", "cloud_cover", "cloud_cover_low", "cloud_cover_mid",
        "cloud_cover_high", "rain", "showers", "vapour_pressure_deficit",
        "apparent_temperature", "dew_point_2m", "et0_fao_evapotranspiration"
    ]

    final_params = hourly_params if hourly_params else params_from_url

    params = {
        "latitude": latitude,
        "longitude": longitude,
        "start_date": start_date,
        "end_date": end_date,
        "hourly": ",".join(final_params),
        "timezone": timezone
    }

    try:
        logger.info(f"Запрос к API: {url}")
        logger.info(f"Параметры: {params}")

        response = retry_session.get(url, params=params, timeout=60)
        response.raise_for_status()

        data = response.json()

        if data.get("error"):
            # 'reason' may be absent; .get keeps the real API failure from being
            # re-reported as an "unknown error" caused by a KeyError here.
            logger.error(f"API вернул ошибку: {data.get('reason', 'причина не указана')}")
            return None

        hourly = data.get("hourly")
        if not hourly:
            logger.warning("Нет данных в ключе 'hourly'")
            return None

        if "time" not in hourly:
            logger.error("В ответе API отсутствует столбец 'time'")
            return None

        df = pd.DataFrame(hourly)
        df["time"] = pd.to_datetime(df["time"])
        df = df.set_index("time")

        # Force every variable to a numeric dtype; unparseable values become NaN
        df = df.apply(pd.to_numeric, errors='coerce')

        logger.info(f"Успешно загружено: {len(df)} записей, {len(df.columns)} переменных")
        logger.info(f"Столбцы: {list(df.columns)}")

        return df

    except requests.exceptions.HTTPError as e:
        logger.error(f"HTTP ошибка: {e}")
        if hasattr(e, 'response') and e.response is not None:
            logger.error(f"Response content: {e.response.text}")
        return None
    except requests.exceptions.Timeout:
        logger.error("Таймаут запроса к API")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"Ошибка соединения: {e}")
        return None
    except Exception as e:
        # Last-resort guard so a malformed payload never crashes the pipeline
        logger.error(f"Неизвестная ошибка: {e}")
        return None


def create_storm_labels(
        df: pd.DataFrame,
        wind_threshold: float = 15.0,
        gust_threshold: float = 25.0,
        precip_threshold: float = 7.0,
        pressure_drop_window: int = 3,
        pressure_drop_threshold: float = 4.0,
        include_weather_code: bool = True,
        cloud_cover_threshold: Optional[float] = 80.0,
        storm_codes: Optional[List[int]] = None
) -> pd.DataFrame:
    """Add a binary ``is_storm`` column marking rows that meet any storm criterion.

    A row is labelled 1 when at least one of the following holds (criteria whose
    columns are absent are skipped; NaN values never satisfy a criterion):

    1. any ``wind_speed_*`` column is >= ``wind_threshold``;
    2. ``wind_gusts_10m`` is >= ``gust_threshold``;
    3. ``precipitation``, ``rain`` or ``showers`` is >= ``precip_threshold``;
    4. pressure (``pressure_msl``, else ``surface_pressure``) fell by more than
       ``pressure_drop_threshold`` compared with ``pressure_drop_window`` rows earlier;
    5. ``weather_code`` is one of ``storm_codes`` (only if ``include_weather_code``);
    6. any ``cloud_cover*`` column is >= ``cloud_cover_threshold``.

    Args:
        df: Weather frame; it is not modified (a copy is labelled and returned).
        wind_threshold: Lower bound for criterion 1, in the unit of the wind columns.
        gust_threshold: Lower bound for criterion 2.
        precip_threshold: Lower bound for criterion 3.
        pressure_drop_window: Number of rows to look back for criterion 4.
        pressure_drop_threshold: Pressure drop that must be exceeded for criterion 4.
        include_weather_code: Enables criterion 5.
        cloud_cover_threshold: Lower bound (percent) for criterion 6; ``None``
            disables the cloud-cover criterion. Defaults to the previously
            hard-coded value of 80.
        storm_codes: Weather codes treated as storms; ``None`` selects the
            previously hard-coded set ``[65, 75, 82, 85, 86, 95, 96, 99]``.

    Returns:
        A copy of ``df`` with the extra integer column ``is_storm``. If ``df`` is
        ``None`` or empty it is returned unchanged (without the column).
    """
    if df is None or df.empty:
        return df

    if storm_codes is None:
        storm_codes = [65, 75, 82, 85, 86, 95, 96, 99]

    df = df.copy()
    storm_conditions = pd.Series(False, index=df.index)

    # 1. Strong wind at any measured height
    for wind_col in [col for col in df.columns if col.startswith('wind_speed_')]:
        storm_conditions |= (df[wind_col] >= wind_threshold)

    # 2. Wind gusts
    if "wind_gusts_10m" in df.columns:
        storm_conditions |= (df["wind_gusts_10m"] >= gust_threshold)

    # 3. Heavy precipitation
    for precip_col in ("precipitation", "rain", "showers"):
        if precip_col in df.columns:
            storm_conditions |= (df[precip_col] >= precip_threshold)

    # 4. Rapid pressure drop: the first available pressure column is used
    pressure_col = next(
        (col for col in ("pressure_msl", "surface_pressure") if col in df.columns),
        None
    )
    if pressure_col:
        delta_p = df[pressure_col].diff(periods=pressure_drop_window)
        # Rows without a full look-back window are NaN and compare as False
        storm_conditions |= (delta_p < -pressure_drop_threshold)

    # 5. Hazardous weather codes
    if include_weather_code and "weather_code" in df.columns:
        storm_conditions |= df["weather_code"].isin(storm_codes)

    # 6. Heavy cloud cover (total or any layer)
    if cloud_cover_threshold is not None:
        for cloud_col in [col for col in df.columns if col.startswith('cloud_cover')]:
            storm_conditions |= (df[cloud_col] >= cloud_cover_threshold)

    df["is_storm"] = storm_conditions.astype(int)
    storm_count = df["is_storm"].sum()
    logger.info(f"Создано меток шторма: {storm_count} из {len(df)} ({storm_count / len(df) * 100:.2f}%)")

    return df


def save_results(df: pd.DataFrame, filename: str, metadata: Optional[Dict] = None):
    """Write ``df`` as CSV into the ``data/`` directory, plus optional JSON metadata.

    Args:
        df: Frame to save; written with its index.
        filename: File name inside ``data/`` (the directory is created if needed).
        metadata: Optional JSON-serialisable dict. When given (and non-empty) it is
            written next to the CSV as ``<filename without extension>_metadata.json``.

    Returns:
        None. Any error is logged and swallowed (best-effort save).
    """
    try:
        os.makedirs('data', exist_ok=True)
        filepath = f"data/{filename}"

        df.to_csv(filepath, encoding='utf-8')
        logger.info(f"Данные сохранены: {filepath}")

        if metadata:
            # Derive the metadata path from the extension-less name. The previous
            # str.replace('.csv', ...) left the path unchanged for any filename
            # without '.csv', so the JSON overwrote the data file just written.
            stem, _extension = os.path.splitext(filepath)
            metapath = f"{stem}_metadata.json"
            with open(metapath, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)
            logger.info(f"Метаданные сохранены: {metapath}")

    except Exception as e:
        logger.error(f"Ошибка при сохранении: {e}")


def main():
    """Run the weather pipeline: parse the source URL, download, label and save.

    Coordinates and timezone are taken from ``source_url``; Moscow defaults are
    used (with a warning) when they cannot be extracted. The labelled frame is
    saved as ``storm_data.csv`` together with a metadata dict describing the
    request and the labelling criteria. Returns ``None``; stops early (after
    logging an error) when no data could be downloaded.
    """
    source_url = (
        "https://open-meteo.com/en/docs?hourly=temperature_2m,wind_speed_10m,wind_speed_80m,"
        "wind_speed_120m,wind_speed_180m,wind_direction_10m,wind_direction_80m,wind_direction_180m,"
        "wind_gusts_10m,wind_direction_120m,precipitation,relative_humidity_2m,pressure_msl,weather_code,"
        "snowfall,snow_depth,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,rain,"
        "showers,vapour_pressure_deficit,apparent_temperature,dew_point_2m,et0_fao_evapotranspiration&timezone=Europe%2FMoscow&"
        "latitude=55.7522&longitude=37.6156"
    )
    start_date, end_date = '2022-01-01', '2025-09-18'

    latitude, longitude, hourly_params, timezone = parse_parameters_from_url(source_url)

    # Fall back to Moscow when either coordinate could not be extracted
    coordinates_missing = latitude is None or longitude is None
    if coordinates_missing:
        latitude, longitude = 55.7522, 37.6156
        logger.warning("Используются координаты по умолчанию: Москва")

    if not timezone:
        timezone = "Europe/Moscow"
        logger.warning("Используется временная зона по умолчанию: Europe/Moscow")

    startup_messages = (
        "Запуск парсера погодных данных Open-Meteo",
        f"Координаты: {latitude}, {longitude} | Таймзона: {timezone}",
        f"Период: {start_date} — {end_date}",
        f"Запрашиваемые параметры: {hourly_params}",
    )
    for message in startup_messages:
        logger.info(message)

    df = get_historical_weather(
        latitude=latitude,
        longitude=longitude,
        start_date=start_date,
        end_date=end_date,
        hourly_params=hourly_params,
        timezone=timezone,
    )

    if df is None or df.empty:
        logger.error("Не удалось получить данные или данные пустые.")
        return

    # Labelling options are declared once and reused for the metadata below
    label_options = {
        "wind_threshold": 15,
        "gust_threshold": 25,
        "precip_threshold": 7,
        "pressure_drop_window": 3,
        "pressure_drop_threshold": 4.0,
        "include_weather_code": True,
    }
    labeled_df = create_storm_labels(df, **label_options)

    storm_criteria = {
        "wind_speed_* >= km/h": label_options["wind_threshold"],
        "wind_gusts_10m >= km/h": label_options["gust_threshold"],
        "precipitation/rain/showers >= mm/h": label_options["precip_threshold"],
        "pressure_drop >= hPa/3h": label_options["pressure_drop_threshold"],
        "storm_weather_codes": [65, 75, 82, 85, 86, 95, 96, 99],
        "cloud_cover >= %": 80,
    }
    run_metadata = {
        "source_url": source_url,
        "location": {"lat": latitude, "lon": longitude},
        "timezone": timezone,
        "date_range": {"start": start_date, "end": end_date},
        "hourly_params": hourly_params,
        "storm_criteria": storm_criteria,
        "generated_at": datetime.now().isoformat(),
        "total_rows": len(labeled_df),
        "storm_events_count": int(labeled_df["is_storm"].sum()),
        "columns_list": list(labeled_df.columns),
    }
    save_results(labeled_df, "storm_data.csv", metadata=run_metadata)

    storm_flags = labeled_df['is_storm']
    logger.info(f"Общее количество записей: {len(labeled_df)}")
    logger.info(f"Количество штормовых событий: {storm_flags.sum()}")
    logger.info(f"Доля штормовых событий: {storm_flags.mean() * 100:.2f}%")
    logger.info(f"Период данных: от {labeled_df.index.min()} до {labeled_df.index.max()}")

    logger.info("Готово: данные успешно получены и сохранены")


# Run the weather pipeline when this code is executed as the main module
if __name__ == "__main__":
    main()


In [5]:
import requests
import time
import csv
from typing import List, Dict, Any

# Многомерный ряд
# https://pokeapi.co

BASE_URL = "https://pokeapi.co/api/v2"


def get_json(url: str) -> Dict[str, Any]:
    """GET ``url`` (10 s timeout) and return the decoded JSON body.

    Any ``requests`` error (connection problem, timeout, HTTP error status, ...)
    is printed and turned into an empty dict, so callers can simply test the
    result for truthiness.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        payload = response.json()
    except requests.exceptions.RequestException as error:
        print(f"Ошибка при запросе {url}: {error}")
        payload = {}
    return payload


def get_pokemon_list(limit: int = 100, offset: int = 0) -> List[Dict[str, Any]]:
    """Return one page of the Pokémon index (the ``results`` entries).

    ``limit`` and ``offset`` are forwarded as query parameters; an empty list is
    returned when the response has no ``results`` key.
    """
    listing = get_json(f"{BASE_URL}/pokemon?limit={limit}&offset={offset}")
    return listing.get("results", [])


def get_encounter_locations(pokemon_url: str, max_locations: int = 10) -> str:
    """Return the encounter location areas of a Pokémon as one comma-separated string.

    Args:
        pokemon_url: URL of the Pokémon resource; a trailing slash is tolerated.
        max_locations: Maximum number of area names to include (names are
            de-duplicated and sorted alphabetically before truncation).

    Returns:
        ``"area-a, area-b, ..."`` or ``""`` when the Pokémon has no encounters,
        the request failed, or the payload is not the expected list.
    """
    # Avoid producing ".../pokemon/1//encounters" for URLs that end with '/'
    location_url = pokemon_url.rstrip("/") + "/encounters"

    encounters = get_json(location_url)
    if not isinstance(encounters, list):
        # get_json returns {} on failure; anything but a list carries no encounters
        return ""

    names = set()
    for entry in encounters:
        area = entry.get("location_area") if isinstance(entry, dict) else None
        if isinstance(area, dict) and area:
            names.add(area.get("name") or "unknown")

    return ", ".join(sorted(names)[:max_locations])


def get_evolution_chain(evolution_chain_url: str) -> str:
    """Render a Pokémon evolution chain as text.

    Linear chains are rendered as ``"a -> b -> c"``. When a species has several
    possible evolutions the alternatives are grouped, e.g.
    ``"oddish -> gloom -> (vileplume | bellossom)"``, instead of being joined as
    if they were successive stages.

    Args:
        evolution_chain_url: URL of the evolution-chain resource; may be empty
            or ``None``.

    Returns:
        The rendered chain, or ``""`` when no URL is given or the request failed.
    """
    if not evolution_chain_url:
        return ""

    data = get_json(evolution_chain_url)
    if not data:
        return ""

    def parse_chain(chain):
        # "or {}" / "or []" also cover keys that are present but JSON null
        name = (chain.get("species") or {}).get("name", "")
        branches = [parse_chain(evo) for evo in chain.get("evolves_to") or []]
        if not branches:
            return name
        if len(branches) == 1:
            return f"{name} -> {branches[0]}"
        # Several alternatives: show them side by side, not as a sequence
        return f"{name} -> ({' | '.join(branches)})"

    return parse_chain(data.get("chain") or {})


def _resource_name(container: Dict[str, Any], key: str, default: Any = None) -> Any:
    """Return ``container[key]["name"]``; ``default`` if the entry is missing, null or empty."""
    entry = container.get(key)
    if isinstance(entry, dict) and entry:
        return entry.get("name")
    return default


def get_pokemon_details(pokemon_url: str) -> Dict[str, Any]:
    """Collect a flat record describing one Pokémon.

    The Pokémon resource at ``pokemon_url`` is combined with its species
    resource, its evolution chain and its encounter locations. Nested resources
    that are missing or JSON ``null`` yield ``None`` (``""`` for ``habitat``)
    instead of raising.

    Args:
        pokemon_url: URL of the Pokémon resource.

    Returns:
        A dict with 37 scalar fields (identity, types, abilities, base stats,
        first five moves, sprite URLs, species attributes, evolution chain and
        encounter locations), or ``{}`` when the Pokémon could not be fetched.
    """
    data = get_json(pokemon_url)
    if not data:
        return {}

    species_url = (data.get("species") or {}).get("url")
    species_data = get_json(species_url) if species_url else {}

    stats = {s["stat"]["name"]: s["base_stat"] for s in data.get("stats", [])}

    types = [t["type"]["name"] for t in data.get("types", [])]
    type_1 = types[0] if len(types) > 0 else ""
    type_2 = types[1] if len(types) > 1 else ""

    # Abilities: the hidden one gets its own field, the others fill slots 1 and 2
    abilities = []
    hidden_ability = ""
    for ab in data.get("abilities", []):
        if ab.get("is_hidden"):
            hidden_ability = ab["ability"]["name"]
        else:
            abilities.append(ab["ability"]["name"])
    ability_1 = abilities[0] if len(abilities) > 0 else ""
    ability_2 = abilities[1] if len(abilities) > 1 else ""

    # Keep the first five moves and pad with "" so move_1..move_5 always exist
    moves = [m["move"]["name"] for m in data.get("moves", [])[:5]]
    moves += [""] * (5 - len(moves))

    sprites = data.get("sprites") or {}
    artwork = (sprites.get("other") or {}).get("official-artwork") or {}

    return {
        "id": data.get("id"),
        "name": data.get("name"),
        "base_experience": data.get("base_experience"),
        "height": data.get("height"),
        "weight": data.get("weight"),
        "order": data.get("order"),

        # Types
        "type_1": type_1,
        "type_2": type_2,

        # Abilities
        "ability_1": ability_1,
        "ability_2": ability_2,
        "hidden_ability": hidden_ability,

        # Base stats
        "hp": stats.get("hp", 0),
        "attack": stats.get("attack", 0),
        "defense": stats.get("defense", 0),
        "special_attack": stats.get("special-attack", 0),
        "special_defense": stats.get("special-defense", 0),
        "speed": stats.get("speed", 0),

        # Moves (one column each)
        "move_1": moves[0],
        "move_2": moves[1],
        "move_3": moves[2],
        "move_4": moves[3],
        "move_5": moves[4],

        # Sprites
        "sprite_default": sprites.get("front_default"),
        "sprite_shiny": sprites.get("front_shiny"),
        "sprite_artwork": artwork.get("front_default"),

        # Species
        "color": _resource_name(species_data, "color"),
        "generation": _resource_name(species_data, "generation"),
        "habitat": _resource_name(species_data, "habitat", ""),
        "shape": _resource_name(species_data, "shape"),

        "is_legendary": 1 if species_data.get("is_legendary", False) else 0,
        "is_mythical": 1 if species_data.get("is_mythical", False) else 0,
        "capture_rate": species_data.get("capture_rate"),
        "base_happiness": species_data.get("base_happiness"),
        "growth_rate": _resource_name(species_data, "growth_rate"),
        "egg_groups": ", ".join([eg["name"] for eg in species_data.get("egg_groups") or []]),

        # Evolution
        "evolution_chain": get_evolution_chain((species_data.get("evolution_chain") or {}).get("url")),

        # Locations
        "encounter_locations": get_encounter_locations(pokemon_url),
    }


def parse_all_pokemon(total_limit: int = 100, pause_sec: float = 0.1) -> List[Dict[str, Any]]:
    """Collect detail records for up to ``total_limit`` Pokémon.

    The index is read page by page (at most 100 entries per page, never more
    than are still needed). Each successfully fetched Pokémon is appended and
    echoed to stdout; entries whose details cannot be fetched are skipped. The
    function sleeps ``pause_sec`` seconds after every entry and stops early when
    the index returns an empty page.
    """
    page_size = 100
    collected: List[Dict[str, Any]] = []
    next_offset = 0

    print(f"Начинаем сбор данных о {total_limit} покемонах...")

    while True:
        still_needed = total_limit - len(collected)
        if still_needed <= 0:
            break

        page = get_pokemon_list(limit=min(page_size, still_needed), offset=next_offset)
        if not page:
            break

        for entry in page:
            record = get_pokemon_details(entry["url"])
            if record:
                collected.append(record)
                print(f"{record['id']:>3} | {record['name']:<12} | {record['type_1']}/{record['type_2']}")

            time.sleep(pause_sec)

        # Advance by the page actually received, even if some entries were skipped
        next_offset += len(page)

    print(f"Готово: {len(collected)} покемонов обработано.")
    return collected


def save_to_csv(data: List[Dict[str, Any]], filename: str = "pokemon_data.csv"):
    """Write Pokémon records to ``filename`` as UTF-8 CSV with a fixed column order.

    Nothing is written (a notice is printed instead) when ``data`` is empty.
    Fields missing from a record are left blank by ``csv.DictWriter``.
    """
    if not data:
        print("Нет данных для сохранения.")
        return

    # Column order of the output file, grouped by topic
    fieldnames = (
        ["id", "name", "base_experience", "height", "weight", "order"]
        + ["type_1", "type_2"]
        + ["hp", "attack", "defense", "special_attack", "special_defense", "speed"]
        + ["ability_1", "ability_2", "hidden_ability"]
        + [f"move_{slot}" for slot in range(1, 6)]
        + ["sprite_default", "sprite_shiny", "sprite_artwork"]
        + ["color", "generation", "habitat", "shape"]
        + ["is_legendary", "is_mythical", "capture_rate", "base_happiness"]
        + ["growth_rate", "egg_groups", "evolution_chain", "encounter_locations"]
    )

    with open(filename, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for record in data:
            writer.writerow(record)

    print(f"Данные сохранены в '{filename}' ({len(data)} записей, {len(fieldnames)} столбцов)")


def main():
    """Collect the first 100 Pokémon (0.1 s pause per entry) and save them to ``pokemon_data.csv``."""
    save_to_csv(parse_all_pokemon(total_limit=100, pause_sec=0.1), "pokemon_data.csv")


# Run the Pokémon collection when this code is executed as the main module
if __name__ == "__main__":
    main()



Начинаем сбор данных о 100 покемонах...
  1 | bulbasaur    | grass/poison
  2 | ivysaur      | grass/poison
  3 | venusaur     | grass/poison
  4 | charmander   | fire/
  5 | charmeleon   | fire/
  6 | charizard    | fire/flying
  7 | squirtle     | water/
  8 | wartortle    | water/
  9 | blastoise    | water/
 10 | caterpie     | bug/
 11 | metapod      | bug/
 12 | butterfree   | bug/flying
 13 | weedle       | bug/poison
 14 | kakuna       | bug/poison
 15 | beedrill     | bug/poison
 16 | pidgey       | normal/flying
 17 | pidgeotto    | normal/flying
 18 | pidgeot      | normal/flying
 19 | rattata      | normal/
 20 | raticate     | normal/
 21 | spearow      | normal/flying
 22 | fearow       | normal/flying
 23 | ekans        | poison/
 24 | arbok        | poison/
 25 | pikachu      | electric/
 26 | raichu       | electric/
 27 | sandshrew    | ground/
 28 | sandslash    | ground/
 29 | nidoran-f    | poison/
 30 | nidorina     | poison/
 31 | nidoqueen    | poison/ground
 32

In [6]:
import requests
import pandas as pd
import time
from datetime import datetime, timedelta

# Text datasets
# https://newsapi.org/

import os

# The API key is read from the environment so that the secret is never stored in
# the notebook or its outputs.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked and replaced.
NEWS_API_KEY = os.environ.get("NEWS_API_KEY", "")
if not NEWS_API_KEY:
    print("Переменная окружения NEWS_API_KEY не задана — запросы к NewsAPI завершатся ошибкой авторизации")

NEWS_URL = "https://newsapi.org/v2/everything"

# Search topics; each one is used as the query string of a separate request
CATEGORIES = [
    'sports', 'technology', 'health',
    'business', 'science', 'politics',
    'music', 'environment', 'entertainment',
    'ai', 'cybersecurity',
    'crypto', 'gaming', 'space', 'fashion',
    'travel', 'food', 'books', 'wellness',
    'renewables', 'edtech', 'robotics', 'philanthropy'
]

# Per-topic value of the API's `domains` filter (comma-separated sources)
DOMAINS = {
    'sports': 'espn.com,bbc.com/sport',
    'technology': 'techcrunch.com,engadget.com',
    'health': 'who.int,webmd.com',
    'business': 'reuters.com,bloomberg.com',
    'science': 'sciencemag.org,nature.com',
    'music': 'rollingstone.com, billboard.com, pitchfork.com, nme.com, spin.com',
    'politics': 'reuters.com/politics, politico.com, theguardian.com/world',
    'environment': 'ipcc.ch, grist.org, carbonbrief.org',
    'entertainment': 'variety.com, hollywoodreporter.com',
    'ai': 'syncedreview.com, arxiv.org, towardsdatascience.com',
    'cybersecurity': 'krebsonsecurity.com, therecord.media, darkreading.com',
    'crypto': 'coindesk.com, theblock.co, cointelegraph.com',
    'gaming': 'ign.com, polygon.com, eurogamer.net',
    'space': 'nasa.gov, spacex.com, skyandtelescope.org',
    'fashion': 'vogue.com, wwd.com, businessoffashion.com',
    'travel': 'cntraveler.com, lonelyplanet.com, skyradar.com, travelandleisure.com',
    'food': 'eater.com, bonappetit.com, foodandwine.com, theinfatuation.com',
    'books': 'nytimes.com/books, theguardian.com/books, lrb.co.uk, bookforum.com',
    'wellness': 'goop.com, mindbodygreen.com, well.blogs.nytimes.com, tinyhearts.com',
    'renewables': 'renewableenergyworld.com, greentechmedia.com, insideclimatenews.org',
    'edtech': 'edutopia.org, edsurge.com, timeshighereducation.com/edtech',
    'robotics': 'therobotreport.com, ieee.org/spectrum, robohub.org',
    'philanthropy': 'ssir.org, philanthropy.com, globalgiving.org'
}


def fetch_news_by_category(category, days=30, limit=50):
    """Fetch recent English-language articles about ``category`` from NewsAPI.

    Args:
        category: Topic used as the search query and as the key into ``DOMAINS``
            for the optional source filter.
        days: Size of the look-back window in days, ending today.
        limit: Maximum number of articles to return. At most 100 can be
            returned because a single page is requested.

    Returns:
        A list of flat article dicts (newest first, as ordered by the API), or
        ``[]`` when the request fails; the error is printed, never raised.
    """
    now = datetime.now()

    params = {
        'q': category,
        'from': (now - timedelta(days=days)).strftime('%Y-%m-%d'),
        'to': now.strftime('%Y-%m-%d'),
        'sortBy': 'publishedAt',
        'language': 'en',
        # Ask only for what will be kept (the API accepts 1..100 per page)
        'pageSize': max(1, min(limit, 100)),
        'page': 1,
    }
    domain_filter = DOMAINS.get(category)
    if domain_filter:
        # Send the filter only when one is configured, not as an empty parameter
        params['domains'] = domain_filter

    # The key travels in a header instead of the query string, so it does not
    # end up in the URL that request errors (printed below) include.
    headers = {'X-Api-Key': NEWS_API_KEY}

    try:
        response = requests.get(NEWS_URL, params=params, headers=headers, timeout=15)
        response.raise_for_status()
        data = response.json()

        articles = []
        for item in data.get('articles', []):
            if len(articles) >= limit:
                break
            # A malformed article yields None fields instead of dropping the category
            source = item.get('source') or {}
            articles.append({
                'source_api': 'newsapi.org',
                'category': category,
                'source_id': source.get('id'),
                'source_name': source.get('name'),
                'author': item.get('author'),
                'title': item.get('title'),
                'description': item.get('description'),
                'url': item.get('url'),
                'image_url': item.get('urlToImage'),
                'published_at': item.get('publishedAt'),
                'content': item.get('content'),
                'collected_at': datetime.now().isoformat()
            })
        print(f"Получено {len(articles)} новостей по теме '{category}'")
        return articles
    except Exception as e:
        # Best-effort: one failing topic must not abort the whole collection run
        print(f"Ошибка при загрузке новостей ({category}): {e}")
        return []


# Collect articles for every topic and store them in a single CSV file
print("Сбор текстовых данных (новости)...")
all_articles = []

for category in CATEGORIES:
    articles = fetch_news_by_category(category, days=30, limit=30)
    all_articles.extend(articles)
    time.sleep(1.5)  # pause between requests to stay gentle on the API

if all_articles:
    news_output_path = 'news_data.csv'

    df_news = pd.DataFrame(all_articles)
    df_news['published_at'] = pd.to_datetime(df_news['published_at'])
    df_news = df_news.sort_values(by='published_at', ascending=False)

    df_news.to_csv(news_output_path, index=False)
    # Report the file that was actually written (the message used to name a different file)
    print(f"Сохранено {len(df_news)} новостных записей в {news_output_path}")


Сбор текстовых данных (новости)...
Получено 30 новостей по теме 'sports'
Получено 30 новостей по теме 'technology'
Получено 7 новостей по теме 'health'
Получено 21 новостей по теме 'business'
Получено 1 новостей по теме 'science'
Получено 3 новостей по теме 'politics'
Получено 30 новостей по теме 'music'
Получено 17 новостей по теме 'environment'
Получено 30 новостей по теме 'entertainment'
Получено 16 новостей по теме 'ai'
Получено 1 новостей по теме 'cybersecurity'
Получено 30 новостей по теме 'crypto'
Получено 30 новостей по теме 'gaming'
Получено 30 новостей по теме 'space'
Получено 30 новостей по теме 'fashion'
Получено 11 новостей по теме 'travel'
Получено 30 новостей по теме 'food'
Получено 5 новостей по теме 'books'
Получено 30 новостей по теме 'wellness'
Получено 3 новостей по теме 'renewables'
Получено 1 новостей по теме 'edtech'
Получено 4 новостей по теме 'robotics'
Получено 7 новостей по теме 'philanthropy'
Сохранено 397 новостных записей в textual_news_data.csv


In [8]:
import pandas as pd
# Load the labelled weather CSV from the data/ directory and print its column summary
data1 = pd.read_csv("data/storm_data.csv", sep=",")
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32568 entries, 0 to 32567
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   time                        32568 non-null  object 
 1   temperature_2m              32568 non-null  float64
 2   wind_speed_10m              32568 non-null  float64
 3   wind_speed_80m              0 non-null      float64
 4   wind_speed_120m             0 non-null      float64
 5   wind_speed_180m             0 non-null      float64
 6   wind_direction_10m          32568 non-null  int64  
 7   wind_direction_80m          0 non-null      float64
 8   wind_direction_180m         0 non-null      float64
 9   wind_gusts_10m              32568 non-null  float64
 10  wind_direction_120m         0 non-null      float64
 11  precipitation               32568 non-null  float64
 12  relative_humidity_2m        32568 non-null  int64  
 13  pressure_msl                325

In [9]:
import pandas as pd
# Load the Pokémon CSV and print its column summary
data2 = pd.read_csv("pokemon_data.csv", sep=",")
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 37 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   100 non-null    int64 
 1   name                 100 non-null    object
 2   base_experience      100 non-null    int64 
 3   height               100 non-null    int64 
 4   weight               100 non-null    int64 
 5   order                100 non-null    int64 
 6   type_1               100 non-null    object
 7   type_2               48 non-null     object
 8   hp                   100 non-null    int64 
 9   attack               100 non-null    int64 
 10  defense              100 non-null    int64 
 11  special_attack       100 non-null    int64 
 12  special_defense      100 non-null    int64 
 13  speed                100 non-null    int64 
 14  ability_1            100 non-null    object
 15  ability_2            66 non-null     object
 16  hidden_ab

In [10]:
import pandas as pd
# Load the news CSV and print its column summary
data3 = pd.read_csv("news_data.csv", sep=",")
data3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   source_api    397 non-null    object
 1   category      397 non-null    object
 2   source_id     106 non-null    object
 3   source_name   397 non-null    object
 4   author        386 non-null    object
 5   title         397 non-null    object
 6   description   395 non-null    object
 7   url           397 non-null    object
 8   image_url     397 non-null    object
 9   published_at  397 non-null    object
 10  content       397 non-null    object
 11  collected_at  397 non-null    object
dtypes: object(12)
memory usage: 37.3+ KB


In [11]:
# Display the weather frame (the notebook renders a truncated preview)
data1

Unnamed: 0,time,temperature_2m,wind_speed_10m,wind_speed_80m,wind_speed_120m,wind_speed_180m,wind_direction_10m,wind_direction_80m,wind_direction_180m,wind_gusts_10m,...,cloud_cover_low,cloud_cover_mid,cloud_cover_high,rain,showers,vapour_pressure_deficit,apparent_temperature,dew_point_2m,et0_fao_evapotranspiration,is_storm
0,2022-01-01 00:00:00,-2.6,17.0,,,,216,,,23.8,...,100,100,0,0.0,0.0,0.04,-7.7,-3.7,0.00,1
1,2022-01-01 01:00:00,-2.1,16.5,,,,224,,,23.8,...,100,100,0,0.0,0.0,0.04,-7.1,-3.1,0.00,1
2,2022-01-01 02:00:00,-1.7,14.9,,,,233,,,22.7,...,100,100,0,0.0,0.0,0.03,-6.3,-2.6,0.00,1
3,2022-01-01 03:00:00,-2.1,14.7,,,,248,,,20.2,...,100,100,100,0.0,0.0,0.04,-6.8,-3.1,0.00,1
4,2022-01-01 04:00:00,-2.0,14.1,,,,251,,,19.8,...,100,100,100,0.0,0.0,0.03,-6.6,-2.8,0.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32563,2025-09-18 19:00:00,16.7,9.4,,,,183,,,24.8,...,0,93,82,0.0,0.0,0.84,14.8,7.9,0.06,1
32564,2025-09-18 20:00:00,15.9,8.1,,,,197,,,19.4,...,0,100,91,0.0,0.0,0.70,14.3,8.4,0.03,1
32565,2025-09-18 21:00:00,14.8,8.6,,,,255,,,18.0,...,1,100,91,0.0,0.0,0.52,13.3,9.2,0.02,1
32566,2025-09-18 22:00:00,13.7,7.6,,,,265,,,18.4,...,4,100,93,0.0,0.0,0.37,12.5,9.6,0.01,1


In [14]:
# Count missing values per column of the weather frame
data1.isna().sum()

Unnamed: 0,0
time,0
temperature_2m,0
wind_speed_10m,0
wind_speed_80m,32568
wind_speed_120m,32568
wind_speed_180m,32568
wind_direction_10m,0
wind_direction_80m,32568
wind_direction_180m,32568
wind_gusts_10m,0


In [12]:
# Display the Pokémon frame (the notebook renders a truncated preview)
data2

Unnamed: 0,id,name,base_experience,height,weight,order,type_1,type_2,hp,attack,...,habitat,shape,is_legendary,is_mythical,capture_rate,base_happiness,growth_rate,egg_groups,evolution_chain,encounter_locations
0,1,bulbasaur,64,7,69,1,grass,poison,45,49,...,grassland,quadruped,0,0,45,70,medium-slow,"monster, plant",bulbasaur -> ivysaur -> venusaur,"alola-route-2-main, cerulean-city-area, lumios..."
1,2,ivysaur,142,10,130,2,grass,poison,60,62,...,grassland,quadruped,0,0,45,70,medium-slow,"monster, plant",bulbasaur -> ivysaur -> venusaur,
2,3,venusaur,236,20,1000,3,grass,poison,80,82,...,grassland,quadruped,0,0,45,70,medium-slow,"monster, plant",bulbasaur -> ivysaur -> venusaur,
3,4,charmander,62,6,85,5,fire,,39,52,...,mountain,upright,0,0,45,70,medium-slow,"monster, dragon",charmander -> charmeleon -> charizard,"alola-route-3-main, kanto-route-24-area, lumio..."
4,5,charmeleon,142,11,190,6,fire,,58,64,...,mountain,upright,0,0,45,70,medium-slow,"monster, dragon",charmander -> charmeleon -> charizard,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,drowzee,66,10,324,154,psychic,,60,48,...,grassland,humanoid,0,0,190,70,medium,humanshape,drowzee -> hypno,"alola-route-2-north, alola-route-2-south, berr..."
96,97,hypno,169,16,756,155,psychic,,85,73,...,grassland,humanoid,0,0,75,70,medium,humanshape,drowzee -> hypno,"berry-forest-area, cerulean-cave-1f, hauoli-ci..."
97,98,krabby,65,4,65,156,water,,30,105,...,waters-edge,armor,0,0,225,70,medium,water3,krabby -> kingler,"bond-bridge-area, cerulean-city-area, cherrygr..."
98,99,kingler,166,13,600,157,water,,55,130,...,waters-edge,armor,0,0,60,70,medium,water3,krabby -> kingler,"bond-bridge-area, cerulean-cave-1f, cerulean-c..."


In [15]:
# Count missing values per column of the Pokémon frame
data2.isna().sum()

Unnamed: 0,0
id,0
name,0
base_experience,0
height,0
weight,0
order,0
type_1,0
type_2,52
hp,0
attack,0


In [13]:
# Display the news frame (the notebook renders a truncated preview)
data3

Unnamed: 0,source_api,category,source_id,source_name,author,title,description,url,image_url,published_at,content,collected_at
0,newsapi.org,crypto,,CoinDesk,Omkar Godbole,"Bitcoin Longs on Bitfinex Jump 20%, Prices Dro...",BTC/USD longs on Bitfinex frequently move inve...,https://www.coindesk.com/markets/2025/09/22/bi...,https://cdn.sanity.io/images/s3y3vcno/producti...,2025-09-22 06:35:56+00:00,"Bullish bitcoin BTC\r\n$112,786.27 bets on Bit...",2025-09-23T06:42:59.195529
1,newsapi.org,fashion,,WWD,Luisa Zargani,EXCLUSIVE: Demna Redefines ‘Gucciness’: Inside...,The designer opened up about his reinterpretat...,http://wwd.com/fashion-news/designer-luxury/in...,https://wwd.com/wp-content/uploads/2025/09/dem...,2025-09-22 06:00:00+00:00,MILAN — “I’m a fighter and I have to prove thi...,2025-09-23T06:43:04.351778
2,newsapi.org,crypto,,Cointelegraph,Cointelegraph by Stephen Katte,OKX built a perps DEX but held off due to regu...,OKX founder and CEO Star Xu cited the CFTC enf...,https://cointelegraph.com/news/okx-delays-onch...,https://images.cointelegraph.com/cdn-cgi/image...,2025-09-22 05:57:40+00:00,Crypto exchange OKX built a decentralized perp...,2025-09-23T06:42:59.195554
3,newsapi.org,entertainment,,Variety,John Hopewell,Film Factory Boards Director Daniel Monzón’s ‘...,Spain’s Film Factory Entertainment has picked ...,https://variety.com/2025/film/global/film-fact...,https://variety.com/wp-content/uploads/2025/09...,2025-09-22 05:53:20+00:00,Spain’s Film Factory Entertainment has picked ...,2025-09-23T06:42:54.277295
4,newsapi.org,crypto,,Cointelegraph,Cointelegraph by Martin Young,‘Uptober’ rally questioned as crypto markets t...,Bitcoin dropped to 12-day lows on Monday despi...,https://cointelegraph.com/news/crypto-analysts...,https://images.cointelegraph.com/cdn-cgi/image...,2025-09-22 05:39:54+00:00,Crypto pundits are debating whether there will...,2025-09-23T06:42:59.195557
...,...,...,...,...,...,...,...,...,...,...,...,...
392,newsapi.org,business,bloomberg,Bloomberg,Bloomberg,AI Disruption fear sparks investor scrutiny of...,While AI threatens to disrupt industries as di...,https://www.bloomberg.com/news/articles/2025-0...,https://bl-i.thgim.com/public/incoming/nq1kv3/...,2025-08-25 11:11:59+00:00,For years software companies were the toast of...,2025-09-23T06:42:46.002172
393,newsapi.org,environment,,Grist,Miacel Spotted Elk,"‘Alligator Alcatraz’ must close, but the fight...",A judge sided with the Miccosukee Tribe and sa...,https://grist.org/indigenous/alligator-alcatra...,https://grist.org/wp-content/uploads/2025/08/a...,2025-08-25 08:45:00+00:00,The Miccosukee Tribe makes its home in the Flo...,2025-09-23T06:42:52.548286
394,newsapi.org,environment,,Grist,Miacel Spotted Elk,"Alligator Alcatraz must close, but the fight i...",A judge sided with the Miccosukee Tribe and sa...,https://grist.org/article/alligator-alcatraz-m...,https://grist.org/wp-content/uploads/2025/08/a...,2025-08-25 08:45:00+00:00,The Miccosukee Tribe makes its home in the Eve...,2025-09-23T06:42:52.548289
395,newsapi.org,ai,,Arxiv.org,"Fan Nie, Ken Ziyu Liu, Zihao Wang, Rui Sun, We...",UQ: Assessing Language Models on Unsolved Ques...,Benchmarks shape progress in AI research. A us...,https://arxiv.org/abs/2508.17580,https://arxiv.org/static/browse/0.3.4/images/a...,2025-08-25 01:07:59+00:00,arXivLabs is a framework that allows collabora...,2025-09-23T06:42:55.889624


In [16]:
# Count missing values per column of the news frame
data3.isna().sum()

Unnamed: 0,0
source_api,0
category,0
source_id,291
source_name,0
author,11
title,0
description,2
url,0
image_url,0
published_at,0
