## Install dependencies (yfinance, pymongo, pandas)

In [2]:
!pip install yfinance pymongo pandas

Collecting yfinance
  Downloading yfinance-1.0-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting pymongo
  Downloading pymongo-4.16.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (10.0 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.7-py3-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.19.0-py3-none-any.whl.metadata (7.0 kB)
Collecting curl_cffi<0.14,>=0.7 (from yfinance)
  Downloading curl_cffi-0.13.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting websockets>=13.0 (from yfinance)
  Downloading websockets-16.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl.metadata (6.8 kB)
Collecting dnspython<3.0.0,>=2.6.1 (from pymongo)
  Downloading dnspython-2.8.0-py3-no

## Gather data from yfinance


In [2]:
import yfinance as yf
from pymongo import MongoClient
import pandas as pd
import os

# 1. Connect to MongoDB (the host is the service name from docker-compose)
client = MongoClient("mongodb://mongodb:27017/")
db = client.stock_database
collection = db.prices

assets = ['AAPL', 'NVDA', 'TSLA', 'BTC-USD', 'ETH-USD', 'SOL-USD']

# Date window for the download. yfinance treats `end` as exclusive, so the
# window has to end on 2026-01-01 for 2025-12-31 to be included.
start_date = "2025-01-01"
end_date = "2026-01-01"

# Create the folder for the JSON exports; exist_ok makes re-runs safe
os.makedirs('data', exist_ok=True)

# Rows are gathered here first; the collection is only replaced once every
# download has finished, so a failed fetch does not leave the database empty.
all_records = []

for asset in assets:
    print(f"Pobieram: {asset}...")

    ticker = yf.Ticker(asset)
    df = ticker.history(start=start_date, end=end_date)

    # Skip assets with no data instead of failing on the date handling below
    if df.empty:
        print(f"Brak danych dla {asset}")
        continue

    # Turn the date index into a plain 'YYYY-MM-DD' string column and tag rows
    df = df.reset_index()
    df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')
    df['asset'] = asset

    records = df.to_dict('records')
    all_records.extend(records)

    # Per-asset JSON export into the data folder
    df.to_json(f"data/{asset}_2025.json", orient='records', indent=4)

# Replace the previous contents of the collection with the fresh download
if all_records:
    collection.delete_many({})
    collection.insert_many(all_records)

print("\n--- GOTOWE! Dane są w MongoDB i folderze data ---")

# Show a small sample from the database as a check; sorted by date so the
# displayed rows are deterministic ('YYYY-MM-DD' strings sort chronologically)
sample = pd.DataFrame(list(collection.find({'asset': 'AAPL'}).sort('Date', 1).limit(5)))
sample

Pobieram: AAPL...
Pobieram: NVDA...
Pobieram: TSLA...
Pobieram: BTC-USD...
Pobieram: ETH-USD...
Pobieram: SOL-USD...

--- GOTOWE! Dane są w MongoDB i folderze data ---


Unnamed: 0,_id,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,asset
0,6963dc8cd98665a62ee5a2ee,2025-01-02,247.80922,247.978468,240.731247,242.752106,55740700,0.0,0.0,AAPL
1,6963dc8cd98665a62ee5a2ef,2025-01-03,242.264297,243.080598,240.800915,242.264297,40244100,0.0,0.0,AAPL
2,6963dc8cd98665a62ee5a2f0,2025-01-06,243.210016,246.216423,242.105013,243.896912,45045600,0.0,0.0,AAPL
3,6963dc8cd98665a62ee5a2f1,2025-01-07,241.886014,244.44445,240.263363,241.119492,40856000,0.0,0.0,AAPL
4,6963dc8cd98665a62ee5a2f2,2025-01-08,240.830782,242.612732,238.969207,241.607269,37628900,0.0,0.0,AAPL


## Test Mongo data storage

In [4]:
from pymongo import MongoClient
import pandas as pd

# Open the connection and select the database / collection by key
client = MongoClient("mongodb://mongodb:27017/")
db = client["stock_database"]
collection = db["prices"]

# 1. Total number of documents currently stored
total = collection.count_documents({})
print(f"Łączna liczba wpisów w bazie: {total}")

# 2. Which instruments (assets) are present
assets_in_db = collection.distinct("asset")
print(f"Instrumenty w bazie: {assets_in_db}")

# 3. Fetch up to ten documents for one instrument (NVDA) and show them as a table
nvda_cursor = collection.find({"asset": "NVDA"}).limit(10)
df_nvda = pd.DataFrame([doc for doc in nvda_cursor])
df_nvda

Łączna liczba wpisów w bazie: 31283
Instrumenty w bazie: ['AAPL', 'BTC-USD', 'ETH-USD', 'NVDA', 'SOL-USD', 'TSLA']


Unnamed: 0,_id,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,asset
0,69650a8d500ae1482eba0cd8,1999-01-22,0.040112,0.044767,0.035575,0.037605,2714688000,0.0,0.0,NVDA
1,69650a8d500ae1482eba0cd9,1999-01-25,0.040589,0.042021,0.037605,0.041545,510480000,0.0,0.0,NVDA
2,69650a8d500ae1482eba0cda,1999-01-26,0.042021,0.042857,0.037724,0.038321,343200000,0.0,0.0,NVDA
3,69650a8d500ae1482eba0cdb,1999-01-27,0.03844,0.039395,0.036291,0.038202,244368000,0.0,0.0,NVDA
4,69650a8d500ae1482eba0cdc,1999-01-28,0.038202,0.03844,0.037843,0.038082,227520000,0.0,0.0,NVDA
5,69650a8d500ae1482eba0cdd,1999-01-29,0.038082,0.038202,0.036291,0.036291,244032000,0.0,0.0,NVDA
6,69650a8d500ae1482eba0cde,1999-02-01,0.036291,0.037246,0.036291,0.037008,154704000,0.0,0.0,NVDA
7,69650a8d500ae1482eba0cdf,1999-02-02,0.036291,0.037246,0.033068,0.034143,264096000,0.0,0.0,NVDA
8,69650a8d500ae1482eba0ce0,1999-02-03,0.033665,0.035337,0.033426,0.034859,75120000,0.0,0.0,NVDA
9,69650a8d500ae1482eba0ce1,1999-02-04,0.035337,0.037724,0.034859,0.036769,181920000,0.0,0.0,NVDA


## Ingest full price history for all assets

In [3]:
import yfinance as yf
from pymongo import MongoClient
import pandas as pd
import os

# 1. Connect to MongoDB
client = MongoClient("mongodb://mongodb:27017/")
db = client.stock_database
collection = db.prices

assets = ['AAPL', 'NVDA', 'TSLA', 'BTC-USD', 'ETH-USD', 'SOL-USD']

# Create the folder for the JSON exports; exist_ok makes re-runs safe
os.makedirs('data', exist_ok=True)

# Rows are gathered here first; the collection is only replaced once every
# download has finished, so a failed fetch does not leave the database empty.
all_records = []

for asset in assets:
    print(f"Pobieram pełną historię dla: {asset}...")

    ticker = yf.Ticker(asset)

    # period="max" requests the whole available history instead of fixed dates
    df = ticker.history(period="max")

    if df.empty:
        print(f"Brak danych dla {asset}")
        continue

    df = df.reset_index()

    # Report the covered date range for information
    first_date = df['Date'].min().strftime('%Y-%m-%d')
    last_date = df['Date'].max().strftime('%Y-%m-%d')
    print(f"Dostępne dane dla {asset}: od {first_date} do {last_date} (rekordów: {len(df)})")

    # Store dates as plain 'YYYY-MM-DD' strings and tag rows with the asset
    df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')
    df['asset'] = asset

    records = df.to_dict('records')
    all_records.extend(records)

    # Per-asset JSON export of the full range
    df.to_json(f"data/{asset}_full_history.json", orient='records', indent=4)

# Replace the previous contents of the collection with the fresh download
if all_records:
    collection.delete_many({})
    collection.insert_many(all_records)

print("\n--- GOTOWE! Pełna historia jest w MongoDB i folderze data ---")

# Report how many documents the collection holds in total
total_docs = collection.count_documents({})
print(f"Łączna liczba rekordów w bazie: {total_docs}")

Pobieram pełną historię dla: AAPL...
Dostępne dane dla AAPL: od 1980-12-12 do 2026-01-12 (rekordów: 11362)
Pobieram pełną historię dla: NVDA...
Dostępne dane dla NVDA: od 1999-01-22 do 2026-01-12 (rekordów: 6785)
Pobieram pełną historię dla: TSLA...
Dostępne dane dla TSLA: od 2010-06-29 do 2026-01-12 (rekordów: 3909)
Pobieram pełną historię dla: BTC-USD...
Dostępne dane dla BTC-USD: od 2014-09-17 do 2026-01-12 (rekordów: 4136)
Pobieram pełną historię dla: ETH-USD...
Dostępne dane dla ETH-USD: od 2017-11-09 do 2026-01-12 (rekordów: 2987)
Pobieram pełną historię dla: SOL-USD...
Dostępne dane dla SOL-USD: od 2020-04-10 do 2026-01-12 (rekordów: 2104)

--- GOTOWE! Pełna historia jest w MongoDB i folderze data ---
Łączna liczba rekordów w bazie: 31283
