## Шаг 1. Развертывание MinIO

**Параметры подключения:**
- **Веб-интерфейс (Console):** http://localhost:9001
- **API Endpoint:** http://localhost:9000
- **Access Key:** minioadmin
- **Secret Key:** minioadmin123

## Шаг 2. Установка зависимостей и подключение к MinIO

In [1]:
!pip install minio pandas pyarrow --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import io
import os
from datetime import datetime

import pandas as pd
from minio import Minio
from minio.error import S3Error

In [3]:
# Connection settings for the MinIO server. Each value may be overridden with an
# environment variable of the same name, so real credentials never have to be
# written into the notebook. The fallbacks are the local development values
# this notebook was originally written against.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "localhost:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin123")
# Plain HTTP by default; set MINIO_SECURE to 1/true/yes to switch to HTTPS.
MINIO_SECURE = os.environ.get("MINIO_SECURE", "false").strip().lower() in {"1", "true", "yes"}

# Shared client object used by every cell below.
client = Minio(
    endpoint=MINIO_ENDPOINT,
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY,
    secure=MINIO_SECURE
)

In [4]:
# A successful list_buckets() call is used as the connectivity check;
# an S3Error is reported instead of raised.
try:
    buckets = client.list_buckets()
except S3Error as e:
    print(f"Error connecting to MinIO: {e}")
else:
    print("Connected")
    print(f"\nBuckets: ({len(buckets)})")
    if not buckets:
        print("No buckets found")
    # With an empty list the loop body simply never runs.
    for bucket in buckets:
        print(f"- {bucket.name} ({bucket.creation_date})")

Connected

Buckets: (1)
- hw-litvinov (2025-12-18 07:41:11.910000+00:00)


## Шаг 3. Создание бакета

In [5]:
# Name of the bucket every later cell reads from and writes to.
BUCKET_NAME = "hw-litvinov"

# Create the bucket only when it is missing, so the cell can be re-run safely.
try:
    if client.bucket_exists(BUCKET_NAME):
        print(f"Bucket '{BUCKET_NAME}' already exists")
    else:
        client.make_bucket(BUCKET_NAME)
        print(f"Bucket '{BUCKET_NAME}' created successfully")
except S3Error as e:
    print(f"Error creating bucket: {e}")

Bucket 'hw-litvinov' already exists


In [6]:
# Re-list the buckets and flag the one this notebook works with.
buckets = client.list_buckets()
print(f"Bucket list: ({len(buckets)})")
for bucket in buckets:
    # "-" marks the working bucket; every other bucket gets a blank marker.
    if bucket.name == BUCKET_NAME:
        marker = "-"
    else:
        marker = " "
    print(f"{marker} {bucket.name}")

Bucket list: (1)
- hw-litvinov


## Шаг 4. Создание набора данных

In [7]:
# Column-oriented sample data: each key is a column, each list holds five values.
data = {
    "id": [1, 2, 3, 4, 5],
    "name": [
        "Иванов Иван",
        "Петрова Мария",
        "Сидоров Алексей",
        "Козлова Анна",
        "Смирнов Дмитрий",
    ],
    "course": [
        "Data Engineering",
        "Machine Learning",
        "Data Engineering",
        "DevOps",
        "Machine Learning",
    ],
    "score": [85.5, 92.0, 78.3, 88.7, 95.2],
    "enrollment_date": [
        "2025-12-01",
        "2025-12-01",
        "2025-12-15",
        "2025-12-01",
        "2025-12-01",
    ],
    "is_active": [True, True, True, False, True],
}

df = pd.DataFrame(data)

# One print call with a newline separator emits the same text as
# printing each piece on its own.
print(
    "DataFrame:",
    f"\nShape: {df.shape[0]} rows x {df.shape[1]} columns",
    "\nStructure:",
    df.dtypes,
    "\nContents:",
    sep="\n",
)
# Bare last expression so the notebook renders the frame with its rich display.
df

DataFrame:

Shape: 5 rows x 6 columns

Structure:
id                   int64
name                object
course              object
score              float64
enrollment_date     object
is_active             bool
dtype: object

Contents:


Unnamed: 0,id,name,course,score,enrollment_date,is_active
0,1,Иванов Иван,Data Engineering,85.5,2025-12-01,True
1,2,Петрова Мария,Machine Learning,92.0,2025-12-01,True
2,3,Сидоров Алексей,Data Engineering,78.3,2025-12-15,True
3,4,Козлова Анна,DevOps,88.7,2025-12-01,False
4,5,Смирнов Дмитрий,Machine Learning,95.2,2025-12-01,True


## Шаг 5. Сохранение данных в MinIO

### 5.1 Сохранение в формате CSV

In [8]:
csv_filename = "students_data.csv"

# Serialize the frame into an in-memory buffer rather than a local file.
csv_buffer = io.BytesIO()
df.to_csv(csv_buffer, index=False, encoding='utf-8')

# put_object is given the payload length explicitly: measure the buffer,
# then rewind it so the upload reads from the first byte.
csv_size = csv_buffer.getbuffer().nbytes
csv_buffer.seek(0)

try:
    client.put_object(
        bucket_name=BUCKET_NAME,
        object_name=csv_filename,
        data=csv_buffer,
        length=csv_size,
        content_type='text/csv'
    )
except S3Error as e:
    print(f"Error loading file: {e}")
else:
    # Reached only when the upload raised no S3Error.
    print(f"File '{csv_filename}' loaded successfully and uploaded into '{BUCKET_NAME}'!")
    print(f"File size: {csv_size} byte")

File 'students_data.csv' loaded successfully and uploaded into 'hw-litvinov'!
File size: 370 byte


### 5.2 Сохранение в формате Parquet

In [9]:
parquet_filename = "students_data.parquet"

# Serialize the frame to Parquet in memory using the pyarrow engine.
parquet_buffer = io.BytesIO()
df.to_parquet(parquet_buffer, index=False, engine='pyarrow')

# put_object is given the payload length explicitly: measure the buffer,
# then rewind it so the upload reads from the first byte.
parquet_size = parquet_buffer.getbuffer().nbytes
parquet_buffer.seek(0)

try:
    client.put_object(
        bucket_name=BUCKET_NAME,
        object_name=parquet_filename,
        data=parquet_buffer,
        length=parquet_size,
        content_type='application/octet-stream'
    )
except S3Error as e:
    print(f"Error loading file: {e}")
else:
    # Reached only when the upload raised no S3Error.
    print(f"File '{parquet_filename}' loaded successfully into '{BUCKET_NAME}'!")
    print(f"File size: {parquet_size} byte")

File 'students_data.parquet' loaded successfully into 'hw-litvinov'!
File size: 4545 byte


In [10]:
# Header and a 60-character rule above the object listing.
print(f"Bucket contents '{BUCKET_NAME}':")
print(60 * "-")

objects = client.list_objects(BUCKET_NAME)
for obj in objects:
    # Three columns: padded object name, right-aligned size, modification time.
    name_column = f"{obj.object_name:30}"
    size_column = f"Size: {obj.size:>8} byte"
    modified_column = f"Modified: {obj.last_modified}"
    print(" | ".join((name_column, size_column, modified_column)))

Bucket contents 'hw-litvinov':
------------------------------------------------------------
students_data.csv              | Size:      370 byte | Modified: 2025-12-18 08:27:30.684000+00:00
students_data.parquet          | Size:     4545 byte | Modified: 2025-12-18 08:27:30.721000+00:00


## Шаг 6. Скачивание и чтение данных из MinIO

### 6.1 Чтение CSV файла

In [11]:
# Download the CSV object and parse it into a DataFrame.
try:
    response = client.get_object(BUCKET_NAME, csv_filename)
    try:
        # Read the whole body into memory, then parse it from an in-memory buffer.
        df_from_csv = pd.read_csv(io.BytesIO(response.read()))
    finally:
        # Always hand the HTTP connection back, even if reading or parsing fails.
        response.close()
        response.release_conn()

    print(f"File '{csv_filename}' downloaded and loaded successfully")
    print(f"\nCSV data: ({df_from_csv.shape[0]} rows x {df_from_csv.shape[1]} columns):")
    display(df_from_csv)

except S3Error as e:
    print(f"Error downloading file: {e}")

File 'students_data.csv' downloaded and loaded successfully

CSV data: (5 rows x 6 columns):


Unnamed: 0,id,name,course,score,enrollment_date,is_active
0,1,Иванов Иван,Data Engineering,85.5,2025-12-01,True
1,2,Петрова Мария,Machine Learning,92.0,2025-12-01,True
2,3,Сидоров Алексей,Data Engineering,78.3,2025-12-15,True
3,4,Козлова Анна,DevOps,88.7,2025-12-01,False
4,5,Смирнов Дмитрий,Machine Learning,95.2,2025-12-01,True


### 6.2 Чтение Parquet файла

In [12]:
# Download the Parquet object and parse it into a DataFrame.
try:
    response = client.get_object(BUCKET_NAME, parquet_filename)
    try:
        # Read the whole body into memory, then parse it from an in-memory buffer.
        df_from_parquet = pd.read_parquet(io.BytesIO(response.read()))
    finally:
        # Always hand the HTTP connection back, even if reading or parsing fails.
        response.close()
        response.release_conn()

    print(f"File '{parquet_filename}' downloaded and loaded successfully")
    print(f"\nParquet data ({df_from_parquet.shape[0]} rows x {df_from_parquet.shape[1]} columns):")
    display(df_from_parquet)

except S3Error as e:
    print(f"Error downloading file: {e}")

File 'students_data.parquet' downloaded and loaded successfully

Parquet data (5 rows x 6 columns):


Unnamed: 0,id,name,course,score,enrollment_date,is_active
0,1,Иванов Иван,Data Engineering,85.5,2025-12-01,True
1,2,Петрова Мария,Machine Learning,92.0,2025-12-01,True
2,3,Сидоров Алексей,Data Engineering,78.3,2025-12-15,True
3,4,Козлова Анна,DevOps,88.7,2025-12-01,False
4,5,Смирнов Дмитрий,Machine Learning,95.2,2025-12-01,True


### 6.3 Проверка целостности данных

In [13]:
print("Checking data integrity:")
print(50 * "-")

# Text shown for each comparison outcome.
verdict = {True: 'Data matches', False: 'Data does not match'}

# Each downloaded frame is compared with the original after both get a fresh
# 0..n-1 index; DataFrame.equals returns a single bool.
csv_match = df.reset_index(drop=True).equals(df_from_csv.reset_index(drop=True))
print(f"CSV: {verdict[csv_match]}")

parquet_match = df.reset_index(drop=True).equals(df_from_parquet.reset_index(drop=True))
print(f"Parquet: {verdict[parquet_match]}")

print(50 * "-")
if not (csv_match and parquet_match):
    print("\nError Data Integrity")
else:
    print("\nAll data saved and matched successfully")

Checking data integrity:
--------------------------------------------------
CSV: Data matches
Parquet: Data matches
--------------------------------------------------

All data saved and matched successfully
