# Zadanie 1

In [None]:
import tarfile
import os

tar_gz_file = '/content/Zookeeper.tar.gz'
output_dir = '/content/'

os.makedirs(output_dir, exist_ok=True)

with tarfile.open(tar_gz_file, 'r:gz') as tar:
    tar.extractall(path=output_dir)

In [None]:
import dask
from dask import delayed
import dask.dataframe as dd
import pandas as pd
from datetime import datetime

log_file = '/content/Zookeeper.log'

def parse_line(line):
    record = {}

    parts = line.split(' - ')
    if len(parts) < 3:
        return None

    datetime_part = parts[0]
    level_part = parts[1].strip()
    context_part = parts[2].strip()
    message_part = ' - '.join(parts[3:]).strip()

    record["date"] = datetime_part
    record["level"] = level_part
    record["context"] = context_part
    record["message"] = message_part

    return record

def convert_date(record):
    try:
        record["date"] = datetime.strptime(record["date"], "%Y-%m-%d %H:%M:%S,%f")
    except ValueError as e:
        print(f"Error parsing date: {record['date']}. Error: {e}")
        record["date"] = None
    return record

@delayed
def process_log_file(log_file):
    output = []
    with open(log_file, 'r') as file:
        for line in file:
            record = parse_line(line)
            if record:
                record = convert_date(record)
                output.append(record)
    return output

log_records = process_log_file(log_file)
log_records_computed = log_records.compute()

df = pd.DataFrame(log_records_computed, columns=["date", "level", "context", "message"])
print(df.head(20))

df_dask = dd.from_pandas(df, npartitions=10)
df_dask.to_parquet('logs.parquet')

                      date                                              level  \
0  2015-07-29 17:41:41.536                  INFO  [main:QuorumPeerConfig@101]   
1  2015-07-29 17:41:41.544                  INFO  [main:QuorumPeerConfig@334]   
2  2015-07-29 17:41:41.555              INFO  [main:DatadirCleanupManager@78]   
3  2015-07-29 17:41:41.555              INFO  [main:DatadirCleanupManager@79]   
4  2015-07-29 17:41:41.557             INFO  [main:DatadirCleanupManager@101]   
5  2015-07-29 17:41:41.579                    INFO  [main:QuorumPeerMain@127]   
6  2015-07-29 17:41:41.609               INFO  [main:NIOServerCnxnFactory@94]   
7  2015-07-29 17:41:41.648                        INFO  [main:QuorumPeer@913]   
8  2015-07-29 17:41:41.649                        INFO  [main:QuorumPeer@933]   
9  2015-07-29 17:41:41.649                        INFO  [main:QuorumPeer@944]   
10 2015-07-29 17:41:41.649                        INFO  [main:QuorumPeer@959]   
11 2015-07-29 17:41:41.680  

# Zadanie 2

In [None]:
!pip install mimesis

Collecting mimesis
  Downloading mimesis-18.0.0-py3-none-any.whl.metadata (5.7 kB)
Downloading mimesis-18.0.0-py3-none-any.whl (4.7 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/4.7 MB[0m [31m4.0 MB/s[0m eta [36m0:00:02[0m[2K   [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/4.7 MB[0m [31m15.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/4.7 MB[0m [31m22.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.7/4.7 MB[0m [31m34.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mimesis
Successfully installed mimesis-18.0.0


In [56]:
import dask.bag as db
import json
import os
from dask.datasets import make_people

people_bag = make_people(npartitions=10, records_per_partition=1000, seed=None, locale="en")

def is_credit_card_expired(record):
    expiration_date = record.get("credit-card", {}).get("expiration-date", "")
    if expiration_date:
        month, year = map(int, expiration_date.split("/"))
        return (year < 2024) or (year == 2024 and month < 10)
    return False

expired_records = people_bag.filter(is_credit_card_expired)

output_path = '/content/expired_*.json'
expired_records.map(json.dumps).to_textfiles(output_path)

for i in range(10):
    file_path = f'/content/expired_{i}.json'
    if os.path.exists(file_path):
        print(f"Contents of {file_path}:")
        with open(file_path, 'r') as f:
            for _ in range(5):
                print(json.loads(f.readline().strip()))
    else:
        print(f'File {file_path} does not exist.')

Contents of /content/expired_0.json:
{'age': 45, 'name': ['Yasuko', 'Riggs'], 'occupation': 'Illustrator', 'telephone': '+14587921982', 'address': {'address': '721 Warner Garden', 'city': 'Evergreen Park'}, 'credit-card': {'number': '2561 3664 2094 4771', 'expiration-date': '04/18'}}
{'age': 85, 'name': ['Hee', 'Chambers'], 'occupation': 'Television Producer', 'telephone': '+14790937260', 'address': {'address': '867 Perry Junction', 'city': 'Weirton'}, 'credit-card': {'number': '5565 6050 2320 5430', 'expiration-date': '11/20'}}
{'age': 91, 'name': ['Kittie', 'Cote'], 'occupation': 'Paint Consultant', 'telephone': '+1-831-057-9514', 'address': {'address': '304 Acorn Lake', 'city': 'Fountain Valley'}, 'credit-card': {'number': '3741 581923 69761', 'expiration-date': '08/22'}}
{'age': 115, 'name': ['Rosana', 'Juarez'], 'occupation': 'Sheriff', 'telephone': '+1-903-161-7259', 'address': {'address': '913 Rosewood Garden', 'city': 'Conroe'}, 'credit-card': {'number': '5477 1136 4190 3475', 

# Zadanie 3

In [60]:
import dask.bag as db
import dask.dataframe as dd
from dask.datasets import make_people

people_bag = make_people(npartitions=1, records_per_partition=1000, seed=None, locale="en")

adult_records = people_bag.filter(lambda record: record['age'] >= 18)

adult_df = adult_records.to_dataframe()

output_path = '/content/adult_people.parquet'
adult_df.to_parquet(output_path)

loaded_df = dd.read_parquet(output_path)
print("Loaded DataFrame:")
print(loaded_df.head())

Loaded DataFrame:
   age                    name         occupation        telephone  \
0   71  ('Mitchell', 'Barber')     Telex Operator  +1-831-448-3636   
1   52      ('Jordan', 'Rose')    Marine Surveyor  +1-573-094-7354   
2   64   ('Hilma', 'Williams')              Vicar  +1-802-254-3801   
3   22      ('Minna', 'Berry')         Magistrate  +1-305-890-0447   
4   32   ('Marilou', 'Garner')  Aircraft Designer  +1-510-333-3512   

                                             address  \
0  {'address': '1140 Alpine Ranch', 'city': 'Asht...   
1  {'address': '619 Pioche Ferry', 'city': 'West ...   
2  {'address': '1391 Elizabeth Hill', 'city': 'Sh...   
3  {'address': '1007 Wagner Center', 'city': 'Myr...   
4  {'address': '784 Hillside Cove', 'city': 'Palm...   

                                         credit-card  
0  {'number': '3460 824746 25978', 'expiration-da...  
1  {'number': '4228 6154 7103 5744', 'expiration-...  
2  {'number': '4644 0253 8504 8575', 'expiration-...  
3  {