# Notebook cell In [2]:
import requests
import pandas as pd
from datetime import timedelta
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

# Remote CSV that get_data() reads the domain ranking from.
TOP_1M_DOMAINS = 'https://storage.yandexcloud.net/kc-startda/top-1m.csv'
# Local file get_data() writes the ranking to; the analysis tasks read it back.
TOP_1M_DOMAINS_FILE = 'top-1m.csv'


def get_data():
    """Download the domain ranking and cache it in a local CSV file.

    Reads the CSV located at ``TOP_1M_DOMAINS`` and writes it, without an
    index or header row, to ``TOP_1M_DOMAINS_FILE`` so the downstream tasks
    can load it from disk.
    """
    # The data is still handed to the next tasks through a file; passing
    # values between tasks directly is covered in lesson three.
    #
    # The readers in this file supply their own column names (``names=...``),
    # i.e. they treat the file as header-less. Read and write it the same way
    # so the first data row is never interpreted as column labels.
    top_doms = pd.read_csv(TOP_1M_DOMAINS, header=None)
    top_data = top_doms.to_csv(index=False, header=False)

    with open(TOP_1M_DOMAINS_FILE, 'w') as f:
        f.write(top_data)
        
def get_top_10_zone():
    """Write the ten most common domain zones to ``top_10_doms.csv``.

    Loads ``TOP_1M_DOMAINS_FILE`` as header-less ``rank,domain`` rows, takes
    the text after the last dot of every domain as its zone and counts the
    domains per zone. The ten largest zones are written as header-less
    ``zone,count`` rows, most frequent first; zones with equal counts are
    ordered alphabetically so the output is deterministic.
    """
    top_data_df = pd.read_csv(TOP_1M_DOMAINS_FILE, names=['rank', 'domain'])
    top_data_df['domain_zone'] = top_data_df['domain'].str.split('.').str[-1]

    top_10_doms = (
        top_data_df
        # as_index=False keeps the zone as a regular column so it is written
        # to the file together with its count.
        .groupby('domain_zone', as_index=False)
        .agg({'rank': 'count'})
        .rename(columns={'rank': 'count'})
        .sort_values(['count', 'domain_zone'], ascending=[False, True])
        # head() returns a new frame, so it has to be part of the chain.
        .head(10)
    )

    with open('top_10_doms.csv', 'w') as f:
        f.write(top_10_doms.to_csv(index=False, header=False))

def get_longest_domain():
    """Write the row of the longest domain name to ``longest_domain.csv``.

    Loads ``TOP_1M_DOMAINS_FILE`` as header-less ``rank,domain`` rows,
    measures the length of every domain and stores the single longest one as
    a header-less ``rank,domain,domain_length`` row. When several domains
    share the maximum length, the alphabetically first one is chosen.
    """
    longest_domain = pd.read_csv(TOP_1M_DOMAINS_FILE, names=['rank', 'domain'])

    # astype() returns a new Series; assign it back so that missing values
    # become strings and can be measured like any other domain.
    longest_domain['domain'] = longest_domain['domain'].astype('str')
    longest_domain['domain_length'] = longest_domain['domain'].str.len()

    # sort_values()/head() are not in-place either: keep the result so only
    # the winning row is written out.
    longest_domain = longest_domain.sort_values(
        ['domain_length', 'domain'], ascending=[False, True]
    ).head(1)

    with open('longest_domain.csv', 'w') as f:
        f.write(longest_domain.to_csv(index=False, header=False))
        
def airflow_place():
    """Store the ranking row(s) of ``airflow.com`` in ``airflow.csv``.

    Loads ``TOP_1M_DOMAINS_FILE`` as header-less ``rank,domain`` rows and
    writes every row whose domain equals ``airflow.com`` as header-less
    ``rank,domain`` text. The file is left empty when there is no match.
    """
    matches = (
        pd.read_csv(TOP_1M_DOMAINS_FILE, names=['rank', 'domain'])
        .query("domain == 'airflow.com'")
    )
    with open('airflow.csv', 'w') as out:
        serialized = matches.to_csv(index=False, header=False)
        out.write(serialized)
    
def print_data(ds):
    """Print the three report files, each under a heading that shows ``ds``.

    Reads ``top_10_doms.csv``, ``longest_domain.csv`` and ``airflow.csv``
    from the current working directory. All three files are read before
    anything is printed.
    """
    def _slurp(filename):
        # Return the whole text content of one report file.
        with open(filename, 'r') as handle:
            return handle.read()

    zones_report = _slurp('top_10_doms.csv')
    longest_report = _slurp('longest_domain.csv')
    airflow_report = _slurp('airflow.csv')

    date = ds
    sections = (
        (f'Top 10 domains for date {date}', zones_report),
        (f'The longest name of domain for date {date}', longest_report),
        (f'Airflow domain place for date {date}', airflow_report),
    )
    for heading, body in sections:
        print(heading)
        print(body)
       

# Arguments handed to the DAG as ``default_args``.
default_args = dict(
    owner='v-tsuzoj',
    depends_on_past=False,
    retries=2,
    retry_delay=timedelta(minutes=5),
    start_date=datetime(2024, 3, 5),
)
# Cron expression: minute 0 of hour 4, every day.
schedule_interval = '0 4 * * *'


# Operators created inside the ``with`` block are attached to this DAG, so
# none of them needs an explicit ``dag=`` argument.
with DAG(
    'v_tsuzoj_lesson_2',
    default_args=default_args,
    schedule_interval=schedule_interval,
) as dag:
    t1 = PythonOperator(task_id='get_data', python_callable=get_data)
    t2 = PythonOperator(task_id='get_top_10_zone',
                        python_callable=get_top_10_zone)
    t3 = PythonOperator(task_id='get_longest_domain',
                        python_callable=get_longest_domain)
    t4 = PythonOperator(task_id='airflow_place',
                        python_callable=airflow_place)
    t5 = PythonOperator(task_id='print_data', python_callable=print_data)

# get_data is upstream of the three analysis tasks; print_data is downstream
# of all three.
t1 >> [t2, t3, t4] >> t5

# Notebook cell output: <Task(PythonOperator): print_data>