In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pymysql

# Create (or reuse) the Spark session that the cells below rely on.
spark = (
    SparkSession.builder
    .appName("cf_etl")
    .getOrCreate()
)

### MySQL and Spark Connection

In [2]:
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the environment.
load_dotenv()

# MySQL connection settings; each one is None when its variable is unset.
DB_USERNAME, DB_PASSWORD, DB_HOST, DB_PORT = (
    os.environ.get(key)
    for key in ('DB_USERNAME', 'DB_PASSWORD', 'DB_HOST', 'DB_PORT')
)

def table_df(schema_name, table_name):
    """Read a MySQL table (or aliased subquery) into a Spark DataFrame via JDBC.

    Args:
        schema_name: Database name appended to the JDBC URL.
        table_name: Value passed as ``table`` to ``spark.read.jdbc``. Callers
            in this notebook pass either a plain table name or a
            parenthesised ``(SELECT ...) AS alias`` subquery.

    Returns:
        The DataFrame produced by ``spark.read.jdbc``.
    """
    connection_properties = dict(
        user=DB_USERNAME,
        password=DB_PASSWORD,
        driver="com.mysql.cj.jdbc.Driver",
    )
    jdbc_url = f"jdbc:mysql://{DB_HOST}:{DB_PORT}/{schema_name}"
    return spark.read.jdbc(
        url=jdbc_url,
        table=table_name,
        properties=connection_properties,
    )

In [3]:
# Load the ETL configuration table so it can be inspected in the notebook.
df = table_df(schema_name='config_db', table_name='cf_etl_table')

### Update Config Table

In [4]:
def update_date_on_config_table(schema_name, table_name, index):
    """Record a finished incremental load and advance that row's date window.

    For the config row whose ``id`` equals ``index + 1`` this sets
    ``execution_date`` to the current timestamp and moves both
    ``start_date_time`` and ``end_date_time`` forward by one day.

    Args:
        schema_name: Database that holds the config table.
        table_name: Name of the config table.
        index: Zero-based index of the config row; the row is matched on
            ``id = index + 1``.

    Raises:
        Whatever ``pymysql`` raises on connection or query failure. The
        connection is closed before the exception propagates and nothing is
        committed.
    """
    pymysql_connection = pymysql.connect(
        host=DB_HOST,
        # Use the same port as the JDBC reader. PyMySQL expects an int; fall
        # back to MySQL's default when DB_PORT is unset.
        port=int(DB_PORT) if DB_PORT else 3306,
        user=DB_USERNAME,
        password=DB_PASSWORD,
        database=schema_name,
    )

    # Identifiers cannot be bound as parameters, so only the id value is.
    # All three columns change in a single statement restricted to this row:
    # without the WHERE clause every row's window would shift each time any
    # one table finished loading.
    update_query = (
        f"update `{schema_name}`.{table_name} "
        "set execution_date = (current_timestamp), "
        "start_date_time = date_add(start_date_time, interval 1 day), "
        "end_date_time = date_add(end_date_time, interval 1 day) "
        "where id = %s"
    )

    try:
        with pymysql_connection.cursor() as cursor:
            cursor.execute(update_query, (index + 1,))
        pymysql_connection.commit()
    finally:
        # Always release the connection, even if the update fails.
        pymysql_connection.close()

### Uploading File to HDFS

In [5]:
def upload():
    """Export every table listed in ``config_db.cf_etl_table`` to Parquet.

    For each config row:

    * If ``is_incremental`` is truthy, rows of the source table whose
      ``inc_field`` column lies between ``start_date_time`` and
      ``end_date_time`` are appended to the target path, and the config row
      is then updated via ``update_date_on_config_table``.
    * Otherwise the whole source table is read and the target path is
      overwritten.

    The target path is ``hdfs_upload_location`` concatenated with
    ``hdfs_file_name``.
    """
    # Collect once: calling count() and collect() separately runs two JDBC
    # reads that are not guaranteed to see the same rows.
    config_rows = table_df('config_db', 'cf_etl_table').collect()

    for row in config_rows:
        schema, table = row['schema_name'], row['table_name']
        hdfs_path = f"{row['hdfs_upload_location']}{row['hdfs_file_name']}"

        if row['is_incremental']:
            start_date, end_date, date_col = (
                row['start_date_time'],
                row['end_date_time'],
                row['inc_field'],
            )

            # NOTE(review): BETWEEN is inclusive on both ends. If consecutive
            # windows share a boundary timestamp, rows on that boundary are
            # appended twice -- confirm how the windows are defined.
            query = f"(SELECT * FROM {schema}.{table} WHERE {date_col} BETWEEN '{start_date}' AND '{end_date}') AS sql_query"
            result = table_df(schema, query)
            result.write.mode('append').parquet(hdfs_path)

            # The updater matches on ``id = index + 1``. Derive the index from
            # the row's own id rather than its position in an unordered read,
            # so the row that was just loaded is the one that gets updated.
            update_date_on_config_table('config_db', 'cf_etl_table', row['id'] - 1)

        else:
            result = table_df(schema, table)
            result.write.mode("overwrite").parquet(hdfs_path)

In [5]:
# def get_column_value(df, index, col):
#     """
#     This function retrieves a value from a specified column in a DataFrame at a given index.
    
#     """
#     col_values = df.select(col).collect()
#     value = col_values[index][col]
#     return value

# def upload():

#     df = table_df('config_db','cf_etl_table')

#     for i in range(df.count()):
        
#         is_incremental = get_column_value(df, i, 'is_incremental')
#         schema = get_column_value(df, i, 'schema_name')
#         table = get_column_value(df, i, 'table_name')
#         location = get_column_value(df, i, 'hdfs_upload_location')
#         hdfs_file = get_column_value(df, i, 'hdfs_file_name')
#         hdfs_path = f"{location}{hdfs_file}"

#         if is_incremental:
#             start_date = get_column_value(df, i, 'start_date_time')
#             end_date = get_column_value(df, i, 'end_date_time')
#             date_col = get_column_value(df, i, 'inc_field')      
 
#             query = f"(SELECT * FROM {schema}.{table} WHERE {date_col} BETWEEN '{start_date}' AND '{end_date}') AS sql_query"
#             result = table_df(schema, query)
#             result.write.mode('append').parquet(hdfs_path)
            
#             update_date_on_config_table('config_db', 'cf_etl_table', i)

#         elif not is_incremental:
#             result = table_df(schema, table)
#             result.write.mode("overwrite").parquet(hdfs_path)            


In [18]:
# Run the export for every row of the config table.
upload()

### Reading Parquet File

In [19]:
# Read the Parquet dataset back from HDFS to inspect what was written.
new_df = spark.read.parquet('hdfs://localhost:19000//mydir/transaction')

In [20]:
# Print the leading rows of the dataset as a text table.
new_df.show()

+------+-------------------+------+--------+------+
|tnx_id|           tnx_date|acc_id| product|status|
+------+-------------------+------+--------+------+
|     5|2023-06-27 14:30:00|ACC005|ProductE| false|
|     2|2023-06-24 11:00:00|ACC002|ProductB| false|
|     1|2023-06-23 10:30:00|ACC001|ProductA| false|
|     4|2023-06-26 12:15:00|ACC004|ProductD| false|
|     1|2023-06-23 10:30:00|ACC001|ProductA| false|
|     1|2023-06-23 10:30:00|ACC001|ProductA| false|
|     1|2023-06-23 10:30:00|ACC001|ProductA| false|
|     3|2023-06-25 09:45:00|ACC003|ProductC| false|
+------+-------------------+------+--------+------+



### Check Whether the HDFS Path Exists

In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) a session whose default filesystem is the HDFS namenode.
spark = (
    SparkSession.builder
    .appName('Incremental Load')
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:19000")
    .getOrCreate()
)

# Reach the Hadoop FileSystem API through the session's JVM gateway.
hadoop_conf = spark._jsc.hadoopConfiguration()
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)

hdfs_path = "hdfs://localhost:19000//mydir/transaction"
path = spark._jvm.org.apache.hadoop.fs.Path(hdfs_path)

# Report whether the target path is present.
print('file exists' if fs.exists(path) else 'file doesn\'t exist')

file exists
