In [0]:
%spark.pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, trim, udf, to_date, from_unixtime
from pyspark.sql.types import StringType, LongType

# Create (or reuse) the Hive-enabled Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("BankLoans")
    .enableHiveSupport()
    .getOrCreate()
)

# 1. Load raw parquet from staging-zone (Sqoop import output)
financial_df = spark.read.parquet("/staging_zone_6/financial_loan")

# 2. Normalise the date columns to DateType.
#    Integer-typed columns are treated as UNIX epoch milliseconds; everything
#    else goes through to_date() with the dd-MM-yyyy pattern.
date_cols = ["issue_date", "last_credit_pull_date", "last_payment_date", "next_payment_date"]

# Column types are read once up front: rewriting one date column does not
# change the source type of the others, so there is no need to rebuild the
# dtype mapping on every iteration.
source_dtypes = dict(financial_df.dtypes)

for c in date_cols:
    dtype = source_dtypes[c]
    if dtype in ["bigint", "long", "int"]:
        # UNIX milliseconds → whole seconds → date.
        # from_unixtime() takes a BIGINT number of seconds, so cast explicitly
        # instead of relying on an implicit DOUBLE → BIGINT coercion (which the
        # stricter ANSI type-coercion rules do not permit). The truncation is
        # the same one the implicit cast performed.
        epoch_seconds = (col(c).cast("double") / 1000).cast("long")
        financial_df = financial_df.withColumn(c, to_date(from_unixtime(epoch_seconds)))
    else:
        # Assume string in dd-MM-yyyy
        financial_df = financial_df.withColumn(c, to_date(col(c), "dd-MM-yyyy"))

# 3. Replace two-letter state codes in address_state with full state names.
state_dict = {
    "AL": "Alabama",
    "AK": "Alaska",
    "AZ": "Arizona",
    "AR": "Arkansas",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland",
    "MA": "Massachusetts",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MS": "Mississippi",
    "MO": "Missouri",
    "MT": "Montana",
    "NE": "Nebraska",
    "NV": "Nevada",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NY": "New York",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VT": "Vermont",
    "VA": "Virginia",
    "WA": "Washington",
    "WV": "West Virginia",
    "WI": "Wisconsin",
    "WY": "Wyoming",
    "DC": "District of Columbia",
}


def _state_name(code):
    """Return the full name for a state code, or "Unknown" when it is not in state_dict."""
    return state_dict.get(code, "Unknown")


code_to_name_udf = udf(_state_name, StringType())

# After this step address_state holds full names (or "Unknown"), not codes.
state_name_col = code_to_name_udf(col("address_state"))
financial_df = financial_df.withColumn("address_state", state_name_col)

# 4./5. Strip the textual decoration from emp_length ("<", ">", "+", "year(s)")
#       and from term ("month(s)"), leaving the trimmed remainder.
emp_length_clean = trim(regexp_replace(col("emp_length"), "[<>\\+]| years?|year", ""))
term_clean = trim(regexp_replace(col("term"), " months?", ""))

# 6. Nulls in these categorical fields are replaced with "Unknown".
categorical_defaults = {
    "emp_title": "Unknown",
    "application_type": "Unknown",
    "loan_status": "Unknown"
}

financial_df = (
    financial_df
    .withColumn("emp_length", emp_length_clean)
    .withColumn("term", term_clean)
    .fillna(categorical_defaults)
)

# Preview the first 10 cleaned rows without truncating cell values.
financial_df.show(n=10, truncate=False)

# 7. Persist the cleaned dataset to the clean zone, replacing any previous output.
clean_zone_writer = financial_df.write.mode("overwrite")
clean_zone_writer.parquet("/clean-zone/financial_loan_cleaned")

print("✅ Step complete: Cleaned data written to /clean-zone/financial_loan_cleaned")


In [1]:
%spark.pyspark
# Obtain the Hive-enabled session again; getOrCreate() hands back the session
# that is already active if there is one.
spark = (
    SparkSession.builder
    .appName("BankLoans")
    .enableHiveSupport()
    .getOrCreate()
)


In [2]:
%spark.pyspark
# List the databases visible to this Spark session.
databases_df = spark.sql("SHOW DATABASES")
databases_df.show()


In [3]:
%spark.pyspark
# Publish the cleaned loans as a Hive-format table, replacing any existing one.
(
    financial_df.write
    .mode("overwrite")
    .format("hive")
    .saveAsTable("default.financial_loan_cleaned")
)


In [4]:
%spark.pyspark
from pyspark.sql.functions import col, monotonically_increasing_id

# Borrower attributes, renamed to the dimension's column names.
# NOTE: address_state was mapped to full state names earlier in this notebook,
# so the "state_code" column holds names (or "Unknown"), not two-letter codes.
borrower_columns = [
    col("member_id").alias("borrowers_id_bk"),  # business key
    col("emp_title").alias("employment_title"),
    col("emp_length").alias("employment_length"),
    col("annual_income"),
    col("home_ownership"),
    col("address_state").alias("state_code"),
    col("total_acc").alias("total_account"),
    col("verification_status"),
    col("application_type"),
]

# One row per business key; which duplicate survives is up to Spark.
unique_borrowers = financial_df.select(*borrower_columns).dropDuplicates(["borrowers_id_bk"])

# Put the surrogate key first, followed by the attributes in their existing order.
dim_borrowers = unique_borrowers.select(
    monotonically_increasing_id().alias("borrowers_id_sk"),
    *unique_borrowers.columns
)

dim_borrowers.show()

In [5]:
%spark.pyspark
# Persist the borrower dimension as a parquet-backed table, replacing any existing one.
(
    dim_borrowers.write
    .mode("overwrite")
    .format("parquet")
    .saveAsTable("dim_borrowers")
)

In [6]:
%spark.pyspark
from pyspark.sql.functions import col, monotonically_increasing_id, when

# One row per distinct loan_status value.
distinct_statuses = financial_df.select(
    col("loan_status").alias("loan_status")
).dropDuplicates(["loan_status"])

# Statuses counted as "Good"; every other status is categorised as "Bad".
good_statuses = ["Fully Paid", "Current"]
status_category = when(col("loan_status").isin(*good_statuses), "Good").otherwise("Bad")

# Surrogate key, the status itself (exposed as status_id), and its category.
dim_status = distinct_statuses.select(
    monotonically_increasing_id().alias("status_id_sk"),
    col("loan_status").alias("status_id"),
    status_category.alias("loan_status_category")
)

dim_status.show()

In [7]:
%spark.pyspark
# Persist the status dimension as a parquet-backed table, replacing any existing one.
(
    dim_status.write
    .mode("overwrite")
    .format("parquet")
    .saveAsTable("dim_status")
)

In [8]:
%spark.pyspark
# One row per distinct sub_grade, carrying its parent grade.
unique_sub_grades = financial_df.select("grade", "sub_grade").dropDuplicates(["sub_grade"])

# Surrogate key first, then grade and sub_grade.
dim_credit_grade = unique_sub_grades.select(
    monotonically_increasing_id().alias("credit_grade_sk"),
    col("grade"),
    col("sub_grade")
)

dim_credit_grade.show()

In [9]:
%spark.pyspark
# Persist the credit-grade dimension as a parquet-backed table, replacing any existing one.
(
    dim_credit_grade.write
    .mode("overwrite")
    .format("parquet")
    .saveAsTable("dim_credit_grade")
)

In [10]:
%spark.pyspark
from pyspark.sql.functions import col, monotonically_increasing_id, concat_ws, lit

# Step 1: take the distinct values of term, exposed as "period".
distinct_periods = financial_df.select(
    col("term").alias("period")
).dropDuplicates(["period"])

# Step 2: add the surrogate key and a human-readable term_description
# ("<period> months").
dim_loan_term = distinct_periods.select(
    monotonically_increasing_id().alias("loan_term_sk"),
    col("period"),
    concat_ws(" ", col("period"), lit("months")).alias("term_description")
)

dim_loan_term.show()

In [11]:
%spark.pyspark
# Persist the loan-term dimension as a parquet-backed table, replacing any existing one.
(
    dim_loan_term.write
    .mode("overwrite")
    .format("parquet")
    .saveAsTable("dim_loan_term")
)

In [12]:
%spark.pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth, date_format, lit
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F
from datetime import datetime, timedelta

# Obtain a Hive-enabled session for the star-schema steps; getOrCreate() hands
# back the session that is already active if there is one.
spark = (
    SparkSession.builder
    .appName("StarSchema")
    .enableHiveSupport()
    .getOrCreate()
)

# Build the date dimension: one row per calendar day of 2021.
# NOTE(review): fact_loan is built with LEFT joins against this frame, so any
# loan date outside this range ends up with a NULL date key — confirm that a
# single year is wide enough for all four date columns.
start_date = datetime(2021, 1, 1)
end_date = datetime(2021, 12, 31)
num_days = (end_date - start_date).days + 1

# Use datetime.date values (not datetime) so Spark infers a DateType column.
# Naive datetime objects are inferred as timestamps, whose calendar day then
# depends on the driver/session time zone.
date_list = [((start_date + timedelta(days=x)).date(),) for x in range(num_days)]

df_dates = spark.createDataFrame(date_list, ["Date"])

# Derive the key and calendar attributes in a single projection:
#   Date_key   - integer in yyyyMMdd form
#   Year/Month - numeric parts of the date
#   Month_name - full month name ("MMMM" pattern)
#   Quarter    - quarter of the year
df_dates = df_dates.select(
    F.date_format(col("Date"), "yyyyMMdd").cast(IntegerType()).alias("Date_key"),
    col("Date"),
    year(col("Date")).alias("Year"),
    month(col("Date")).alias("Month"),
    date_format(col("Date"), "MMMM").alias("Month_name"),
    F.quarter(col("Date")).alias("Quarter")
)

df_dates.show(5, truncate=False)


In [13]:
%spark.pyspark
# Persist the date dimension as the parquet-backed table dim_date,
# replacing any existing one.
(
    df_dates.write
    .mode("overwrite")
    .format("parquet")
    .saveAsTable("dim_date")
)


In [14]:
%spark.pyspark
from pyspark.sql.functions import col, to_date, monotonically_increasing_id

# Build the loan fact table: look up a dim_date key for each of the four loan
# dates, then project the measures and dimension references.

# (source date column, name of its dim_date key in the fact table)
date_key_specs = [
    ("issue_date", "date_key_issue"),
    ("last_payment_date", "date_key_last_payment"),
    ("next_payment_date", "date_key_next_payment"),
    ("last_credit_pull_date", "date_key_last_credit_pull"),
]

# Add a <column>_dt date column for every source date column.
for source_col, _ in date_key_specs:
    financial_df = financial_df.withColumn(
        source_col + "_dt",
        to_date(col(source_col), "yyyy-MM-dd")
    )

# Left-join dim_date once per date column so loans are kept even when a date
# has no match in dim_date (the key is NULL in that case). Each join uses its
# own aliases so the four lookups do not clash.
fact_loan = financial_df
for source_col, key_alias in date_key_specs:
    dim_alias = source_col + "_dim"
    date_lookup = df_dates.select(
        col("Date_key").alias(key_alias),
        col("Date").alias(dim_alias)
    )
    fact_loan = fact_loan.join(
        date_lookup,
        fact_loan[source_col + "_dt"] == col(dim_alias),
        "left"
    )

# Select final columns.
# NOTE(review): the dimension references below carry business values
# (member_id, grade, term, loan_status), not the *_sk surrogate keys generated
# for the dimension tables. credit_grade_k uses grade, while dim_credit_grade
# has one row per sub_grade — confirm this is the intended grain.
fact_loan = fact_loan.select(
    col("id").alias("loan_id_bk"),
    col("member_id").alias("borrowers_id_k"),
    col("date_key_issue"),
    col("date_key_last_payment"),
    col("date_key_next_payment"),
    col("date_key_last_credit_pull"),
    col("grade").alias("credit_grade_k"),
    col("term").alias("loan_term_lk"),
    col("loan_amount"),
    col("dti").alias("DTI"),
    col("installment"),
    col("int_rate").alias("interest_rate"),
    col("total_payment"),
    col("purpose").alias("loan_purpose"),
    col("loan_status").alias("status_id_k")
).withColumn("loan_id_pk_sk", monotonically_increasing_id())

fact_loan.show()


In [15]:
%spark.pyspark
# Persist the fact table as the parquet-backed table fact_loan,
# replacing any existing one.
(
    fact_loan.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("fact_loan")
)
