In [0]:
# Load the bronze customer aggregates (no target column yet) and preview ten rows.
bronze_path = "/Volumes/dai/phase2/bronze/customer_df_without_target"
df = spark.read.load(bronze_path)
display(df.head(10))

CustomerID,total_spent,total_transactions,total_quantity,last_purchase_date
17420,598.8300000000002,30,265,2011-10-20
16552,379.73,17,219,2011-04-11
17572,226.75,12,95,2011-09-29
15350,115.65,5,51,2010-12-01
12921,16389.740000000016,741,9454,2011-12-06
13090,8689.390000000003,161,2194,2011-12-01
14135,4690.31,134,3850,2011-12-08
12915,363.65,22,93,2011-07-14
17685,3191.5299999999997,130,1956,2011-11-23
17581,10736.109999999991,452,5861,2011-12-09


In [0]:
# Larger preview: request the first 35 rows of the bronze table.
first_rows = df.head(35)
display(first_rows)

CustomerID,total_spent,total_transactions,total_quantity,last_purchase_date
17420,598.8300000000002,30,265,2011-10-20
16552,379.73,17,219,2011-04-11
17572,226.75,12,95,2011-09-29
15350,115.65,5,51,2010-12-01
12921,16389.740000000016,741,9454,2011-12-06
13090,8689.390000000003,161,2194,2011-12-01
14135,4690.31,134,3850,2011-12-08
12915,363.65,22,93,2011-07-14
17685,3191.5299999999997,130,1956,2011-11-23
17581,10736.109999999991,452,5861,2011-12-09


In [0]:
# Import Spark's abs under an alias so it does not shadow Python's built-in
# abs() for every later cell; `col` stays in scope for later cells.
from pyspark.sql.functions import abs as spark_abs, col

# Replace total_spent and total_quantity with their absolute values so that
# negative entries become positive magnitudes.
# NOTE(review): this also flips net-negative totals (e.g. refunds, if the
# source contains them) to positive — confirm that is the intended cleaning rule.
customer_df = (
    df
    .withColumn("total_spent", spark_abs(col("total_spent")))
    .withColumn("total_quantity", spark_abs(col("total_quantity")))
)
display(customer_df.head(35))

CustomerID,total_spent,total_transactions,total_quantity,last_purchase_date
17420,598.8300000000002,30,265,2011-10-20
16552,379.73,17,219,2011-04-11
17572,226.75,12,95,2011-09-29
15350,115.65,5,51,2010-12-01
12921,16389.740000000016,741,9454,2011-12-06
13090,8689.390000000003,161,2194,2011-12-01
14135,4690.31,134,3850,2011-12-08
12915,363.65,22,93,2011-07-14
17685,3191.5299999999997,130,1956,2011-11-23
17581,10736.109999999991,452,5861,2011-12-09


## Creating Target Feature for this dataset

In [0]:
# Approximate 80th percentile of total_spent (relative error 0.01).
# Customers at or above this value are later labelled as high-value.
spend_quantiles = customer_df.approxQuantile("total_spent", [0.8], 0.01)
quantile_value = spend_quantiles[0]

In [0]:
from pyspark.sql.functions import when

# Binary target: 1 when total_spent reaches the quantile threshold, else 0.
# NOTE(review): the label is a direct function of total_spent; if total_spent
# is later used as a model feature it leaks the target — confirm.
high_value_flag = when(col("total_spent") >= quantile_value, 1).otherwise(0)
customer_df = customer_df.withColumn("is_high_valued", high_value_flag)

In [0]:
# Spot-check the new is_high_valued column on the first ten rows.
labelled_preview = customer_df.head(10)
display(labelled_preview)

CustomerID,total_spent,total_transactions,total_quantity,last_purchase_date,is_high_valued
17420,598.8300000000002,30,265,2011-10-20,0
16552,379.73,17,219,2011-04-11,0
17572,226.75,12,95,2011-09-29,0
15350,115.65,5,51,2010-12-01,0
12921,16389.740000000016,741,9454,2011-12-06,1
13090,8689.390000000003,161,2194,2011-12-01,1
14135,4690.31,134,3850,2011-12-08,1
12915,363.65,22,93,2011-07-14,0
17685,3191.5299999999997,130,1956,2011-11-23,1
17581,10736.109999999991,452,5861,2011-12-09,1


In [0]:
# Disabled setup step: creates the dai.phase2.silver volume if it does not exist yet.
# spark.sql("""
# create volume if not exists dai.phase2.silver
# """)

DataFrame[]

In [0]:
# Persist the labelled customer table as Delta, replacing any previous contents.
silver_path = "/Volumes/dai/phase2/silver/final_data"
customer_df.write.format("delta").mode("overwrite").save(silver_path)

---

### Check for `class imbalance` and split the data into training and test sets

In [0]:
# Read the labelled table back. It was written with format("delta"), so name
# the format explicitly instead of relying on the session's default data source.
df1 = spark.read.format("delta").load("/Volumes/dai/phase2/silver/final_data")
display(df1)

CustomerID,total_spent,total_transactions,total_quantity,last_purchase_date,is_high_valued
17420.0,598.8300000000002,30,265,2011-10-20,0
16552.0,379.73,17,219,2011-04-11,0
17572.0,226.75,12,95,2011-09-29,0
15350.0,115.65,5,51,2010-12-01,0
12921.0,16389.740000000016,741,9454,2011-12-06,1
13090.0,8689.390000000003,161,2194,2011-12-01,1
14135.0,4690.31,134,3850,2011-12-08,1
12915.0,363.65,22,93,2011-07-14,0
17685.0,3191.5299999999997,130,1956,2011-11-23,1
17581.0,10736.109999999991,452,5861,2011-12-09,1


In [0]:
# Row count per target class, to check how imbalanced the label is.
class_counts = df1.groupBy("is_high_valued").count()
class_counts.show()

+--------------+-----+
|is_high_valued|count|
+--------------+-----+
|             1|  904|
|             0| 3469|
+--------------+-----+



In [0]:
# groupBy() gives no guarantee about row order, so unpacking collect()
# positionally could silently swap the two classes. Look each row up by its
# label instead: 1 = high-valued (the minority here), 0 = the rest (majority).
rows_by_label = {
    row["is_high_valued"]: row
    for row in df1.groupBy("is_high_valued").count().collect()
}
minority, majority = rows_by_label[1], rows_by_label[0]
print(minority['count'],majority['count'])

904 3469


In [0]:
# (row count, column count) of the labelled table.
n_rows = df1.count()
n_cols = len(df1.columns)
n_rows, n_cols

(4373, 6)

---

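Since the data is imbalanced (904 high-value customers vs. 3,469 others), we need to compensate for the under-represented minority class. Instead of oversampling it, we assign `class weights` inversely proportional to class frequency, $w_c = N / (2 \cdot n_c)$, where $N$ is the total row count and $n_c$ is the number of rows in class $c$. Because no minority rows are duplicated, the model is less likely to overfit to them during training.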

---

In [0]:
from pyspark.sql.functions import when, col

# Count rows per label, keyed by the label value itself, so each weight is tied
# to the class it is applied to rather than to the row order of a collected
# result from another cell.
label_counts = {
    row["is_high_valued"]: row["count"]
    for row in df1.groupBy("is_high_valued").count().collect()
}
count_0 = label_counts[0]
count_1 = label_counts[1]
total = count_0 + count_1

# Balanced class weights: total / (number_of_classes * class_count), with two
# classes. The rarer class receives the larger weight.
weight_0 = total / (2 * count_0)
weight_1 = total / (2 * count_1)

# Attach the per-row weight that matches each row's label.
df1 = df1.withColumn(
    "class_weight",
    when(col("is_high_valued") == 1, weight_1).otherwise(weight_0)
)

In [0]:
# Spot-check the class_weight column on the first ten rows.
weighted_preview = df1.head(10)
display(weighted_preview)

CustomerID,total_spent,total_transactions,total_quantity,last_purchase_date,is_high_valued,class_weight
17420,598.8300000000002,30,265,2011-10-20,0,0.6302969155376189
16552,379.73,17,219,2011-04-11,0,0.6302969155376189
17572,226.75,12,95,2011-09-29,0,0.6302969155376189
15350,115.65,5,51,2010-12-01,0,0.6302969155376189
12921,16389.740000000016,741,9454,2011-12-06,1,2.418694690265487
13090,8689.390000000003,161,2194,2011-12-01,1,2.418694690265487
14135,4690.31,134,3850,2011-12-08,1,2.418694690265487
12915,363.65,22,93,2011-07-14,0,0.6302969155376189
17685,3191.5299999999997,130,1956,2011-11-23,1,2.418694690265487
17581,10736.109999999991,452,5861,2011-12-09,1,2.418694690265487


In [0]:
# Persist the weighted table as Delta, replacing any previous contents.
weighted_path = "/Volumes/dai/phase2/silver/final_data_with_weights"
df1.write.format("delta").mode("overwrite").save(weighted_path)

In [0]:
# Random 80/20 train/test split with a fixed seed.
# NOTE(review): randomSplit is not stratified, so the class ratio may differ
# slightly between the two splits — confirm this is acceptable.
split_fractions = [0.8, 0.2]
train_df, test_df = df1.randomSplit(split_fractions, seed=42)

In [0]:
# Report how many rows landed in each split.
train_rows = train_df.count()
print("Train count:", train_rows)
test_rows = test_df.count()
print("Test count:", test_rows)

Train count: 3485
Test count: 888


In [0]:
# Class balance within each split: train first, then test.
for split_df in (train_df, test_df):
    split_df.groupBy("is_high_valued").count().show()

+--------------+-----+
|is_high_valued|count|
+--------------+-----+
|             1|  733|
|             0| 2752|
+--------------+-----+

+--------------+-----+
|is_high_valued|count|
+--------------+-----+
|             1|  171|
|             0|  717|
+--------------+-----+

