In [1]:
filePath = "/FileStore/tables/GroupData/" #put your own file path if necessary


def _load_group_csv(file_name):
    """Load one of the project CSV files from ``filePath`` as a Spark DataFrame.

    The file is read as a comma-delimited CSV whose first row is the header,
    with column types inferred by Spark.

    Parameters:
        file_name: name of the CSV file, relative to ``filePath``.

    Returns:
        The DataFrame produced by ``spark.read...load``.
    """
    # NOTE: the Complaints reader used to carry .option("0", "NA"). "0" is not a
    # CSV reader option, so it had no effect and is dropped here. A real
    # nullValue option is deliberately NOT set: the cleaning cell further down
    # compares columns against the literal string "NA".
    return (
        spark.read
        .format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .option("delimiter", ",")
        .load(filePath + file_name)
    )


#Importing the files one by one
Complaints = _load_group_csv("BDT2_1920_Complaints.csv")
Delivery = _load_group_csv("BDT2_1920_Delivery.csv")
Subscriptions = _load_group_csv("BDT2_1920_Subscriptions.csv")
Customers = _load_group_csv("BDT2_1920_Customers.csv")
Formula = _load_group_csv("BDT2_1920_Formula.csv")

In [2]:
from pyspark.sql.functions import *

# Replacing the literal "NA" marker with a meaningful value where possible:
#   - an unknown ID takes the value 0
#   - an "NA" in a text column stays "NA", or becomes "no response" /
#     "no solution", depending on the context
# (The original notes also planned 999 for unknown numeric quantities; no
#  replacement in this cell uses that value.)

# --- Complaints: (column, value substituted for "NA") ---
complaint_na_fills = [
    ("ProductID", 0),
    ("ProductName", "NA"),
    ("FeedbackTypeID", 0),
    ("FeedbackTypeDesc", "no response"),
    ("SolutionTypeID", 0),
    ("SolutionTypeDesc", "no solution"),
]
for column_name, fill_value in complaint_na_fills:
    current = col(column_name)
    Complaints = Complaints.withColumn(
        column_name,
        when(current == "NA", fill_value).otherwise(current),
    )

# --- Delivery: null DeliveryClass values become the string "NA" ---
Delivery = Delivery.fillna("NA", subset=["DeliveryClass"])

# --- Subscriptions ---
# An "NA" in NbrMeals_EXCEP is replaced by a fixed value chosen from the row's
# NbrMeals_REG. Per the original note, each value is the mean NbrMeals_EXCEP
# ordered within that NbrMeals_REG category.
# Pairs are (NbrMeals_REG, replacement for NbrMeals_EXCEP).
excep_fill_by_reg = [(76, 12), (304, 13), (329, 17), (152, 13)]
for reg_meals, excep_fill in excep_fill_by_reg:
    is_missing = (col("NbrMeals_EXCEP") == "NA") & (col("NbrMeals_REG") == reg_meals)
    Subscriptions = Subscriptions.withColumn(
        "NbrMeals_EXCEP",
        when(is_missing, excep_fill).otherwise(col("NbrMeals_EXCEP")),
    )

# RenewalDate becomes a 0/1 flag: 0 when it was "NA", 1 otherwise, so summing it
# per client gives the number of renewals (6 renewals -> 6).
Subscriptions = Subscriptions.withColumn(
    "RenewalDate",
    when(col("RenewalDate") == "NA", 0).otherwise(1),
)

# Not handled yet:
#   PaymentDate is redundant with PaymentStatus.
#   GrossFormulaPrice, NetFormulaPrice, NbrMealsPrice, ProductDiscount,
#   FormulaDiscount, TotalDiscount, TotalPrice and TotalCredit are all
#   codependent. Maybe after grouping the NbrMeals_REG/EXCEP features, their
#   NAs can be replaced by the mean of the category they belong to.



In [3]:
# Complaints: cast the cleaned ID columns to integers.
for id_column in ("ProductID", "SolutionTypeID", "FeedbackTypeID"):
    Complaints = Complaints.withColumn(id_column, col(id_column).cast("integer"))

# Subscriptions: cast the cleaned meal count and the renewal flag to integers.
for int_column in ("NbrMeals_EXCEP", "RenewalDate"):
    Subscriptions = Subscriptions.withColumn(int_column, col(int_column).cast("integer"))

# Converting timestamps to a number of days: cast to long, then divide by the
# number of seconds in a day.
seconds_per_day = 86400
for date_column in ("EndDate", "StartDate"):
    Subscriptions = Subscriptions.withColumn(
        date_column,
        col(date_column).cast("long") / seconds_per_day,
    )
Complaints = Complaints.withColumn(
    "ComplaintDate",
    col("ComplaintDate").cast("long") / seconds_per_day,
)

In [4]:
# Subscriptions: derived features.
#   SubscriptionDuration = EndDate - StartDate (both expressed in days by the
#   cast cell).
#   NbrMealsPerDay = NbrMeals_REG / SubscriptionDuration, except that a zero
#   duration yields NbrMeals_REG itself instead of dividing by zero.
duration = col("SubscriptionDuration")
regular_meals = col("NbrMeals_REG")
Subscriptions = (
    Subscriptions
    .withColumn("SubscriptionDuration", col("EndDate") - col("StartDate"))
    .withColumn(
        "NbrMealsPerDay",
        when(duration == 0, regular_meals).otherwise(regular_meals / duration),
    )
)

In [5]:
# Show the Subscriptions DataFrame, including the derived SubscriptionDuration
# and NbrMealsPerDay columns.
display(Subscriptions)

SubscriptionID,CustomerID,StartDate,EndDate,NbrMeals_REG,NbrMeals_EXCEP,RenewalDate,PaymentType,PaymentStatus,PaymentDate,FormulaID,GrossFormulaPrice,NetFormulaPrice,NbrMealsPrice,ProductDiscount,FormulaDiscount,TotalDiscount,TotalPrice,TotalCredit,ProductName,SubscriptionDuration,NbrMealsPerDay
627529,775138,17135.0,17225.0,76,10,1,BT,Paid,2016-12-01,919,1480.0,1480.0,19.47368,0.0,0.0,0.0,1480.0,0.0,Custom Events,90.0,0.8444444444444444
637001,194809,17039.0,17227.0,152,25,1,BT,Paid,2016-08-22,4192,2760.0,1760.0,11.57894,0.0,1000.0,1000.0,1760.0,0.0,Custom Events,188.0,0.8085106382978723
1238870,654824,17860.0,17950.0,76,10,1,BT,Paid,2018-11-09,10961,1580.0,1580.0,20.78948,0.0,0.0,0.0,1580.0,0.0,Custom Events,90.0,0.8444444444444444
315743,626815,16801.0,17165.0,304,10,1,BT,Paid,2015-12-18,896,4980.0,4980.0,16.38158,0.0,0.0,0.0,4980.0,0.0,Custom Events,364.0,0.8351648351648352
1176762,1016426,17760.0,17788.0,25,25,0,BT,Paid,2018-08-13,12867,540.0,300.0,12.0,0.0,240.0,240.0,300.0,0.0,Custom Events,28.0,0.8928571428571429
916472,871676,17491.0,17581.0,76,10,1,BT,Paid,2017-11-20,5100,1540.0,1540.0,20.26316,0.0,0.0,0.0,1540.0,0.0,Custom Events,90.0,0.8444444444444444
646275,655981,17099.0,17130.0,25,10,1,DD,Paid,2016-10-15,924,520.0,520.0,20.8,0.0,0.0,0.0,520.0,0.0,Custom Events,31.0,0.8064516129032258
752611,704300,17230.0,17258.0,25,25,1,BT,Paid,2017-03-24,5389,458.0,458.0,18.32,0.0,0.0,0.0,458.0,0.0,Custom Events,28.0,0.8928571428571429
1079202,684448,17652.0,17684.0,25,25,1,DD,Paid,2018-04-21,9466,472.0,472.0,18.88,0.0,0.0,0.0,472.0,0.0,Custom Events,32.0,0.78125
669473,276941,17122.0,17487.0,304,25,1,BT,Paid,2016-11-24,891,4980.0,4980.0,16.38158,0.0,0.0,0.0,4980.0,0.0,Custom Events,365.0,0.8328767123287671


In [6]:
# Register Subscriptions as the temp view "subscriptions" so it can be queried
# with spark.sql.
Subscriptions.createOrReplaceTempView("subscriptions")

In [7]:
# Aggregate the "subscriptions" view to one row per CustomerID.
# The adjacent literals below concatenate into a single SQL statement.
subscription_summary_sql = (
    "select CustomerID, "
    "sum(NbrMeals_REG) as TotalMeal_REG, "
    "avg(NbrMeals_REG) as MeanMeal_REGPerSub, "
    "sum(NbrMeals_EXCEP) as TotalMeal_EXCEP, "
    "avg(NbrMeals_EXCEP) as MeanMeal_EXCEPPerSub, "
    "min(StartDate) as FirstSubDate, "
    "max(EndDate) as EndOfLastSub, "
    "(max(EndDate)-min(StartDate)) as HasBeenClientForXDays,"
    "count(SubscriptionID) as NbrSub, "
    "SUM(CASE WHEN PaymentStatus='Paid' THEN 1 ELSE 0 END) as SubPaid, "
    "SUM(CASE WHEN PaymentStatus='Not Paid' THEN 1 ELSE 0 END) as SubNotPaid, "
    "SUM(CASE WHEN PaymentStatus='Paid' THEN 1 ELSE 0 END)/count(SubscriptionID) as ProportionPaidSub,"
    "avg(NbrMealsPrice) as AvgPricePerMeal, "
    "sum(ProductDiscount) as TotalProductDiscount, "
    "sum(FormulaDiscount) as TotalFormulaDiscount, "
    "sum(TotalDiscount) as TotalDiscount, "
    "sum(TotalPrice) as TotalPrice, "
    "sum(TotalCredit) as TotalCredit,"
    "sum(SubscriptionDuration) as NbrDaysSub, "
    "avg(SubscriptionDuration) as AvgDurationPerSub, "
    "avg(NbrMealsPerDay) as AverageNbrMealPerDay, "
    "SUM(CASE WHEN ProductName='Custom Events' THEN 1 ELSE 0 END) as NbrCustomEventsProduct, "
    "SUM(CASE WHEN ProductName!='Custom Events' THEN 1 ELSE 0 END) as NbrGrubProduct "
    "from subscriptions group by CustomerID"
)
SubInter = spark.sql(subscription_summary_sql)

# StartDate/EndDate were stored as day counts; multiply back to seconds and
# cast to get timestamps again.
for day_column in ("FirstSubDate", "EndOfLastSub"):
    SubInter = SubInter.withColumn(day_column, (col(day_column) * 86400).cast("timestamp"))

# Creating the churn dependent variables: 0 when the last subscription ends
# after the cutoff, 1 otherwise.
churn_cutoffs = [
    ("ChurnedAt03/02/2019", "2019-02-02 00:00:00"),
    ("ChurnedAt03/08/2018", "2018-08-02 00:00:00"),
    ("ChurnedAt03/02/2018", "2018-02-02 00:00:00"),
]
for churn_column, cutoff in churn_cutoffs:
    SubInter = SubInter.withColumn(
        churn_column,
        when(col("EndOfLastSub") > cutoff, 0).otherwise(1),
    )

display(SubInter)

CustomerID,TotalMeal_REG,MeanMeal_REGPerSub,TotalMeal_EXCEP,MeanMeal_EXCEPPerSub,FirstSubDate,EndOfLastSub,HasBeenClientForXDays,NbrSub,SubPaid,SubNotPaid,ProportionPaidSub,AvgPricePerMeal,TotalProductDiscount,TotalFormulaDiscount,TotalDiscount,TotalPrice,TotalCredit,NbrDaysSub,AvgDurationPerSub,AverageNbrMealPerDay,NbrCustomEventsProduct,NbrGrubProduct,ChurnedAt03/02/2019,ChurnedAt03/08/2018,ChurnedAt03/02/2018
258487,785,196.25,55,13.75,2016-05-02T00:00:00.000+0000,2018-08-31T00:00:00.000+0000,851.0,4,3,1,0.75,17.216646666666666,0.0,409.6,409.6,11640.0,0.0,849.0,212.25,19.628179522497707,4,0,1,0,0
671995,1254,25.08,677,13.54,2015-01-02T00:00:00.000+0000,2019-02-14T00:00:00.000+0000,1504.0,50,50,0,1.0,20.5868036,0.0,0.0,0.0,25800.0,0.0,1452.0,29.04,0.8644984727298408,0,50,0,0,0
285977,1519,303.8,70,14.0,2014-01-04T00:00:00.000+0000,2019-02-07T00:00:00.000+0000,1860.0,5,5,0,1.0,15.705696,0.0,0.0,0.0,23858.2,-721.8000000000001,1856.0,371.2,0.818706760989113,0,5,0,0,0
682942,1519,303.8,85,17.0,2014-02-01T00:00:00.000+0000,2019-01-31T00:00:00.000+0000,1825.0,5,5,0,1.0,16.180996,0.0,0.0,0.0,24580.0,0.0,1821.0,364.2,0.834157760048171,0,5,1,0,0
104880,1216,76.0,235,14.6875,2015-01-02T00:00:00.000+0000,2018-12-30T00:00:00.000+0000,1458.0,16,16,0,1.0,19.6064425,0.0,0.0,0.0,23840.0,0.0,1441.0,90.0625,0.8439920836606536,0,16,1,0,0
965578,1216,304.0,55,13.75,2014-12-19T00:00:00.000+0000,2018-12-17T00:00:00.000+0000,1459.0,4,4,0,1.0,15.62171,0.0,386.6,386.6,18996.0,0.0,1456.0,364.0,0.8351648351648352,0,4,1,0,0
829912,684,114.0,65,10.833333333333334,2016-03-19T00:00:00.000+0000,2017-12-31T00:00:00.000+0000,652.0,6,5,1,0.8333333333333334,14.713593333333334,0.0,787.11578,787.11578,9108.8,0.0,647.0,107.83333333333331,2.812704198909744,0,6,1,1,1
673836,1216,304.0,55,13.75,2015-01-02T00:00:00.000+0000,2019-01-03T00:00:00.000+0000,1462.0,4,4,0,1.0,16.41447,0.0,0.0,0.0,19960.0,-40.0,1459.0,364.75,0.8334705478974254,3,1,1,0,0
659301,1218,304.5,60,15.0,2014-06-23T00:00:00.000+0000,2018-06-24T00:00:00.000+0000,1462.0,4,4,0,1.0,15.961825,0.0,0.0,0.0,19440.0,0.0,1458.0,364.5,0.8353925184404636,0,4,1,1,0
1012153,304,304.0,25,25.0,2017-03-02T00:00:00.000+0000,2018-03-01T00:00:00.000+0000,364.0,1,1,0,1.0,0.0,5180.0,0.0,5180.0,0.0,0.0,364.0,364.0,0.8351648351648352,0,1,1,1,0


In [8]:
# Register Complaints as the temp view "complaints" so it can be queried with
# spark.sql.
Complaints.createOrReplaceTempView("complaints")

In [9]:
def _count_where(column, value, alias):
    """Return a SQL select item counting the rows where ``column`` equals ``value``.

    Parameters:
        column: name of the column to test.
        value: integer the column is compared with.
        alias: name given to the resulting count column.

    Returns:
        A ``SUM(CASE WHEN ... END) as alias`` fragment as a string.
    """
    return "SUM(CASE WHEN {0}={1} THEN 1 ELSE 0 END) as {2}".format(column, value, alias)


# One counter per known ID, plus an "Unknown" counter for the ID 0.
# Order matters: it fixes the column order of Intermediary.
_complaint_counters = (
    [("ProductID", i, "NbrComplaintsProduct{}".format(i)) for i in range(1, 9)]
    + [("ProductID", 0, "NbrComplaintsProductUnknown")]
    + [("ComplaintTypeID", i, "NbrComplaintsType{}".format(i)) for i in range(1, 10)]
    + [("ComplaintTypeID", 0, "NbrComplaintsTypeUnknown")]
    + [("SolutionTypeID", i, "NbrSolutionsType{}".format(i)) for i in range(1, 5)]
    + [("SolutionTypeID", 0, "NbrSolutionsTypeUnknown")]
)

# Aggregate the "complaints" view to one row per CustomerID.
# The query is assembled from short literals: the previous single double-quoted
# literal was broken across two physical lines, which Python rejects.
#
# ComplaintsPerMonth: the ratio is now also guarded by max > min, so several
# complaints filed on the same day give 0 instead of dividing by zero.
# NOTE(review): ComplaintDate is a day count at this point, so this ratio is
# complaints per DAY despite the column name. The name and values are kept
# as-is for downstream compatibility - confirm the intended unit.
_complaint_summary_sql = (
    "select CustomerID, "
    "count(ComplaintID) as NbrComplaints, "
    "max(ComplaintDate) as LastComplaint, "
    "min(ComplaintDate) as FirstComplaint, "
    "(CASE WHEN count(ComplaintID)>1 AND max(ComplaintDate)>min(ComplaintDate) "
    "THEN (count(ComplaintID)/(max(ComplaintDate)-min(ComplaintDate))) "
    "ELSE 0 END) as ComplaintsPerMonth, "
    + ", ".join(_count_where(*spec) for spec in _complaint_counters)
    + " from complaints group by CustomerID"
)
Intermediary = spark.sql(_complaint_summary_sql)

# ComplaintDate was stored as a day count; multiply back to seconds and cast to
# get timestamps again.
for _day_column in ("FirstComplaint", "LastComplaint"):
    Intermediary = Intermediary.withColumn(
        _day_column,
        (Intermediary[_day_column] * 86400).cast("timestamp"),
    )

display(Intermediary)

CustomerID,NbrComplaints,LastComplaint,FirstComplaint,ComplaintsPerMonth,NbrComplaintsProduct1,NbrComplaintsProduct2,NbrComplaintsProduct3,NbrComplaintsProduct4,NbrComplaintsProduct5,NbrComplaintsProduct6,NbrComplaintsProduct7,NbrComplaintsProduct8,NbrComplaintsProductUnknown,NbrComplaintsType1,NbrComplaintsType2,NbrComplaintsType3,NbrComplaintsType4,NbrComplaintsType5,NbrComplaintsType6,NbrComplaintsType7,NbrComplaintsType8,NbrComplaintsType9,NbrComplaintsTypeUnknown,NbrSolutionsType1,NbrSolutionsType2,NbrSolutionsType3,NbrSolutionsType4,NbrSolutionsTypeUnknown
285977,7,2018-11-27T00:00:00.000+0000,2014-09-18T00:00:00.000+0000,0.0045721750489875,0,0,0,0,0,7,0,0,0,4,1,1,0,0,0,0,0,1,0,1,0,1,0,5
671995,2,2014-10-14T00:00:00.000+0000,2014-03-25T00:00:00.000+0000,0.0098522167487684,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2
466728,1,2016-12-15T00:00:00.000+0000,2016-12-15T00:00:00.000+0000,0.0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1012153,3,2018-08-07T00:00:00.000+0000,2018-08-02T00:00:00.000+0000,0.6,0,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,1,1,1,0,0
673836,29,2018-12-24T00:00:00.000+0000,2013-11-14T00:00:00.000+0000,0.0155412647374062,0,0,0,21,0,0,0,0,8,18,2,0,0,7,0,1,0,1,0,2,2,4,0,21
865501,1,2016-02-04T00:00:00.000+0000,2016-02-04T00:00:00.000+0000,0.0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
802664,1,2013-06-21T00:00:00.000+0000,2013-06-21T00:00:00.000+0000,0.0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
204587,6,2017-02-19T00:00:00.000+0000,2015-12-14T00:00:00.000+0000,0.0138568129330254,0,0,0,0,0,0,0,6,0,6,0,0,0,0,0,0,0,0,0,6,0,0,0,0
462878,2,2017-03-02T00:00:00.000+0000,2012-04-08T00:00:00.000+0000,0.0011179429849077,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1
67278,2,2018-11-08T00:00:00.000+0000,2018-07-19T00:00:00.000+0000,0.0178571428571428,0,0,0,0,0,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,2,0,0,0


In [10]:
# Show SubInter again; the cell that builds it already ends with this same call.
display(SubInter)

CustomerID,TotalMeal_REG,MeanMeal_REGPerSub,TotalMeal_EXCEP,MeanMeal_EXCEPPerSub,FirstSubDate,EndOfLastSub,HasBeenClientForXDays,NbrSub,SubPaid,SubNotPaid,ProportionPaidSub,AvgPricePerMeal,TotalProductDiscount,TotalFormulaDiscount,TotalDiscount,TotalPrice,TotalCredit,NbrDaysSub,AvgDurationPerSub,AverageNbrMealPerDay,NbrCustomEventsProduct,NbrGrubProduct,ChurnedAt03/02/2019,ChurnedAt03/08/2018,ChurnedAt03/02/2018
258487,785,196.25,55,13.75,2016-05-02T00:00:00.000+0000,2018-08-31T00:00:00.000+0000,851.0,4,3,1,0.75,17.216646666666666,0.0,409.6,409.6,11640.0,0.0,849.0,212.25,19.628179522497707,4,0,1,0,0
671995,1254,25.08,677,13.54,2015-01-02T00:00:00.000+0000,2019-02-14T00:00:00.000+0000,1504.0,50,50,0,1.0,20.5868036,0.0,0.0,0.0,25800.0,0.0,1452.0,29.04,0.8644984727298408,0,50,0,0,0
285977,1519,303.8,70,14.0,2014-01-04T00:00:00.000+0000,2019-02-07T00:00:00.000+0000,1860.0,5,5,0,1.0,15.705696,0.0,0.0,0.0,23858.2,-721.8000000000001,1856.0,371.2,0.818706760989113,0,5,0,0,0
682942,1519,303.8,85,17.0,2014-02-01T00:00:00.000+0000,2019-01-31T00:00:00.000+0000,1825.0,5,5,0,1.0,16.180996,0.0,0.0,0.0,24580.0,0.0,1821.0,364.2,0.834157760048171,0,5,1,0,0
104880,1216,76.0,235,14.6875,2015-01-02T00:00:00.000+0000,2018-12-30T00:00:00.000+0000,1458.0,16,16,0,1.0,19.6064425,0.0,0.0,0.0,23840.0,0.0,1441.0,90.0625,0.8439920836606536,0,16,1,0,0
965578,1216,304.0,55,13.75,2014-12-19T00:00:00.000+0000,2018-12-17T00:00:00.000+0000,1459.0,4,4,0,1.0,15.62171,0.0,386.6,386.6,18996.0,0.0,1456.0,364.0,0.8351648351648352,0,4,1,0,0
829912,684,114.0,65,10.833333333333334,2016-03-19T00:00:00.000+0000,2017-12-31T00:00:00.000+0000,652.0,6,5,1,0.8333333333333334,14.713593333333334,0.0,787.11578,787.11578,9108.8,0.0,647.0,107.83333333333331,2.812704198909744,0,6,1,1,1
673836,1216,304.0,55,13.75,2015-01-02T00:00:00.000+0000,2019-01-03T00:00:00.000+0000,1462.0,4,4,0,1.0,16.41447,0.0,0.0,0.0,19960.0,-40.0,1459.0,364.75,0.8334705478974254,3,1,1,0,0
659301,1218,304.5,60,15.0,2014-06-23T00:00:00.000+0000,2018-06-24T00:00:00.000+0000,1462.0,4,4,0,1.0,15.961825,0.0,0.0,0.0,19440.0,0.0,1458.0,364.5,0.8353925184404636,0,4,1,1,0
1012153,304,304.0,25,25.0,2017-03-02T00:00:00.000+0000,2018-03-01T00:00:00.000+0000,364.0,1,1,0,1.0,0.0,5180.0,0.0,5180.0,0.0,0.0,364.0,364.0,0.8351648351648352,0,1,1,1,0


In [11]:
# Base table: customer attributes combined with the per-customer complaint
# aggregates (Intermediary) and subscription aggregates (SubInter).
customer_key = ['CustomerID']

# Full joins keep every CustomerID present on either side.
base = Customers.join(Intermediary, on=customer_key, how='full')

# fillna(0) is applied to the whole joined table, not only to the complaint
# columns.
# NOTE(review): this also sets the churn flags to 0 for rows that had no match
# in SubInter - confirm that is intended.
base1 = base.join(SubInter, on=customer_key, how='full').fillna(0)

In [12]:
# Show the joined base table.
display(base1)

CustomerID,Region,StreetID,NbrComplaints,LastComplaint,FirstComplaint,ComplaintsPerMonth,NbrComplaintsProduct1,NbrComplaintsProduct2,NbrComplaintsProduct3,NbrComplaintsProduct4,NbrComplaintsProduct5,NbrComplaintsProduct6,NbrComplaintsProduct7,NbrComplaintsProduct8,NbrComplaintsProductUnknown,NbrComplaintsType1,NbrComplaintsType2,NbrComplaintsType3,NbrComplaintsType4,NbrComplaintsType5,NbrComplaintsType6,NbrComplaintsType7,NbrComplaintsType8,NbrComplaintsType9,NbrComplaintsTypeUnknown,NbrSolutionsType1,NbrSolutionsType2,NbrSolutionsType3,NbrSolutionsType4,NbrSolutionsTypeUnknown,TotalMeal_REG,MeanMeal_REGPerSub,TotalMeal_EXCEP,MeanMeal_EXCEPPerSub,FirstSubDate,EndOfLastSub,HasBeenClientForXDays,NbrSub,SubPaid,SubNotPaid,ProportionPaidSub,AvgPricePerMeal,TotalProductDiscount,TotalFormulaDiscount,TotalDiscount,TotalPrice,TotalCredit,NbrDaysSub,AvgDurationPerSub,AverageNbrMealPerDay,NbrCustomEventsProduct,NbrGrubProduct,ChurnedAt03/02/2019,ChurnedAt03/08/2018,ChurnedAt03/02/2018
104880,5,45805,0,,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1216,76.0,235,14.6875,2015-01-02T00:00:00.000+0000,2018-12-30T00:00:00.000+0000,1458.0,16,16,0,1.0,19.6064425,0.0,0.0,0.0,23840.0,0.0,1441.0,90.0625,0.8439920836606536,0,16,1,0,0
258487,1,14628,0,,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,785,196.25,55,13.75,2016-05-02T00:00:00.000+0000,2018-08-31T00:00:00.000+0000,851.0,4,3,1,0.75,17.216646666666666,0.0,409.6,409.6,11640.0,0.0,849.0,212.25,19.628179522497707,4,0,1,0,0
285977,1,18415,7,2018-11-27T00:00:00.000+0000,2014-09-18T00:00:00.000+0000,0.0045721750489875,0,0,0,0,0,7,0,0,0,4,1,1,0,0,0,0,0,1,0,1,0,1,0,5,1519,303.8,70,14.0,2014-01-04T00:00:00.000+0000,2019-02-07T00:00:00.000+0000,1860.0,5,5,0,1.0,15.705696,0.0,0.0,0.0,23858.2,-721.8000000000001,1856.0,371.2,0.818706760989113,0,5,0,0,0
671995,1,28929,2,2014-10-14T00:00:00.000+0000,2014-03-25T00:00:00.000+0000,0.0098522167487684,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1254,25.08,677,13.54,2015-01-02T00:00:00.000+0000,2019-02-14T00:00:00.000+0000,1504.0,50,50,0,1.0,20.5868036,0.0,0.0,0.0,25800.0,0.0,1452.0,29.04,0.8644984727298408,0,50,0,0,0
682942,1,18048,0,,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1519,303.8,85,17.0,2014-02-01T00:00:00.000+0000,2019-01-31T00:00:00.000+0000,1825.0,5,5,0,1.0,16.180996,0.0,0.0,0.0,24580.0,0.0,1821.0,364.2,0.834157760048171,0,5,1,0,0
829912,5,40317,0,,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,684,114.0,65,10.833333333333334,2016-03-19T00:00:00.000+0000,2017-12-31T00:00:00.000+0000,652.0,6,5,1,0.8333333333333334,14.713593333333334,0.0,787.11578,787.11578,9108.8,0.0,647.0,107.83333333333331,2.812704198909744,0,6,1,1,1
965578,5,45860,0,,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1216,304.0,55,13.75,2014-12-19T00:00:00.000+0000,2018-12-17T00:00:00.000+0000,1459.0,4,4,0,1.0,15.62171,0.0,386.6,386.6,18996.0,0.0,1456.0,364.0,0.8351648351648352,0,4,1,0,0
75070,5,43993,0,,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,912,304.0,45,15.0,2015-01-02T00:00:00.000+0000,2017-06-29T00:00:00.000+0000,909.0,3,3,0,1.0,16.29386,0.0,0.0,0.0,14860.0,0.0,907.0,302.3333333333333,1.1205064295973386,0,3,1,1,1
107896,5,46836,0,,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1216,304.0,55,13.75,2015-01-02T00:00:00.000+0000,2018-12-30T00:00:00.000+0000,1458.0,4,4,0,1.0,16.61184,0.0,0.0,0.0,20200.0,0.0,1455.0,363.75,0.8357400175581995,0,4,1,0,0
158050,5,41138,0,,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1216,304.0,55,13.75,2015-01-02T00:00:00.000+0000,2018-01-20T00:00:00.000+0000,1114.0,4,3,1,0.75,16.575655,0.0,0.0,0.0,14816.0,-44.0,1111.0,277.75,4.426948808766991,0,4,1,1,1
