### Analyze the dataset and come up with some insights that will help us to predict both requested and successful appointments
### How would you recommend that we can leverage this data to improve our products or services?

In [2]:
from pyspark.sql.types import *
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.ml.feature import IndexToString, StringIndexer

In [3]:
# Where the sample data lives and which reader to use
file_location = "/FileStore/tables/DS_Sample_Data-c561c.csv"
file_type = "csv"

# Reader settings: schema inference off, first row is the header, comma separated
infer_schema = "false"
first_row_is_header = "True"
delimiter = ","

# These options are CSV-specific; other file types ignore them.
df = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

display(df)

source,campaign,region,locality,preferred_doctor_gender,dental_anxiety,last_visited_a_dentist,how_soon_they_want_to_book,reason_for_visit,patient_insured,number_of_match_results,out_network_results_shown,in_network_results_shown,avg_distance_overall,appointment_requested,appointment_status,appointment_scheduled,appointment_billed,chosen_provider_insurance_status,chosen_provider_distance
facebook,san_diego_dentist_patient_acquisition,San Jose,San Jose,No Pref,A little nervous,< 1 year,ASAP,Specific Treatment,f,9,0,0,10547.21951,f,,f,,,
facebook,san_diego_dentist_patient_acquisition,San Diego,Chula Vista,F,Not at all nervous,2+ years,2 weeks,Checkup & Cleaning,f,2,0,0,20784.61772,f,,f,,,
try.opencare.com,Unknown,Austin,Austin,No Pref,Moderately nervous,2+ years,ASAP,Checkup & Cleaning,t,9,8,1,8014.945931,t,CANCELLED,f,f,in-network,2855.594821
facebook,denver_dentist_patient_acquisition,Denver,Golden,No Pref,Not at all nervous,< 1 year,ASAP,Specific Treatment,t,8,2,6,16651.15875,f,,f,,,
facebook,chicago_dentist_patient_acquisition,Chicago,Chicago,F,Not at all nervous,< 1 year,1 week,Checkup & Cleaning,f,9,0,0,5216.934559,f,,f,,,
Direct,Unknown,San Francisco,San Francisco,No Pref,Moderately nervous,2+ years,ASAP,Urgent Issue,t,9,8,1,2150.607797,f,,f,,,
facebook,chicago_dentist_patient_acquisition,Chicago,Hanover Park,F,Not at all nervous,1 - 2 years,2 weeks,Checkup & Cleaning,t,3,2,1,21242.31032,f,,f,,,
Direct,Unknown,San Francisco,San Francisco,No Pref,Moderately nervous,2+ years,ASAP,Urgent Issue,t,9,5,4,2150.607797,f,,f,,,
www.opencare.com,Unknown,Seattle,Renton,No Pref,Not at all nervous,1 - 2 years,No preference,Checkup & Cleaning,t,9,3,6,9143.70618,f,,f,,,
adwords,search_chicago_local,Chicago,Ottawa,No Pref,A little nervous,< 1 year,ASAP,Urgent Issue,t,0,0,0,,f,,f,,,


In [4]:
# Row count per appointment_requested value, plus each count as a percentage of all rows.
# The percentage is computed with built-in column arithmetic because the `percentage`
# UDF is only defined in a later cell, so referencing it here fails on a fresh kernel.
# The cast to float keeps the same single-precision values the UDF would produce.
display(
    df.groupBy("appointment_requested")
      .count()
      .withColumn('%', (col('count') * 100 / df.count()).cast('float'))
)

appointment_requested,count,%
f,9148,76.66136
t,2785,23.33864


In [5]:
# Row count per appointment_billed value (including nulls), plus each count as a
# percentage of all rows.
# The percentage is computed with built-in column arithmetic because the `percentage`
# UDF is only defined in a later cell, so referencing it here fails on a fresh kernel.
# The cast to float keeps the same single-precision values the UDF would produce.
display(
    df.groupBy("appointment_billed")
      .count()
      .withColumn('%', (col('count') * 100 / df.count()).cast('float'))
)

appointment_billed,count,%
,9153,76.70326
f,1574,13.190312
t,1206,10.106427


In [6]:
# Total number of rows in the dataset: the denominator for every share computed below.
# Derived from the data instead of being hard-coded so the shares stay correct
# if the input file changes.
total_rows = df.count()

# UDF turning a group count into its percentage of all rows (single-precision float).
percentage = udf(lambda x: x*100/total_rows, FloatType())

# Counts for every (requested, status, billed) combination, with their share of all rows.
nested_groupby = df.groupby('appointment_requested', 'appointment_status' , 'appointment_billed').count()
nested_groupby_percentage = nested_groupby.withColumn('%' , percentage('count'))
display(nested_groupby_percentage)

appointment_requested,appointment_status,appointment_billed,count,%
t,SCHEDULED,f,1,0.0083801225
t,CANCELLED,,1,0.0083801225
t,REQUESTED,t,34,0.28492415
f,,,9148,76.66136
t,SCHEDULED,t,8,0.06704098
t,REQUESTED,,4,0.03352049
t,COMPLETED,t,1163,9.746082
t,COMPLETED,f,295,2.472136
t,CANCELLED,f,1270,10.6427555
t,CREATED,t,1,0.0083801225


- patients who request == 23%
- patients who don't == 77%

In [8]:
# Distribution of the target: rows per appointment_requested value and their share.
target_feature_app_requested = (
    df.groupBy('appointment_requested')
      .count()
      .withColumn('%', percentage('count'))
)
display(target_feature_app_requested)

appointment_requested,count,%
f,9148,76.66136
t,2785,23.33864


# Analysing the impact of features

# Source

In [11]:
# Rows per (appointment_requested, source) pair, each with its percentage via the `percentage` UDF.
source = (
    df.groupBy('appointment_requested', 'source')
      .count()
      .withColumn('%', percentage('count'))
)
display(source)

appointment_requested,source,count,%
f,try.opencare.com,307,2.5726976
f,www.opencare.com,514,4.307383
f,www.webcrawler.com,1,0.0083801225
f,search.myway.com:80,1,0.0083801225
t,search.tb.ask.com,1,0.0083801225
f,patient_leads_jun_2018,62,0.5195676
f,www.avaorthodontics.com,1,0.0083801225
f,www.tedforddental.com,1,0.0083801225
f,start.att.net,1,0.0083801225
f,gmail,1,0.0083801225


In [12]:
# Sources for rows where appointment_requested is 't', smallest share first.
source_requested = (
    source
    .where(col('appointment_requested') == 't')
    .orderBy('%')
)
display(source_requested)

appointment_requested,source,count,%
t,dnserrorassist.att.net,1,0.0083801225
t,mail.google.com,1,0.0083801225
t,city_all_static_interests_all_20to40_sf_sf,1,0.0083801225
t,patient.opencare.com,1,0.0083801225
t,instagram.com,1,0.0083801225
t,direct_mail,1,0.0083801225
t,opencare.com,1,0.0083801225
t,,1,0.0083801225
t,search.tb.ask.com,1,0.0083801225
t,drip_no_card_4,2,0.016760245


In [13]:
# Sources for rows where appointment_requested is 'f', smallest share first.
source_not_requested = (
    source
    .where(col('appointment_requested') == 'f')
    .orderBy('%')
)
display(source_not_requested)

appointment_requested,source,count,%
f,l.facebook.com:80,1,0.0083801225
f,egglestonoms.clickforward.com,1,0.0083801225
f,search.myway.com:80,1,0.0083801225
f,www.health247.com,1,0.0083801225
f,www.totalhealthdentalpc.com,1,0.0083801225
f,www.ltsmiles.com,1,0.0083801225
f,mckinneysmiles.com,1,0.0083801225
f,www.drloewinger.com,1,0.0083801225
f,try.opencare.com:80,1,0.0083801225
f,www.drmerey.com,1,0.0083801225


source = facebook 
- requested = 14 %
- not requested = 47 %

In [15]:
# Facebook as source in total
# The per-source counts are built in this cell so it does not depend on a frame
# that is only defined further down the notebook.
facebook_source = (
    df.select('source')
      .filter(col('source').like('%facebook%'))
      .groupby('source')
      .count()
)
facebook_total = facebook_source.groupBy().sum().collect()
print(facebook_total)
# Use the summed count from the data instead of a hard-coded total;
# the sum is None when no source matches, so fall back to 0.
print("source from facebook :", ((facebook_total[0][0] or 0)/df.count())*100)

In [16]:
# Opencare as source in total
# The per-source counts are built in this cell so it does not depend on a frame
# that is only defined further down the notebook.
opencare_source = (
    df.select('source')
      .filter(col('source').like('%opencare%'))
      .groupby('source')
      .count()
)
opencare_total = opencare_source.groupBy().sum().collect()
print(opencare_total)
# Use the summed count from the data instead of a hard-coded total;
# the sum is None when no source matches, so fall back to 0.
print("source from opencare :", ((opencare_total[0][0] or 0)/df.count())*100)

# Campaign

In [18]:
# Rows per (appointment_requested, campaign) pair, each with its percentage via the `percentage` UDF.
campaign = (
    df.groupBy('appointment_requested', 'campaign')
      .count()
      .withColumn('%', percentage('count'))
)
display(campaign)

appointment_requested,campaign,count,%
f,search_chicago_local,273,2.2877734
f,denver_patients,1,0.0083801225
f,chicago_dentist_patient_remarketing,1,0.0083801225
f,patients,3,0.025140367
f,patient_leads_jun_2018,143,1.1983575
f,search_seattle_local,109,0.9134333
t,Unknown,639,5.354898
f,us_dentist_patient_acquisition,2,0.016760245
f,chicago_dentist_patient_acquisition,993,8.321462
f,search_chicago_metro,125,1.0475153


In [19]:
# Campaigns for rows where appointment_requested is 't', smallest share first.
campaign_requested = (
    campaign
    .where(col('appointment_requested') == 't')
    .orderBy('%')
)
display(campaign_requested)

appointment_requested,campaign,count,%
t,us_dentist_patient_acquisition,1,0.0083801225
t,site_link,1,0.0083801225
t,,3,0.025140367
t,toronto_dentist_patient_acquisition,5,0.041900612
t,search_austin_metro,5,0.041900612
t,sanfrancisco_dentist_patient_acquisition,5,0.041900612
t,patient_leads_feb_2018,7,0.058660857
t,us_dentist_patient_ad_testing,9,0.0754211
t,patient_leads_mar_2018,9,0.0754211
t,search_chicago_metro,11,0.09218135


In [20]:
# Campaigns for rows where appointment_requested is 'f', smallest share first.
campaign_not_requested = (
    campaign
    .where(col('appointment_requested') == 'f')
    .orderBy('%')
)
display(campaign_not_requested)

appointment_requested,campaign,count,%
f,denver_patients,1,0.0083801225
f,Free Credits,1,0.0083801225
f,feb_2018_email_blast,1,0.0083801225
f,main_seattle,1,0.0083801225
f,chicago_dentist_patient_remarketing,1,0.0083801225
f,chicato_dentist_patient_acquisition,1,0.0083801225
f,chicago_dentist_patient_acquisition_opencare,1,0.0083801225
f,seattle_dentist_patient_acquisition_opencare,1,0.0083801225
f,seattle_dentist_patient_remarketing,1,0.0083801225
f,uber,1,0.0083801225


top 3 : austin, seattle, chicago
- requested 2% +
- not requested 9% and 8%

# Region

In [23]:
# Rows per (appointment_requested, region) pair, each with its percentage via the `percentage` UDF.
region = (
    df.groupBy('appointment_requested', 'region')
      .count()
      .withColumn('%', percentage('count'))
)
display(region)

appointment_requested,region,count,%
f,Chicago,1916,16.056314
t,Chicago,614,5.1453953
f,Denver,878,7.3577476
t,Seattle,597,5.002933
t,Denver,322,2.6983993
t,San Francisco,460,3.8548563
f,Seattle,1761,14.757396
t,Austin,463,3.8799965
t,San Diego,145,1.2151177
f,Washington DC,14,0.117321715


In [24]:
# Regions for rows where appointment_requested is 't', smallest share first.
region_requested = (
    region
    .where(col('appointment_requested') == 't')
    .orderBy('%')
)
display(region_requested)

appointment_requested,region,count,%
t,San Diego,145,1.2151177
t,San Jose,184,1.5419425
t,Denver,322,2.6983993
t,San Francisco,460,3.8548563
t,Austin,463,3.8799965
t,Seattle,597,5.002933
t,Chicago,614,5.1453953


In [25]:
# Regions for rows where appointment_requested is 'f', smallest share first.
region_not_requested = (
    region
    .where(col('appointment_requested') == 'f')
    .orderBy('%')
)
display(region_not_requested)

appointment_requested,region,count,%
f,Washington DC,14,0.117321715
f,San Diego,474,3.972178
f,San Jose,651,5.4554596
f,Denver,878,7.3577476
f,San Francisco,1594,13.357915
f,Seattle,1761,14.757396
f,Austin,1860,15.587028
f,Chicago,1916,16.056314


top 3 = chicago, austin, seattle

# preferred_doctor_gender

In [28]:
# Rows per (appointment_requested, preferred_doctor_gender) pair, each with its percentage via the `percentage` UDF.
preferred_doctor_gender = (
    df.groupBy('appointment_requested', 'preferred_doctor_gender')
      .count()
      .withColumn('%', percentage('count'))
)
display(preferred_doctor_gender)

appointment_requested,preferred_doctor_gender,count,%
t,M,182,1.5251822
t,F,382,3.2012067
f,F,1518,12.721025
t,No Pref,2221,18.612251
f,No Pref,6854,57.43736
f,M,776,6.502975


In [29]:
# Gender preference for rows where appointment_requested is 't', smallest share first.
preferred_doctor_gender_requested = (
    preferred_doctor_gender
    .where(col('appointment_requested') == 't')
    .orderBy('%')
)
display(preferred_doctor_gender_requested)

appointment_requested,preferred_doctor_gender,count,%
t,M,182,1.5251822
t,F,382,3.2012067
t,No Pref,2221,18.612251


In [30]:
# Gender preference for rows where appointment_requested is 'f', smallest share first.
preferred_doctor_gender_not_requested = (
    preferred_doctor_gender
    .where(col('appointment_requested') == 'f')
    .orderBy('%')
)
display(preferred_doctor_gender_not_requested)

appointment_requested,preferred_doctor_gender,count,%
f,M,776,6.502975
f,F,1518,12.721025
f,No Pref,6854,57.43736


# dental_anxiety

In [32]:
# Rows per (appointment_requested, dental_anxiety) pair with their percentage; shown smallest share first.
dental_anxiety = (
    df.groupBy('appointment_requested', 'dental_anxiety')
      .count()
      .withColumn('%', percentage('count'))
)
display(dental_anxiety.orderBy("%"))

appointment_requested,dental_anxiety,count,%
t,,2,0.016760245
f,,7,0.058660857
t,Extremely nervous,167,1.3994805
t,Very nervous,205,1.7179251
t,Moderately nervous,401,3.360429
f,Extremely nervous,481,4.030839
f,Very nervous,722,6.0504484
t,A little nervous,793,6.6454372
t,Not at all nervous,1217,10.198609
f,Moderately nervous,1253,10.500294


In [33]:
# Anxiety levels for rows where appointment_requested is 't', smallest share first.
dental_anxiety_requested = (
    dental_anxiety
    .where(col('appointment_requested') == 't')
    .orderBy('%')
)
display(dental_anxiety_requested)

appointment_requested,dental_anxiety,count,%
t,,2,0.016760245
t,Extremely nervous,167,1.3994805
t,Very nervous,205,1.7179251
t,Moderately nervous,401,3.360429
t,A little nervous,793,6.6454372
t,Not at all nervous,1217,10.198609


In [34]:
# Anxiety levels for rows where appointment_requested is 'f', smallest share first.
dental_anxiety_not_requested = (
    dental_anxiety
    .where(col('appointment_requested') == 'f')
    .orderBy('%')
)
display(dental_anxiety_not_requested)

appointment_requested,dental_anxiety,count,%
f,,7,0.058660857
f,Extremely nervous,481,4.030839
f,Very nervous,722,6.0504484
f,Moderately nervous,1253,10.500294
f,A little nervous,2430,20.363697
f,Not at all nervous,4255,35.65742


# last_visited_a_dentist

In [36]:
# Rows per (appointment_requested, last_visited_a_dentist) pair with their percentage; shown smallest share first.
last_visited_a_dentist = (
    df.groupBy('appointment_requested', 'last_visited_a_dentist')
      .count()
      .withColumn('%', percentage('count'))
)
display(last_visited_a_dentist.orderBy("%"))

appointment_requested,last_visited_a_dentist,count,%
t,,2,0.016760245
f,,8,0.06704098
t,Never,43,0.36034527
f,Never,256,2.1453114
t,2+ years,752,6.301852
t,1 - 2 years,926,7.759993
t,< 1 year,1062,8.89969
f,2+ years,2028,16.994888
f,1 - 2 years,2499,20.941925
f,< 1 year,4357,36.51219


In [37]:
# Last-visit buckets for rows where appointment_requested is 't', smallest share first.
last_visited_a_dentist_requested = (
    last_visited_a_dentist
    .where(col('appointment_requested') == 't')
    .orderBy('%')
)
display(last_visited_a_dentist_requested)

appointment_requested,last_visited_a_dentist,count,%
t,,2,0.016760245
t,Never,43,0.36034527
t,2+ years,752,6.301852
t,1 - 2 years,926,7.759993
t,< 1 year,1062,8.89969


In [38]:
# Last-visit buckets for rows where appointment_requested is 'f', smallest share first.
last_visited_a_dentist_not_requested = (
    last_visited_a_dentist
    .where(col('appointment_requested') == 'f')
    .orderBy('%')
)
display(last_visited_a_dentist_not_requested)

appointment_requested,last_visited_a_dentist,count,%
f,,8,0.06704098
f,Never,256,2.1453114
f,2+ years,2028,16.994888
f,1 - 2 years,2499,20.941925
f,< 1 year,4357,36.51219


# how_soon_they_want_to_book

In [40]:
# Rows per (appointment_requested, how_soon_they_want_to_book) pair with their percentage; shown smallest share first.
how_soon_they_want_to_book = (
    df.groupBy('appointment_requested', 'how_soon_they_want_to_book')
      .count()
      .withColumn('%', percentage('count'))
)
display(how_soon_they_want_to_book.orderBy("%"))

appointment_requested,how_soon_they_want_to_book,count,%
t,,2,0.016760245
f,,8,0.06704098
t,No preference,310,2.597838
t,More than 2 weeks,372,3.1174054
t,1 week,384,3.217967
t,2 weeks,638,5.346518
t,ASAP,1079,9.042152
f,1 week,1228,10.290791
f,No preference,1436,12.033855
f,2 weeks,1859,15.578648


In [41]:
# Booking urgency for rows where appointment_requested is 't', smallest share first.
how_soon_they_want_to_book_requested = (
    how_soon_they_want_to_book
    .where(col('appointment_requested') == 't')
    .orderBy('%')
)
display(how_soon_they_want_to_book_requested)

appointment_requested,how_soon_they_want_to_book,count,%
t,,2,0.016760245
t,No preference,310,2.597838
t,More than 2 weeks,372,3.1174054
t,1 week,384,3.217967
t,2 weeks,638,5.346518
t,ASAP,1079,9.042152


In [42]:
# Booking urgency for rows where appointment_requested is 'f', smallest share first.
how_soon_they_want_to_book_not_requested = (
    how_soon_they_want_to_book
    .where(col('appointment_requested') == 'f')
    .orderBy('%')
)
display(how_soon_they_want_to_book_not_requested)

appointment_requested,how_soon_they_want_to_book,count,%
f,,8,0.06704098
f,1 week,1228,10.290791
f,No preference,1436,12.033855
f,2 weeks,1859,15.578648
f,More than 2 weeks,1863,15.612168
f,ASAP,2754,23.078857


# reason_for_visit

In [44]:
# Rows per (appointment_requested, reason_for_visit) pair with their percentage; shown smallest share first.
reason_for_visit = (
    df.groupBy('appointment_requested', 'reason_for_visit')
      .count()
      .withColumn('%', percentage('count'))
)
display(reason_for_visit.orderBy("%"))

appointment_requested,reason_for_visit,count,%
t,,1,0.0083801225
f,,7,0.058660857
t,Urgent Issue,252,2.111791
t,Specific Treatment,262,2.1955922
f,Urgent Issue,962,8.061678
f,Specific Treatment,1194,10.005866
t,Checkup & Cleaning,2270,19.022879
f,Checkup & Cleaning,6985,58.535156


In [45]:
# Visit reasons for rows where appointment_requested is 't', smallest share first.
reason_for_visit_requested = (
    reason_for_visit
    .where(col('appointment_requested') == 't')
    .orderBy('%')
)
display(reason_for_visit_requested)

appointment_requested,reason_for_visit,count,%
t,,1,0.0083801225
t,Urgent Issue,252,2.111791
t,Specific Treatment,262,2.1955922
t,Checkup & Cleaning,2270,19.022879


In [46]:
# Visit reasons for rows where appointment_requested is 'f', smallest share first.
reason_for_visit_not_requested = (
    reason_for_visit
    .where(col('appointment_requested') == 'f')
    .orderBy('%')
)
display(reason_for_visit_not_requested)

appointment_requested,reason_for_visit,count,%
f,,7,0.058660857
f,Urgent Issue,962,8.061678
f,Specific Treatment,1194,10.005866
f,Checkup & Cleaning,6985,58.535156


# patient_insured

In [48]:
# Rows per (appointment_requested, patient_insured) pair with their percentage;
# shown smallest share first. The name is assigned once (the duplicated
# `patient_insured = patient_insured = ...` target was redundant).
patient_insured = (
    df.groupby('appointment_requested', 'patient_insured')
      .count()
      .withColumn('%', percentage('count'))
)
display(patient_insured.sort("%"))

appointment_requested,patient_insured,count,%
t,f,530,4.441465
t,t,2255,18.897177
f,f,2502,20.967066
f,t,6646,55.694294


In [49]:
# Insurance status for rows where appointment_requested is 't', smallest share first.
patient_insured_requested = (
    patient_insured
    .where(col('appointment_requested') == 't')
    .orderBy('%')
)
display(patient_insured_requested)

appointment_requested,patient_insured,count,%
t,f,530,4.441465
t,t,2255,18.897177


In [50]:
# Share of uninsured patients among those who requested an appointment.
# Counts come from the requested-only table above: 530 uninsured and 2255 insured,
# so the denominator is 2785 (not 2780).
uninsured_requested = 530
insured_requested = 2255
uninsured_share_of_requested = uninsured_requested * 100 / (uninsured_requested + insured_requested)
uninsured_share_of_requested

In [51]:
# Insurance status for rows where appointment_requested is 'f', smallest share first.
patient_insured_not_requested = (
    patient_insured
    .where(col('appointment_requested') == 'f')
    .orderBy('%')
)
display(patient_insured_not_requested)

appointment_requested,patient_insured,count,%
f,f,2502,20.967066
f,t,6646,55.694294


# out_network_results_shown

In [53]:
# Rows per (appointment_requested, out_network_results_shown) pair with their percentage; shown smallest share first.
out_network_results_shown = (
    df.groupBy('appointment_requested', 'out_network_results_shown')
      .count()
      .withColumn('%', percentage('count'))
)
display(out_network_results_shown.orderBy("%"))

appointment_requested,out_network_results_shown,count,%
t,9,75,0.62850916
t,8,116,0.9720942
t,7,118,0.9888544
t,6,156,1.3072991
t,5,184,1.5419425
t,4,217,1.8184866
t,3,262,2.1955922
t,2,284,2.3799548
t,1,307,2.5726976
f,9,317,2.6564987


In [54]:
# Out-of-network result counts for rows where appointment_requested is 't', smallest share first.
out_network_results_shown_requested = (
    out_network_results_shown
    .where(col('appointment_requested') == 't')
    .orderBy('%')
)
display(out_network_results_shown_requested)

appointment_requested,out_network_results_shown,count,%
t,9,75,0.62850916
t,8,116,0.9720942
t,7,118,0.9888544
t,6,156,1.3072991
t,5,184,1.5419425
t,4,217,1.8184866
t,3,262,2.1955922
t,2,284,2.3799548
t,1,307,2.5726976
t,0,1066,8.93321


In [55]:
# Out-of-network result counts for rows where appointment_requested is 'f', smallest share first.
out_network_results_shown_not_requested = (
    out_network_results_shown
    .where(col('appointment_requested') == 'f')
    .orderBy('%')
)
display(out_network_results_shown_not_requested)

appointment_requested,out_network_results_shown,count,%
f,9,317,2.6564987
f,8,345,2.8911421
f,7,397,3.3269086
f,6,443,3.7123942
f,5,514,4.307383
f,4,599,5.0196934
f,3,712,5.966647
f,2,835,6.997402
f,1,918,7.692952
f,0,4068,34.090336


# in_network_results_shown

In [57]:
# Rows per (appointment_requested, in_network_results_shown) pair with their percentage; shown smallest share first.
in_network_results_shown = (
    df.groupBy('appointment_requested', 'in_network_results_shown')
      .count()
      .withColumn('%', percentage('count'))
)
display(in_network_results_shown.orderBy("%"))

appointment_requested,in_network_results_shown,count,%
t,8,108,0.9050532
t,2,189,1.5838431
t,5,196,1.642504
t,3,218,1.8268666
t,6,219,1.8352468
t,7,220,1.843627
t,4,224,1.8771474
t,1,257,2.1536915
f,8,278,2.329674
f,5,445,3.7291543


In [58]:
# In-network result counts for rows where appointment_requested is 't', smallest share first.
in_network_results_shown_requested = (
    in_network_results_shown
    .where(col('appointment_requested') == 't')
    .orderBy('%')
)
display(in_network_results_shown_requested)

appointment_requested,in_network_results_shown,count,%
t,8,108,0.9050532
t,2,189,1.5838431
t,5,196,1.642504
t,3,218,1.8268666
t,6,219,1.8352468
t,7,220,1.843627
t,4,224,1.8771474
t,1,257,2.1536915
t,9,455,3.8129556
t,0,699,5.8577056


In [59]:
# In-network result counts for rows where appointment_requested is 'f', smallest share first.
in_network_results_shown_not_requested = (
    in_network_results_shown
    .where(col('appointment_requested') == 'f')
    .orderBy('%')
)
display(in_network_results_shown_not_requested)

appointment_requested,in_network_results_shown,count,%
f,8,278,2.329674
f,5,445,3.7291543
f,6,533,4.466605
f,7,539,4.5168858
f,4,582,4.877231
f,3,635,5.3213778
f,2,658,5.5141206
f,1,832,6.972262
f,9,896,7.5085897
f,0,3750,31.425459


# appointment_status

In [61]:
# Rows per (appointment_requested, appointment_status) pair with their percentage; shown smallest share first.
appointment_status = (
    df.groupBy('appointment_requested', 'appointment_status')
      .count()
      .withColumn('%', percentage('count'))
)
display(appointment_status.orderBy("%"))

appointment_requested,appointment_status,count,%
t,CREATED,1,0.0083801225
t,SCHEDULED,9,0.0754211
t,REQUESTED,46,0.38548562
t,CANCELLED,1271,10.651135
t,COMPLETED,1458,12.218219
f,,9148,76.66136


In [62]:
# Appointment statuses for rows where appointment_requested is 't', smallest share first.
appointment_status_requested = (
    appointment_status
    .where(col('appointment_requested') == 't')
    .orderBy('%')
)
display(appointment_status_requested)

appointment_requested,appointment_status,count,%
t,CREATED,1,0.0083801225
t,SCHEDULED,9,0.0754211
t,REQUESTED,46,0.38548562
t,CANCELLED,1271,10.651135
t,COMPLETED,1458,12.218219


In [63]:
# Appointment statuses for rows where appointment_requested is 'f', smallest share first.
appointment_status_not_requested = (
    appointment_status
    .where(col('appointment_requested') == 'f')
    .orderBy('%')
)
display(appointment_status_not_requested)

appointment_requested,appointment_status,count,%
f,,9148,76.66136


## Exploration on target variables [requested and billed]

In [65]:
# Combinations with appointment_requested == 't' and appointment_billed == 't'.
t_t = nested_groupby_percentage.where(
    (col('appointment_requested') == 't')
    & (col('appointment_billed') == 't')
)
display(t_t)

appointment_requested,appointment_status,appointment_billed,count,%
t,REQUESTED,t,34,0.28492415
t,SCHEDULED,t,8,0.06704098
t,COMPLETED,t,1163,9.746082
t,CREATED,t,1,0.0083801225


In [66]:
# Combinations with appointment_requested == 't' and appointment_billed == 'f'.
t_f = nested_groupby_percentage.where(
    (col('appointment_requested') == 't')
    & (col('appointment_billed') == 'f')
)
display(t_f)

appointment_requested,appointment_status,appointment_billed,count,%
t,SCHEDULED,f,1,0.0083801225
t,COMPLETED,f,295,2.472136
t,CANCELLED,f,1270,10.6427555
t,REQUESTED,f,8,0.06704098


In [67]:
# Combinations with appointment_requested == 'f' and appointment_billed == 't'.
f_t = nested_groupby_percentage.where(
    (col('appointment_requested') == 'f')
    & (col('appointment_billed') == 't')
)
display(f_t)

appointment_requested,appointment_status,appointment_billed,count,%


In [68]:
# Combinations with appointment_requested == 'f' and appointment_billed == 'f'.
f_f = nested_groupby_percentage.where(
    (col('appointment_requested') == 'f')
    & (col('appointment_billed') == 'f')
)
display(f_f)

appointment_requested,appointment_status,appointment_billed,count,%


In [69]:
# Combinations where appointment_requested is 't' or 'f' and appointment_billed is null.
null_billed = nested_groupby_percentage.where(
    (
        (col('appointment_requested') == 't')
        | (col('appointment_requested') == 'f')
    )
    & col('appointment_billed').isNull()
)
display(null_billed)

appointment_requested,appointment_status,appointment_billed,count,%
t,CANCELLED,,1,0.0083801225
f,,,9148,76.66136
t,REQUESTED,,4,0.03352049


In [70]:
# Combinations where appointment_requested is 't' or 'f' and appointment_billed has a value.
not_null_billed = nested_groupby_percentage.where(
    (
        (col('appointment_requested') == 't')
        | (col('appointment_requested') == 'f')
    )
    & col('appointment_billed').isNotNull()
)
display(not_null_billed)

appointment_requested,appointment_status,appointment_billed,count,%
t,SCHEDULED,f,1,0.0083801225
t,REQUESTED,t,34,0.28492415
t,SCHEDULED,t,8,0.06704098
t,COMPLETED,t,1163,9.746082
t,COMPLETED,f,295,2.472136
t,CANCELLED,f,1270,10.6427555
t,CREATED,t,1,0.0083801225
t,REQUESTED,f,8,0.06704098


In [71]:
# Row count of the full dataset versus the rows left after dropping any row with a null.
print(df.count())
df_no_null = df.na.drop()
print(df_no_null.count())

In [72]:
# Row count for every source value that contains the substring "facebook".
facebook_source = (
    df.select('source')
      .where(col('source').like('%facebook%'))
      .groupBy('source')
      .count()
)
display(facebook_source)

source,count
lm.facebook.com,60
l.facebook.com,35
l.facebook.com:80,1
www.facebook.com,104
l.facebook.com:443,1
m.facebook.com,90
facebook,7350


In [73]:
# Row count for every source value that contains the substring "opencare".
opencare_source = (
    df.select('source')
      .where(col('source').like('%opencare%'))
      .groupBy('source')
      .count()
)
display(opencare_source)

source,count
try.opencare.com,407
patient.opencare.com,3
try.opencare.com:80,1
www.opencare.com,633
opencare.com,1
app.opencare.com,4
www.opencare.com:443,10


In [74]:
# Print the appointment_requested distribution (count and percentage) as plain text.
(
    df.groupBy('appointment_requested')
      .count()
      .withColumn('%', percentage('count'))
      .show()
)