# Silver Stage ! Converting , Cleaning & having the Data ready

## Payments Delta

### Read Data

In [0]:
from pyspark.sql.functions import isnan, count, when, col, to_date, to_timestamp
from pyspark.sql.types import BooleanType

In [0]:
df_bronze_payments = spark.read.format("delta") \
                    .load("/delta/bronze_payments")

### Convert

In [0]:
df_bronze_payments.display()
df_bronze_payments.printSchema()
df_bronze_payments.summary().show()

_c0,payment_id,date,amount,account_number
0,1,2019-05-01,9.0,1329
1,2,2019-06-01,9.0,1329
2,3,2019-07-01,9.0,1329
3,4,2019-08-01,9.0,1329
4,5,2019-09-01,9.0,1329
5,6,2019-10-01,9.0,1329
6,7,2019-11-01,9.0,1329
7,8,2019-12-01,9.0,1329
8,9,2020-01-01,9.0,1329
9,10,2020-02-01,9.0,1329


root
 |-- _c0: string (nullable = true)
 |-- payment_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- account_number: string (nullable = true)

+-------+----------------+----------------+----------+-----------------+-----------------+
|summary|             _c0|      payment_id|      date|           amount|   account_number|
+-------+----------------+----------------+----------+-----------------+-----------------+
|  count|          232233|          232233|    232233|           232233|           232233|
|   mean|        116116.0|        116117.0|      null|9.868401045501647|5474.312961551545|
| stddev|67040.0368697691|67040.0368697691|      null|3.039150125780444|2567.038056181679|
|    min|               0|               1|2013-02-01|             10.0|             1000|
|    25%|         58035.0|         58036.0|      null|              9.0|           3242.0|
|    50%|        116098.0|        116099.0|      null|              9

In [0]:
# drop the _c0 column
df_bronze_payments = df_bronze_payments.drop('_c0')

# rename 
df_bronze_payments = df_bronze_payments.withColumnRenamed('account_number', 'rider_id')

In [0]:
df_bronze_payments.display()
df_bronze_payments.printSchema()
df_bronze_payments.summary().show()

payment_id,date,amount,rider_id
1,2019-05-01,9.0,1329
2,2019-06-01,9.0,1329
3,2019-07-01,9.0,1329
4,2019-08-01,9.0,1329
5,2019-09-01,9.0,1329
6,2019-10-01,9.0,1329
7,2019-11-01,9.0,1329
8,2019-12-01,9.0,1329
9,2020-01-01,9.0,1329
10,2020-02-01,9.0,1329


root
 |-- payment_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- rider_id: string (nullable = true)

+-------+----------------+----------+-----------------+-----------------+
|summary|      payment_id|      date|           amount|         rider_id|
+-------+----------------+----------+-----------------+-----------------+
|  count|          232233|    232233|           232233|           232233|
|   mean|        116117.0|      null|9.868401045501647|5474.312961551545|
| stddev|67040.0368697691|      null|3.039150125780444|2567.038056181679|
|    min|               1|2013-02-01|             10.0|             1000|
|    25%|         58036.0|      null|              9.0|           3242.0|
|    50%|        116099.0|      null|              9.0|           5490.0|
|    75%|        174177.0|      null|              9.0|           7645.0|
|    max|           99999|2022-02-01|             9.99|             9927|
+-------+---------------

In [0]:
# Check for missing values
missing_values = df_bronze_payments.select([count(when(isnan(column) | col(column).isNull(), column)).alias(column) for column in df_bronze_payments.columns]).collect()
num_missing = sum([row for row in missing_values[0]])
if num_missing > 0:
    print(f'Missing values for data set?: True')
    print(f'Missing values count: {missing_values[0]}')
    # Display a sample of missing values
    for column in df_bronze_payments.columns:
        missing_records = df_bronze_payments.filter(col(column).isNull() | isnan(col(column)))
        print(f"Sample of missing values for column '{column}':")
        missing_records.show(5, truncate=False)
else:
    print(f'Missing values for data set?: False')

# Check for duplicates
num_duplicates = df_bronze_payments.count() - df_bronze_payments.dropDuplicates(df_bronze_payments.columns).count()

if num_duplicates > 0:
    print(f'Duplicates values for data set?: True')
    print(f'Duplicates count: {num_duplicates}')
    # Display a sample of duplicated values
    duplicate_records = df_bronze_payments.groupBy(df_bronze_payments.columns).count().filter('count > 1')
    print("Sample of duplicated values:")
    duplicate_records.show(5, truncate=False)
else:
    print(f'Duplicates values for data set?: False')


Missing values for data set?: False
Duplicates values for data set?: False


#### Change Columns type to desired format

In [0]:
print(df_bronze_payments.printSchema())

root
 |-- payment_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- rider_id: string (nullable = true)

None


In [0]:
df_bronze_payments = df_bronze_payments.withColumn("date", to_date("date", "yyyy-MM-dd"))
df_bronze_payments = df_bronze_payments.withColumn("amount", col("amount").cast("decimal(10,2)"))
df_bronze_payments = df_bronze_payments.withColumn("payment_id", col("payment_id").cast("integer"))
df_bronze_payments = df_bronze_payments.withColumn("rider_id", col("rider_id").cast("integer"))

In [0]:
print(df_bronze_payments.printSchema())

root
 |-- payment_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- amount: decimal(10,2) (nullable = true)
 |-- rider_id: integer (nullable = true)

None


### Write to Tables

In [0]:
df_bronze_payments.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("silver_payments")

## Trips Delta

### Read Data

In [0]:
df_bronze_trips = spark.read.format("delta") \
                .load("/delta/bronze_trips")

### Convert

In [0]:
df_bronze_trips.display()
df_bronze_trips.printSchema()
df_bronze_trips.summary().show()

_c0,ride_id,rideable_type,started_at,ended_at,start_station_id,end_station_id,rider
0,C2F7DD78E82EC875,electric_bike,2022-01-13 11:59:47,2022-01-13 12:02:44,525,RP-007,3516
1,A6CF8980A652D272,electric_bike,2022-01-10 08:41:56,2022-01-10 08:46:17,525,RP-007,10759
2,BD0F91DFF741C66D,classic_bike,2022-01-25 04:53:40,2022-01-25 04:58:01,TA1306000016,TA1307000001,2079
3,CBB80ED419105406,classic_bike,2022-01-04 00:18:04,2022-01-04 00:33:00,KA1504000151,TA1309000021,2480
4,DDC963BFDDA51EEA,classic_bike,2022-01-20 01:31:10,2022-01-20 01:37:12,TA1309000002,TA1305000029,10894
5,A39C6F6CC0586C0B,classic_bike,2022-01-11 18:48:09,2022-01-11 18:51:31,637,TA1305000034,5335
6,BDC4AB637EDF981B,classic_bike,2022-01-30 18:32:52,2022-01-30 18:49:26,KA1504000158,13323,2583
7,81751A3186E59A6B,classic_bike,2022-01-22 12:20:02,2022-01-22 12:32:06,TA1306000016,13271,6906
8,154222B86A338ABD,electric_bike,2022-01-17 07:34:41,2022-01-17 08:00:08,13304,WL-012,5394
9,72DC25B2DD467EEF,classic_bike,2022-01-28 15:27:53,2022-01-28 15:35:16,TA1309000004,WL-012,1563


root
 |-- _c0: string (nullable = true)
 |-- ride_id: string (nullable = true)
 |-- rideable_type: string (nullable = true)
 |-- started_at: string (nullable = true)
 |-- ended_at: string (nullable = true)
 |-- start_station_id: string (nullable = true)
 |-- end_station_id: string (nullable = true)
 |-- rider: string (nullable = true)

+-------+------------------+--------------------+-------------+-------------------+-------------------+------------------+------------------+-----------------+
|summary|               _c0|             ride_id|rideable_type|         started_at|           ended_at|  start_station_id|    end_station_id|            rider|
+-------+------------------+--------------------+-------------+-------------------+-------------------+------------------+------------------+-----------------+
|  count|             80128|               80128|        80128|              80128|              80128|             80128|             80128|            80128|
|   mean| 43289.719973

In [0]:
# drop the _c0 column
df_bronze_trips = df_bronze_trips.drop('_c0')

# rename 
df_bronze_trips = df_bronze_trips.withColumnRenamed('ride_id', 'trip_id')
df_bronze_trips = df_bronze_trips.withColumnRenamed('rider', 'rider_id')

In [0]:
df_bronze_trips.printSchema()

root
 |-- trip_id: string (nullable = true)
 |-- rideable_type: string (nullable = true)
 |-- started_at: string (nullable = true)
 |-- ended_at: string (nullable = true)
 |-- start_station_id: string (nullable = true)
 |-- end_station_id: string (nullable = true)
 |-- rider_id: string (nullable = true)



In [0]:
# Check for missing values
missing_values = df_bronze_trips.select([count(when(isnan(column) | col(column).isNull(), column)).alias(column) for column in df_bronze_trips.columns]).collect()
num_missing = sum([row for row in missing_values[0]])
if num_missing > 0:
    print(f'Missing values for data set?: True')
    print(f'Missing values count: {missing_values[0]}')
    # Display a sample of missing values
    for column in df_bronze_trips.columns:
        missing_records = df_bronze_trips.filter(col(column).isNull() | isnan(col(column)))
        print(f"Sample of missing values for column '{column}':")
        missing_records.show(5, truncate=False)
else:
    print(f'Missing values for data set?: False')

# Check for duplicates
num_duplicates = df_bronze_trips.count() - df_bronze_trips.dropDuplicates(df_bronze_trips.columns).count()

if num_duplicates > 0:
    print(f'Duplicates values for data set?: True')
    print(f'Duplicates count: {num_duplicates}')
    # Display a sample of duplicated values
    duplicate_records = df_bronze_trips.groupBy(df_bronze_trips.columns).count().filter('count > 1')
    print("Sample of duplicated values:")
    duplicate_records.show(5, truncate=False)
else:
    print(f'Duplicates values for data set?: False')

Missing values for data set?: False
Duplicates values for data set?: False


#### Change Columns type to the desired format

In [0]:
df_bronze_trips.printSchema()

root
 |-- trip_id: string (nullable = true)
 |-- rideable_type: string (nullable = true)
 |-- started_at: string (nullable = true)
 |-- ended_at: string (nullable = true)
 |-- start_station_id: string (nullable = true)
 |-- end_station_id: string (nullable = true)
 |-- rider_id: string (nullable = true)



In [0]:
df_bronze_trips = df_bronze_trips.withColumn("started_at", to_timestamp("started_at", "yyyy-MM-dd HH:mm:ss"))
df_bronze_trips = df_bronze_trips.withColumn("ended_at", to_timestamp("ended_at", "yyyy-MM-dd HH:mm:ss"))
df_bronze_trips = df_bronze_trips.withColumn("rider_id", col("rider_id").cast("integer"))

In [0]:
df_bronze_trips.printSchema()

root
 |-- trip_id: string (nullable = true)
 |-- rideable_type: string (nullable = true)
 |-- started_at: timestamp (nullable = true)
 |-- ended_at: timestamp (nullable = true)
 |-- start_station_id: string (nullable = true)
 |-- end_station_id: string (nullable = true)
 |-- rider_id: integer (nullable = true)



### Write to Tables

In [0]:
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [0]:
df_bronze_trips.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("silver_trips")

## Riders Delta

### Read Data

In [0]:
df_bronze_riders = spark.read.format("delta") \
                  .load("/delta/bronze_riders")

### Convert

In [0]:
df_bronze_riders.display()
df_bronze_riders.printSchema()
df_bronze_riders.summary().show()

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7
1000,Kimberly,Williams,1200 Alyssa Squares,1988-03-28,2019-04-23,,True
1001,Anthony,Erickson,397 Diana Ferry,1976-12-04,2019-11-01,2020-09-01,True
1002,Jessica,Roach,644 Brittany Row Apt. 097,1998-03-28,2022-02-04,,True
1003,Andrew,Ryan,996 Dickerson Turnpike,1999-03-05,2019-08-26,,False
1004,Ian,Peters,7009 Nathan Expressway,1969-06-25,2019-09-14,,True
1005,Michael,Gillespie,224 Washington Mills Apt. 467,1974-09-28,2020-03-24,,False
1006,Ryan,Peters,1137 Angela Locks,2003-07-10,2020-11-27,2021-06-01,True
1007,Crystal,Sanchez,979 Phillips Ways,1987-10-15,2016-12-11,,False
1008,David,Hicks,7691 Evans Court,1986-07-12,2021-03-28,2021-04-01,True
1009,Daniel,Hicks,9922 Jim Crest Apt. 319,1981-02-14,2020-06-12,2021-02-01,True


root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)

+-------+------------------+---------+------+--------------------+----------+----------+----------+-----+
|summary|               _c0|      _c1|   _c2|                 _c3|       _c4|       _c5|       _c6|  _c7|
+-------+------------------+---------+------+--------------------+----------+----------+----------+-----+
|  count|                50|       50|    50|                  50|        50|        50|        11|   50|
|   mean|            1024.5|     null|  null|                null|      null|      null|      null| null|
| stddev|14.577379737113251|     null|  null|                null|      null|      null|      null| null|
|    min|              1000|Alexandra|Acosta|00348 Brandi Park...|1969-06-25|2

In [0]:
# rename 
df_bronze_riders = df_bronze_riders.withColumnRenamed('_c0', 'rider_id')
df_bronze_riders = df_bronze_riders.withColumnRenamed('_c1', 'first')
df_bronze_riders = df_bronze_riders.withColumnRenamed('_c2', 'last')
df_bronze_riders = df_bronze_riders.withColumnRenamed('_c3', 'address')
df_bronze_riders = df_bronze_riders.withColumnRenamed('_c4', 'birthday')
df_bronze_riders = df_bronze_riders.withColumnRenamed('_c5', 'account_start_date')
df_bronze_riders = df_bronze_riders.withColumnRenamed('_c6', 'account_end_date')
df_bronze_riders = df_bronze_riders.withColumnRenamed('_c7', 'is_member')

In [0]:
df_bronze_riders.printSchema()

root
 |-- rider_id: string (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- address: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- account_start_date: string (nullable = true)
 |-- account_end_date: string (nullable = true)
 |-- is_member: string (nullable = true)



In [0]:
# Check for missing values
missing_values = df_bronze_riders.select([count(when(isnan(column) | col(column).isNull(), column)).alias(column) for column in df_bronze_riders.columns]).collect()
num_missing = sum([row for row in missing_values[0]])
if num_missing > 0:
    print(f'Missing values for data set?: True')
    print(f'Missing values count: {missing_values[0]}')
    # Display a sample of missing values
    for column in df_bronze_riders.columns:
        missing_records = df_bronze_riders.filter(col(column).isNull() | isnan(col(column)))
        print(f"Sample of missing values for column '{column}':")
        missing_records.show(5, truncate=False)
else:
    print(f'Missing values for data set?: False')

# Check for duplicates
num_duplicates = df_bronze_riders.count() - df_bronze_riders.dropDuplicates(df_bronze_riders.columns).count()

if num_duplicates > 0:
    print(f'Duplicates values for data set?: True')
    print(f'Duplicates count: {num_duplicates}')
    # Display a sample of duplicated values
    duplicate_records = df_bronze_riders.groupBy(df_bronze_riders.columns).count().filter('count > 1')
    print("Sample of duplicated values:")
    duplicate_records.show(5, truncate=False)
else:
    print(f'Duplicates values for data set?: False')

Missing values for data set?: True
Missing values count: Row(rider_id=0, first=0, last=0, address=0, birthday=0, account_start_date=0, account_end_date=39, is_member=0)
Sample of missing values for column 'rider_id':
+--------+-----+----+-------+--------+------------------+----------------+---------+
|rider_id|first|last|address|birthday|account_start_date|account_end_date|is_member|
+--------+-----+----+-------+--------+------------------+----------------+---------+
+--------+-----+----+-------+--------+------------------+----------------+---------+

Sample of missing values for column 'first':
+--------+-----+----+-------+--------+------------------+----------------+---------+
|rider_id|first|last|address|birthday|account_start_date|account_end_date|is_member|
+--------+-----+----+-------+--------+------------------+----------------+---------+
+--------+-----+----+-------+--------+------------------+----------------+---------+

Sample of missing values for column 'last':
+--------+--

#### Keeping the null as it is the end date

#### Check Columns type to the desired format

In [0]:
df_bronze_riders.printSchema()

root
 |-- rider_id: string (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- address: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- account_start_date: string (nullable = true)
 |-- account_end_date: string (nullable = true)
 |-- is_member: string (nullable = true)



In [0]:
df_bronze_riders = df_bronze_riders.withColumn("account_end_date", to_date("account_end_date", "yyyy-MM-dd"))
df_bronze_riders = df_bronze_riders.withColumn("account_start_date", to_date("account_start_date", "yyyy-MM-dd"))
df_bronze_riders = df_bronze_riders.withColumn("birthday", to_date("birthday", "yyyy-MM-dd"))
df_bronze_riders = df_bronze_riders.withColumn("rider_id", col("rider_id").cast("integer"))
df_bronze_riders = df_bronze_riders.withColumn("is_member", col("is_member").cast(BooleanType()))

In [0]:
df_bronze_riders.printSchema()

root
 |-- rider_id: integer (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- address: string (nullable = true)
 |-- birthday: date (nullable = true)
 |-- account_start_date: date (nullable = true)
 |-- account_end_date: date (nullable = true)
 |-- is_member: boolean (nullable = true)



### Write to Tables

In [0]:
df_bronze_riders.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("silver_riders")

## Stations Delta

### Read Data

In [0]:
df_bronze_stations = spark.read.format("delta") \
                  .load("/delta/bronze_stations2")

### Convert

In [0]:
df_bronze_stations.display()
df_bronze_stations.printSchema()
df_bronze_stations.summary().show()

_c0,_c1,_c2,_c3
525,Glenwood Ave & Touhy Ave,42.012701,-87.66605799999999
KA1503000012,Clark St & Lake St,41.88579466666667,-87.63110066666668
637,Wood St & Chicago Ave,41.895634,-87.672069
13216,State St & 33rd St,41.8347335,-87.6258275
18003,Fairbanks St & Superior St,41.89580766666667,-87.62025316666669
KP1705001026,LaSalle Dr & Huron St,41.894877,-87.632326
13253,Lincoln Ave & Waveland Ave,41.948797,-87.675278
KA1503000044,Rush St & Hubbard St,41.890173,-87.62618499999999
KA1504000140,Winchester Ave & Elston Ave,41.92403733333333,-87.67641483333334
TA1305000032,Clinton St & Madison St,41.882242,-87.64106600000001


root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)

+-------+------------------+--------------------+-------------------+------------------+
|summary|               _c0|                 _c1|                _c2|               _c3|
+-------+------------------+--------------------+-------------------+------------------+
|  count|               838|                 838|                838|               838|
|   mean|10322.868421052632|                null| 41.876933543115754|-87.64742065247817|
| stddev| 14223.63291249315|                null|0.15362863855015238|0.4816417141218605|
|    min|             13001| 2112 W Peterson Ave|  41.64850076266409|      -73.79647696|
|    25%|             475.0|                null|              41.82|-87.69345045089722|
|    50%|           13136.0|                null|          41.885409|        -87.659172|
|    75%|           15664.0|                null|    

In [0]:
# rename 
df_bronze_stations = df_bronze_stations.withColumnRenamed('_c0', 'station_id')
df_bronze_stations = df_bronze_stations.withColumnRenamed('_c1', 'name')
df_bronze_stations = df_bronze_stations.withColumnRenamed('_c2', 'latitude')
df_bronze_stations = df_bronze_stations.withColumnRenamed('_c3', 'longitude')

In [0]:
df_bronze_stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)



In [0]:
# Check for missing values
missing_values = df_bronze_stations.select([count(when(isnan(column) | col(column).isNull(), column)).alias(column) for column in df_bronze_stations.columns]).collect()
num_missing = sum([row for row in missing_values[0]])
if num_missing > 0:
    print(f'Missing values for data set?: True')
    print(f'Missing values count: {missing_values[0]}')
    # Display a sample of missing values
    for column in df_bronze_stations.columns:
        missing_records = df_bronze_stations.filter(col(column).isNull() | isnan(col(column)))
        print(f"Sample of missing values for column '{column}':")
        missing_records.show(5, truncate=False)
else:
    print(f'Missing values for data set?: False')

# Check for duplicates
num_duplicates = df_bronze_stations.count() - df_bronze_stations.dropDuplicates(df_bronze_stations.columns).count()

if num_duplicates > 0:
    print(f'Duplicates values for data set?: True')
    print(f'Duplicates count: {num_duplicates}')
    # Display a sample of duplicated values
    duplicate_records = df_bronze_stations.groupBy(df_bronze_stations.columns).count().filter('count > 1')
    print("Sample of duplicated values:")
    duplicate_records.show(5, truncate=False)
else:
    print(f'Duplicates values for data set?: False')

Missing values for data set?: False
Duplicates values for data set?: False


In [0]:
df_bronze_stations.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)



In [0]:
df_bronze_stations = df_bronze_stations.withColumn("latitude", col("latitude").cast("decimal(20,18)"))
df_bronze_stations = df_bronze_stations.withColumn("longitude", col("longitude").cast("decimal(20,18)"))

In [0]:
df_bronze_stations.printSchema()
df_bronze_stations.display()

root
 |-- station_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude: decimal(20,18) (nullable = true)
 |-- longitude: decimal(20,18) (nullable = true)



station_id,name,latitude,longitude
525,Glenwood Ave & Touhy Ave,42.012701,-87.66605799999998
KA1503000012,Clark St & Lake St,41.88579466666667,-87.63110066666668
637,Wood St & Chicago Ave,41.895634,-87.672069
13216,State St & 33rd St,41.8347335,-87.6258275
18003,Fairbanks St & Superior St,41.89580766666667,-87.62025316666669
KP1705001026,LaSalle Dr & Huron St,41.894877,-87.632326
13253,Lincoln Ave & Waveland Ave,41.948797,-87.675278
KA1503000044,Rush St & Hubbard St,41.890173,-87.62618499999998
KA1504000140,Winchester Ave & Elston Ave,41.92403733333333,-87.67641483333334
TA1305000032,Clinton St & Madison St,41.882242,-87.64106600000001


### Write to Tables

In [0]:
df_bronze_stations.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("silver_stations")