## Spark Tutorial by Krish Naik
https://www.youtube.com/watch?v=WyZmM6K7ubc&list=PLZoTAELRMXVNjiiawhzZ0afHcPvC8jpcg&index=1&ab_channel=KrishNaik

https://github.com/kevinschaich/pyspark-cheatsheet

# Day 1

In [1]:
import pyspark
import pandas as pd

#### Data Loading

In [2]:
# Create (or reuse) the Spark session used by the cells below.
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName('Practice')
spark = session_builder.getOrCreate()

In [3]:
spark

#### Read Data using Pandas

In [4]:
# Load the shuttle-frequency CSV into a pandas DataFrame and preview the first five rows.
pdf = pd.read_csv("shuttle_service_frq.csv")
pdf.head()

Unnamed: 0,Line No.,SHUTTLE NO.,Jan,Feb,Mar
0,SDL3,3,1,0,0
1,SDL2,26,0,0,0
2,SDL2,27,0,0,1
3,SDL3,28,0,0,1
4,SDL1,29,1,0,0


#### Read Data using Spark

In [5]:
# Read the same CSV with Spark using default options. No header handling is
# requested, so the columns are auto-named _c0.._c4 and the header line is
# read as an ordinary data row.
sdf = spark.read.csv("shuttle_service_frq.csv")

In [6]:
sdf

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string]

In [7]:
sdf.head()

Row(_c0='Line No.', _c1='SHUTTLE NO.', _c2='Jan', _c3='Feb', _c4='Mar')

In [8]:
sdf.show()

+--------+-----------+---+---+---+
|     _c0|        _c1|_c2|_c3|_c4|
+--------+-----------+---+---+---+
|Line No.|SHUTTLE NO.|Jan|Feb|Mar|
|    SDL3|          3|  1|  0|  0|
|    SDL2|         26|  0|  0|  0|
|    SDL2|         27|  0|  0|  1|
|    SDL3|         28|  0|  0|  1|
|    SDL1|         29|  1|  0|  0|
|    SDL3|         30|  1|  0|  0|
|    SDL2|         31|  0|  0|  0|
|    SDL2|         32|  3|  0|  1|
|    SDL3|         33|  0|  1|  0|
|    SDL3|         34|  0|  0|  0|
|    SDL4|         35|  1|  0|  1|
|    SDL3|         36|  0|  1|  0|
|    SDL1|         37|  0|  0|  0|
|    SDL1|         38|  0|  0|  0|
|    SDL3|         39|  0|  0|  0|
|    SDL1|         40|  0|  0|  0|
|    SDL3|         41|  0|  0|  0|
|    SDL3|         42|  0|  0|  1|
|    SDL4|         43|  0|  0|  1|
+--------+-----------+---+---+---+
only showing top 20 rows



In [9]:
# sdf = spark.read.csv("shuttle_service_frq.csv")
# Re-read with header=true so the first line supplies the column names.
# No schema is inferred here, so every column is still typed as string.
sdf = spark.read.option('header', 'true').csv("shuttle_service_frq.csv")
sdf

DataFrame[Line No.: string, SHUTTLE NO.: string, Jan: string, Feb: string, Mar: string]

In [10]:
sdf.show()

+--------+-----------+---+---+---+
|Line No.|SHUTTLE NO.|Jan|Feb|Mar|
+--------+-----------+---+---+---+
|    SDL3|          3|  1|  0|  0|
|    SDL2|         26|  0|  0|  0|
|    SDL2|         27|  0|  0|  1|
|    SDL3|         28|  0|  0|  1|
|    SDL1|         29|  1|  0|  0|
|    SDL3|         30|  1|  0|  0|
|    SDL2|         31|  0|  0|  0|
|    SDL2|         32|  3|  0|  1|
|    SDL3|         33|  0|  1|  0|
|    SDL3|         34|  0|  0|  0|
|    SDL4|         35|  1|  0|  1|
|    SDL3|         36|  0|  1|  0|
|    SDL1|         37|  0|  0|  0|
|    SDL1|         38|  0|  0|  0|
|    SDL3|         39|  0|  0|  0|
|    SDL1|         40|  0|  0|  0|
|    SDL3|         41|  0|  0|  0|
|    SDL3|         42|  0|  0|  1|
|    SDL4|         43|  0|  0|  1|
|    SDL3|         44|  0|  0|  0|
+--------+-----------+---+---+---+
only showing top 20 rows



In [11]:
sdf.head(3)

[Row(Line No.='SDL3', SHUTTLE NO.='3', Jan='1', Feb='0', Mar='0'),
 Row(Line No.='SDL2', SHUTTLE NO.='26', Jan='0', Feb='0', Mar='0'),
 Row(Line No.='SDL2', SHUTTLE NO.='27', Jan='0', Feb='0', Mar='1')]

In [12]:
# Type of sdf
type(sdf)

pyspark.sql.dataframe.DataFrame

In [13]:
# Print Schema 
sdf.printSchema()

root
 |-- Line No.: string (nullable = true)
 |-- SHUTTLE NO.: string (nullable = true)
 |-- Jan: string (nullable = true)
 |-- Feb: string (nullable = true)
 |-- Mar: string (nullable = true)



# Day 2

In [14]:
# Start the Spark session for this section and display its summary.
# NOTE(review): if the kernel was not restarted, getOrCreate() presumably hands
# back the session created earlier rather than a new "Dataframe" app — confirm.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Dataframe").getOrCreate()
spark

In [15]:
# Read the dataset with a header row; without schema inference every column is read as string.
sdf = spark.read.option('header', 'true').csv('shuttle_service_frq.csv') # all columns come back as string
sdf

DataFrame[Line No.: string, SHUTTLE NO.: string, Jan: string, Feb: string, Mar: string]

In [16]:
sdf.show(5)

+--------+-----------+---+---+---+
|Line No.|SHUTTLE NO.|Jan|Feb|Mar|
+--------+-----------+---+---+---+
|    SDL3|          3|  1|  0|  0|
|    SDL2|         26|  0|  0|  0|
|    SDL2|         27|  0|  0|  1|
|    SDL3|         28|  0|  0|  1|
|    SDL1|         29|  1|  0|  0|
+--------+-----------+---+---+---+
only showing top 5 rows



In [17]:
sdf.printSchema()

root
 |-- Line No.: string (nullable = true)
 |-- SHUTTLE NO.: string (nullable = true)
 |-- Jan: string (nullable = true)
 |-- Feb: string (nullable = true)
 |-- Mar: string (nullable = true)



In [18]:
# Read the dataset again, this time with inferSchema=True so the numeric
# columns are typed as int instead of string.
sdf = spark.read.option('header', 'true').csv('shuttle_service_frq.csv', inferSchema=True) 
sdf

DataFrame[Line No.: string, SHUTTLE NO.: int, Jan: int, Feb: int, Mar: int]

In [19]:
sdf.printSchema()

root
 |-- Line No.: string (nullable = true)
 |-- SHUTTLE NO.: integer (nullable = true)
 |-- Jan: integer (nullable = true)
 |-- Feb: integer (nullable = true)
 |-- Mar: integer (nullable = true)



### OR

In [20]:
# Equivalent read: header and inferSchema passed as keyword arguments to csv()
# instead of going through .option().
sdf = spark.read.csv('shuttle_service_frq.csv', header=True, inferSchema=True)
sdf

DataFrame[Line No.: string, SHUTTLE NO.: int, Jan: int, Feb: int, Mar: int]

In [21]:
sdf.show(3)

+--------+-----------+---+---+---+
|Line No.|SHUTTLE NO.|Jan|Feb|Mar|
+--------+-----------+---+---+---+
|    SDL3|          3|  1|  0|  0|
|    SDL2|         26|  0|  0|  0|
|    SDL2|         27|  0|  0|  1|
+--------+-----------+---+---+---+
only showing top 3 rows



In [22]:
sdf.printSchema()

root
 |-- Line No.: string (nullable = true)
 |-- SHUTTLE NO.: integer (nullable = true)
 |-- Jan: integer (nullable = true)
 |-- Feb: integer (nullable = true)
 |-- Mar: integer (nullable = true)



In [23]:
# Select the "Line No." column.
# The name contains a space and a dot, so it is quoted with backticks when
# passed to select().
line_no_df = sdf.select('`Line No.`')

print("sdf : ")
print(line_no_df)
print()
print("Type : ")
print(type(line_no_df))
print()
line_no_df.show(3)

sdf : 
DataFrame[Line No.: string]

Type : 
<class 'pyspark.sql.dataframe.DataFrame'>

+--------+
|Line No.|
+--------+
|    SDL3|
|    SDL2|
|    SDL2|
+--------+
only showing top 3 rows



In [24]:
# Print the "Jan" column.
# A plain name with no spaces or dots is passed to select() as-is, without backticks.
print("sdf : ")
print(sdf.select('Jan'))
print()
print("Type : ")
print(type(sdf.select('Jan')))
print()
sdf.select('Jan').show(3)

sdf : 
DataFrame[Jan: int]

Type : 
<class 'pyspark.sql.dataframe.DataFrame'>

+---+
|Jan|
+---+
|  1|
|  0|
|  0|
+---+
only showing top 3 rows



In [25]:
# Select several columns at once by passing a list of names.
# Only "Line No." is wrapped in backticks; "Jan" is passed as a plain name.
line_and_jan_df = sdf.select(['`Line No.`', 'Jan'])

print("sdf : ")
print(line_and_jan_df)
print()
print("Type : ")
print(type(line_and_jan_df))
print()
line_and_jan_df.show(3)

sdf : 
DataFrame[Line No.: string, Jan: int]

Type : 
<class 'pyspark.sql.dataframe.DataFrame'>

+--------+---+
|Line No.|Jan|
+--------+---+
|    SDL3|  1|
|    SDL2|  0|
|    SDL2|  0|
+--------+---+
only showing top 3 rows



##### Column Rename

In [26]:
sdf.columns 

['Line No.', 'SHUTTLE NO.', 'Jan', 'Feb', 'Mar']

In [27]:
# Normalise the column names so they no longer need backticks:
# spaces become underscores and dots are removed.
original_names = sdf.columns
cleaned_names = [name.replace(' ', '_').replace('.', '') for name in original_names]

# Rename one column at a time, echoing each new name as it is applied.
for old_name, cleaned_name in zip(original_names, cleaned_names):
    print(cleaned_name)
    sdf = sdf.withColumnRenamed(old_name, cleaned_name)

sdf

Line_No
SHUTTLE_NO
Jan
Feb
Mar


DataFrame[Line_No: string, SHUTTLE_NO: int, Jan: int, Feb: int, Mar: int]

In [28]:
# Summary statistics.
# describe() itself returns a DataFrame, so print() only shows its schema
# (every statistic column is typed as string); show(3) renders the first three
# summary rows.
print(sdf.describe())
print()
sdf.describe().show(3)

DataFrame[summary: string, Line_No: string, SHUTTLE_NO: string, Jan: string, Feb: string, Mar: string]

+-------+-------+------------------+------------------+-------------------+-------------------+
|summary|Line_No|        SHUTTLE_NO|               Jan|                Feb|                Mar|
+-------+-------+------------------+------------------+-------------------+-------------------+
|  count|    132|               134|               134|                134|                134|
|   mean|   NULL|121.73134328358209|0.4253731343283582|0.14925373134328357|0.41044776119402987|
| stddev|   NULL|136.46003376345536|0.8076408201115163| 0.3781128980976144| 0.6740371753782891|
+-------+-------+------------------+------------------+-------------------+-------------------+
only showing top 3 rows



#### Adding Column

In [29]:
# Derive a new 'April' column as Mar + 2. The result is bound to `add`;
# `sdf` is not reassigned here.
add = sdf.withColumn('April', sdf['Mar']+2)
add.show(5)

+-------+----------+---+---+---+-----+
|Line_No|SHUTTLE_NO|Jan|Feb|Mar|April|
+-------+----------+---+---+---+-----+
|   SDL3|         3|  1|  0|  0|    2|
|   SDL2|        26|  0|  0|  0|    2|
|   SDL2|        27|  0|  0|  1|    3|
|   SDL3|        28|  0|  0|  1|    3|
|   SDL1|        29|  1|  0|  0|    2|
+-------+----------+---+---+---+-----+
only showing top 5 rows



In [30]:
# Remove the 'April' column again; the result is bound to `drop`.
drop = add.drop('April')
drop.show(5)

+-------+----------+---+---+---+
|Line_No|SHUTTLE_NO|Jan|Feb|Mar|
+-------+----------+---+---+---+
|   SDL3|         3|  1|  0|  0|
|   SDL2|        26|  0|  0|  0|
|   SDL2|        27|  0|  0|  1|
|   SDL3|        28|  0|  0|  1|
|   SDL1|        29|  1|  0|  0|
+-------+----------+---+---+---+
only showing top 5 rows



# Day 3

In [31]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Practice').getOrCreate()
spark

In [32]:
sdf = spark.read.csv('shuttle_service_frq.csv', header=True, inferSchema=True)
sdf

DataFrame[Line No.: string, SHUTTLE NO.: int, Jan: int, Feb: int, Mar: int]

In [33]:
sdf.show(3)

+--------+-----------+---+---+---+
|Line No.|SHUTTLE NO.|Jan|Feb|Mar|
+--------+-----------+---+---+---+
|    SDL3|          3|  1|  0|  0|
|    SDL2|         26|  0|  0|  0|
|    SDL2|         27|  0|  0|  1|
+--------+-----------+---+---+---+
only showing top 3 rows



In [34]:
# Rename columns so they no longer need backticks:
# spaces become underscores and dots are removed. Each new name is printed
# as it is applied, and `sdf` is rebound to the renamed DataFrame.
for column in sdf.columns:
    new_column_name = column.replace(' ', '_').replace('.', '')
    print(new_column_name)
    sdf = sdf.withColumnRenamed(column, new_column_name)


Line_No
SHUTTLE_NO
Jan
Feb
Mar


In [35]:
# Normalise column names: spaces become underscores and dots are removed.
# When the names are already clean, each rename maps a column to its current
# name, so re-running this leaves the schema unchanged.
for col in sdf.columns:
    renamed_col = col.replace(' ', '_').replace('.', '')
    print(renamed_col)
    sdf = sdf.withColumnRenamed(col, renamed_col)

print()
sdf

Line_No
SHUTTLE_NO
Jan
Feb
Mar



DataFrame[Line_No: string, SHUTTLE_NO: int, Jan: int, Feb: int, Mar: int]

#### Dropping Null Values

In [36]:
# Drop rows containing nulls, four different ways
sdf.na.drop(how='any').show()   # drop a row if ANY of its columns is null
sdf.na.drop(how='all').show()   # drop a row only if ALL of its columns are null
sdf.na.drop(how='any', thresh=2).show()   # thresh overrides how: keep only rows that have at least 2 non-null values
sdf.na.drop(how='any',subset='Jan').show()   # drop a row if the 'Jan' column is null

+-------+----------+---+---+---+
|Line_No|SHUTTLE_NO|Jan|Feb|Mar|
+-------+----------+---+---+---+
|   SDL3|         3|  1|  0|  0|
|   SDL2|        26|  0|  0|  0|
|   SDL2|        27|  0|  0|  1|
|   SDL3|        28|  0|  0|  1|
|   SDL1|        29|  1|  0|  0|
|   SDL3|        30|  1|  0|  0|
|   SDL2|        31|  0|  0|  0|
|   SDL2|        32|  3|  0|  1|
|   SDL3|        33|  0|  1|  0|
|   SDL3|        34|  0|  0|  0|
|   SDL4|        35|  1|  0|  1|
|   SDL3|        36|  0|  1|  0|
|   SDL1|        37|  0|  0|  0|
|   SDL1|        38|  0|  0|  0|
|   SDL3|        39|  0|  0|  0|
|   SDL1|        40|  0|  0|  0|
|   SDL3|        41|  0|  0|  0|
|   SDL3|        42|  0|  0|  1|
|   SDL4|        43|  0|  0|  1|
|   SDL3|        44|  0|  0|  0|
+-------+----------+---+---+---+
only showing top 20 rows

+-------+----------+---+---+---+
|Line_No|SHUTTLE_NO|Jan|Feb|Mar|
+-------+----------+---+---+---+
|   SDL3|         3|  1|  0|  0|
|   SDL2|        26|  0|  0|  0|
|   SDL2|        

#### Filling Null Values

In [37]:
# na.fill only touches columns whose data type matches the fill value:
# a string placeholder fills string columns and is silently ignored for the
# integer month columns, so numeric columns need a numeric fill value.
from pyspark.sql import functions as F

sdf.na.fill("Missing Values").show(5)    # string placeholder -> fills nulls in string columns only (Line_No)
sdf.na.fill(0, ['Jan', 'Feb']).show(5)   # numeric value -> fills nulls in the integer columns 'Jan' and 'Feb'

# Fill 'Jan' and 'Feb' with their own column means.
# The means are floats; Spark casts the fill value to the column's type, so on
# these integer columns the filled value is the mean truncated to an int.
column_means = sdf.select(
    F.mean('Jan').alias('Jan'),
    F.mean('Feb').alias('Feb'),
).first().asDict()
sdf.na.fill(column_means).show(5)        # dict form: one fill value per column

+-------+----------+---+---+---+
|Line_No|SHUTTLE_NO|Jan|Feb|Mar|
+-------+----------+---+---+---+
|   SDL3|         3|  1|  0|  0|
|   SDL2|        26|  0|  0|  0|
|   SDL2|        27|  0|  0|  1|
|   SDL3|        28|  0|  0|  1|
|   SDL1|        29|  1|  0|  0|
+-------+----------+---+---+---+
only showing top 5 rows

+-------+----------+---+---+---+
|Line_No|SHUTTLE_NO|Jan|Feb|Mar|
+-------+----------+---+---+---+
|   SDL3|         3|  1|  0|  0|
|   SDL2|        26|  0|  0|  0|
|   SDL2|        27|  0|  0|  1|
|   SDL3|        28|  0|  0|  1|
|   SDL1|        29|  1|  0|  0|
+-------+----------+---+---+---+
only showing top 5 rows

+-------+----------+---+---+---+
|Line_No|SHUTTLE_NO|Jan|Feb|Mar|
+-------+----------+---+---+---+
|   SDL3|         3|  1|  0|  0|
|   SDL2|        26|  0|  0|  0|
|   SDL2|        27|  0|  0|  1|
|   SDL3|        28|  0|  0|  1|
|   SDL1|        29|  1|  0|  0|
+-------+----------+---+---+---+
only showing top 5 rows



In [38]:
from pyspark.ml.feature import Imputer

# Columns to impute; each one gets a companion "<name>_imputed" output column.
month_cols = ['Jan', 'Feb', 'Mar']
imputed_cols = [name + "_imputed" for name in month_cols]

# Configure the imputer to replace missing values with the column mean.
imputer = Imputer(inputCols=month_cols, outputCols=imputed_cols)
imputer.setStrategy('mean')

# Fit on the DataFrame, then apply the fitted model to the same DataFrame.
imputer_model = imputer.fit(sdf)
sdf_imputed = imputer_model.transform(sdf)

In [39]:
# Show the result
sdf_imputed.show()

+-------+----------+---+---+---+-----------+-----------+-----------+
|Line_No|SHUTTLE_NO|Jan|Feb|Mar|Jan_imputed|Feb_imputed|Mar_imputed|
+-------+----------+---+---+---+-----------+-----------+-----------+
|   SDL3|         3|  1|  0|  0|          1|          0|          0|
|   SDL2|        26|  0|  0|  0|          0|          0|          0|
|   SDL2|        27|  0|  0|  1|          0|          0|          1|
|   SDL3|        28|  0|  0|  1|          0|          0|          1|
|   SDL1|        29|  1|  0|  0|          1|          0|          0|
|   SDL3|        30|  1|  0|  0|          1|          0|          0|
|   SDL2|        31|  0|  0|  0|          0|          0|          0|
|   SDL2|        32|  3|  0|  1|          3|          0|          1|
|   SDL3|        33|  0|  1|  0|          0|          1|          0|
|   SDL3|        34|  0|  0|  0|          0|          0|          0|
|   SDL4|        35|  1|  0|  1|          1|          0|          1|
|   SDL3|        36|  0|  1|  0|  

# Day 4

In [43]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Practice").getOrCreate()
spark

In [44]:
sdf = spark.read.csv('shuttle_service_frq.csv', header=True, inferSchema=True)
sdf

DataFrame[Line No.: string, SHUTTLE NO.: int, Jan: int, Feb: int, Mar: int]

In [46]:
sdf.show(3)

+--------+-----------+---+---+---+
|Line No.|SHUTTLE NO.|Jan|Feb|Mar|
+--------+-----------+---+---+---+
|    SDL3|          3|  1|  0|  0|
|    SDL2|         26|  0|  0|  0|
|    SDL2|         27|  0|  0|  1|
+--------+-----------+---+---+---+
only showing top 3 rows



#### Filter Operation

In [51]:
# Keep only the rows where Jan >= 2, using a SQL expression string as the predicate.
# sdf.filter('Jan>=2').show()
sdf.filter('Jan>=2').show()
# sdf.filter('Jan>=2').select(['jan', 'Feb']).show(5)

+--------+-----------+---+---+---+
|Line No.|SHUTTLE NO.|Jan|Feb|Mar|
+--------+-----------+---+---+---+
|    SDL2|         32|  3|  0|  1|
|    SDL1|         62|  3|  0|  2|
|    SDL2|         84|  3|  0|  0|
|    SDL2|         95|  2|  0|  0|
|    SDL2|        108|  2|  0|  0|
|    SDL1|        110|  2|  0|  2|
|    SDL1|        115|  2|  1|  2|
|    SDL4|        123|  3|  0|  1|
|    SDL2|        124|  2|  0|  0|
|    SDL4|        125|  4|  0|  0|
|    SDL1|        144|  3|  0|  3|
|    SDL2|        151|  2|  0|  1|
+--------+-----------+---+---+---+



###### OR

In [50]:
# Same filter as above, written as a Column comparison instead of a SQL string.
sdf.filter(sdf['Jan']>=2).show()

+--------+-----------+---+---+---+
|Line No.|SHUTTLE NO.|Jan|Feb|Mar|
+--------+-----------+---+---+---+
|    SDL2|         32|  3|  0|  1|
|    SDL1|         62|  3|  0|  2|
|    SDL2|         84|  3|  0|  0|
|    SDL2|         95|  2|  0|  0|
|    SDL2|        108|  2|  0|  0|
|    SDL1|        110|  2|  0|  2|
|    SDL1|        115|  2|  1|  2|
|    SDL4|        123|  3|  0|  1|
|    SDL2|        124|  2|  0|  0|
|    SDL4|        125|  4|  0|  0|
|    SDL1|        144|  3|  0|  3|
|    SDL2|        151|  2|  0|  1|
+--------+-----------+---+---+---+



In [62]:
# Combine two Column conditions with & (logical AND). Each comparison is
# parenthesised because Python's & binds more tightly than >=.
sdf.filter((sdf['Jan']>=2) & (sdf['Feb']>=1)).show()

+--------+-----------+---+---+---+
|Line No.|SHUTTLE NO.|Jan|Feb|Mar|
+--------+-----------+---+---+---+
|    SDL1|        115|  2|  1|  2|
+--------+-----------+---+---+---+



In [66]:
# Use of NOT: ~ negates a Column condition.
# Keeps rows where neither Jan >= 1 nor Feb >= 1 holds.
sdf.filter(~(sdf['Jan']>=1) & ~(sdf['Feb']>=1)).show(5)

+--------+-----------+---+---+---+
|Line No.|SHUTTLE NO.|Jan|Feb|Mar|
+--------+-----------+---+---+---+
|    SDL2|         26|  0|  0|  0|
|    SDL2|         27|  0|  0|  1|
|    SDL3|         28|  0|  0|  1|
|    SDL2|         31|  0|  0|  0|
|    SDL3|         34|  0|  0|  0|
+--------+-----------+---+---+---+
only showing top 5 rows



# Day 5

In [68]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session and display its summary.
# The app name is spelled "Practice", matching the other session cells in
# this notebook.
spark  = SparkSession.builder.appName("Practice").getOrCreate()
spark

In [69]:
sdf = spark.read.csv('shuttle_service_frq.csv', header=True, inferSchema=True)
sdf

DataFrame[Line No.: string, SHUTTLE NO.: int, Jan: int, Feb: int, Mar: int]

In [72]:
sdf.show(5)

+--------+-----------+---+---+---+
|Line No.|SHUTTLE NO.|Jan|Feb|Mar|
+--------+-----------+---+---+---+
|    SDL3|          3|  1|  0|  0|
|    SDL2|         26|  0|  0|  0|
|    SDL2|         27|  0|  0|  1|
|    SDL3|         28|  0|  0|  1|
|    SDL1|         29|  1|  0|  0|
+--------+-----------+---+---+---+
only showing top 5 rows



#### Groupby/Aggregation Function

In [80]:
# Normalise column names before grouping: dots are removed and spaces become
# underscores, so the columns can be referenced without backticks.
for col in sdf.columns:
    new_col = col.replace(".", "").replace(" ", "_")
    sdf = sdf.withColumnRenamed(col, new_col)

In [79]:
sdf.show(2)

+-------+----------+---+---+---+
|Line_No|SHUTTLE_NO|Jan|Feb|Mar|
+-------+----------+---+---+---+
|   SDL3|         3|  1|  0|  0|
|   SDL2|        26|  0|  0|  0|
+-------+----------+---+---+---+
only showing top 2 rows



In [83]:
# Group rows by their Jan value and sum every numeric column within each group.
# With no column names passed to sum(), this includes SHUTTLE_NO and Jan itself.
sdf.groupby('Jan').sum().show()

+---+---------------+--------+--------+--------+
|Jan|sum(SHUTTLE_NO)|sum(Jan)|sum(Feb)|sum(Mar)|
+---+---------------+--------+--------+--------+
|  1|           2714|      26|       6|       9|
|  3|            445|      15|       0|       7|
|  4|            125|       4|       0|       0|
|  2|            703|      12|       1|       5|
|  0|          12325|       0|      13|      34|
+---+---------------+--------+--------+--------+



In [86]:
# Aggregate over the whole DataFrame (no grouping): {column: function} dict
# form, here the total of the Jan column.
sdf.agg({'Jan':'sum'}).show()

+--------+
|sum(Jan)|
+--------+
|      57|
+--------+



In [89]:
# Per-group minimum, maximum and average of every numeric column, grouped by Jan.
sdf.groupby('Jan').min().show()
sdf.groupby('Jan').max().show()
sdf.groupby('Jan').avg().show() 

+---+---------------+--------+--------+--------+
|Jan|min(SHUTTLE_NO)|min(Jan)|min(Feb)|min(Mar)|
+---+---------------+--------+--------+--------+
|  1|              3|       1|       0|       0|
|  3|             32|       3|       0|       0|
|  4|            125|       4|       0|       0|
|  2|             95|       2|       0|       0|
|  0|             26|       0|       0|       0|
+---+---------------+--------+--------+--------+

+---+---------------+--------+--------+--------+
|Jan|max(SHUTTLE_NO)|max(Jan)|max(Feb)|max(Mar)|
+---+---------------+--------+--------+--------+
|  1|            451|       1|       1|       2|
|  3|            144|       3|       0|       3|
|  4|            125|       4|       0|       0|
|  2|            151|       2|       1|       2|
|  0|            787|       0|       2|       3|
+---+---------------+--------+--------+--------+

+---+------------------+--------+-------------------+-------------------+
|Jan|   avg(SHUTTLE_NO)|avg(Jan)|         

# Day 6

In [91]:
from pyspark.sql import SparkSession

sp = SparkSession.builder.appName("mLib").getOrCreate()
sp

In [92]:
sdf = sp.read.csv('shuttle_service_frq.csv', header=True, inferSchema=True)
sdf

DataFrame[Line No.: string, SHUTTLE NO.: int, Jan: int, Feb: int, Mar: int]

In [94]:
sdf.show(3)

+--------+-----------+---+---+---+
|Line No.|SHUTTLE NO.|Jan|Feb|Mar|
+--------+-----------+---+---+---+
|    SDL3|          3|  1|  0|  0|
|    SDL2|         26|  0|  0|  0|
|    SDL2|         27|  0|  0|  1|
+--------+-----------+---+---+---+
only showing top 3 rows



##### MLlib

In [96]:
# Normalise all column names in one step: spaces become underscores and dots
# are removed; toDF() returns the same data under the new names.
cleaned_names = [name.replace(' ', '_').replace(".", '') for name in sdf.columns]
sdf = sdf.toDF(*cleaned_names)

In [101]:
sdf.show(5)

+-------+----------+---+---+---+
|Line_No|SHUTTLE_NO|Jan|Feb|Mar|
+-------+----------+---+---+---+
|   SDL3|         3|  1|  0|  0|
|   SDL2|        26|  0|  0|  0|
|   SDL2|        27|  0|  0|  1|
|   SDL3|        28|  0|  0|  1|
|   SDL1|        29|  1|  0|  0|
+-------+----------+---+---+---+
only showing top 5 rows

