Creating Pyspark Session

In [0]:
import pyspark
from pyspark.sql import SparkSession

# Build the notebook's SparkSession, reusing an already-running one if present.
spark = (
    SparkSession.builder
    .appName("pyspark practice")
    .getOrCreate()
)

# Render the session object in the cell output.
display(spark)

In [0]:
%fs
ls /mnt/tf-abfss/data/ds/pyspark/bmw.csv

path,name,size
dbfs:/mnt/tf-abfss/data/ds/pyspark/bmw.csv,bmw.csv,592722


Reading a csv file

In [0]:
# Location of the BMW CSV on the mounted storage.
csv_file = '/mnt/tf-abfss/data/ds/pyspark/bmw.csv'

# Load it with the CSV reader's default options (no header option is set here).
df = spark.read.format('csv').load(csv_file)

In [0]:
# Preview the first 5 rows of the DataFrame that was read without header=True.
# show() prints the table itself and returns None, so it must not be wrapped
# in display() -- display(df.show(5)) would only be handed None.
df.show(5)

In [0]:
# Read the same CSV again, this time treating the first line as the header.
data = spark.read.option('header',True).csv('/mnt/tf-abfss/data/ds/pyspark/bmw.csv')

# show() prints the first 5 rows and returns None, so call it directly rather
# than passing its (None) result to display().
data.show(5)

Creating a DataFrame with header set to True

In [0]:
# Re-read the CSV with an explicit comma separator and the first line used as
# the header, then print the resulting schema.
data = (
    spark.read
    .options(sep=',', header=True)
    .csv('/mnt/tf-abfss/data/ds/pyspark/bmw.csv')
)

data.printSchema()

In [0]:
# List each column together with its data type, as (name, type) pairs.
data.dtypes

In [0]:
# Fetch the first 3 rows of the Spark DataFrame as a list of Row objects.
data.head(3)

In [0]:
# Print the first 2 rows as a formatted table.
data.show(2)

In [0]:
# Return only the first row of the DataFrame.
data.first()

In [0]:
# Compute summary statistics for the columns and print them as a table.
data.describe().show()

In [0]:
# List the column names of the DataFrame.
data.columns

In [0]:
# Count the number of rows in the DataFrame.
data.count()

Creating a new column Year_of_purchase from the existing year column

In [0]:
# Add Year_of_purchase as a copy of the existing year column.
data = data.withColumn('Year_of_purchase', data['year'])

# Preview the first 5 rows including the new column.
data.show(5)

Renaming the column

In [0]:
# Rename the added column so that its name starts with a lower-case letter.
data = data.withColumnRenamed(
    existing='Year_of_purchase',
    new='year_of_purchase',
)

# Preview the first 5 rows under the new column name.
data.show(5)

Dropping the extra created column from the Spark DataFrame

In [0]:
# Drop the added column so that only the original columns remain.
# drop() silently ignores names that do not exist in the DataFrame, so the
# name has to match the renamed column ('year_of_purchase') exactly.
data = data.drop('year_of_purchase')

data.show(10)

In [0]:
## threshold: show rows that have at least 3 non-null values
# (when thresh is given it takes precedence over how)
data.dropna(how="any", thresh=3).show(2)

In [0]:
from pyspark.sql.functions import col, lit

checking the null values

In [0]:
# Show the rows whose price is null.
data.where(col("price").isNull()).show()

Handling the missing values (removing rows that contain nulls)

In [0]:
# Remove Rows with Missing Values
# na.drop() returns a new DataFrame and leaves the original untouched, so the
# result has to be assigned back; otherwise show() prints the unfiltered data.
data = data.na.drop()
data.show(5)

Selecting the single column and multiple column

In [0]:
## Project one column

data.select('mileage').show(5)

## Project several columns in a single select

data.select('fuelType', 'mileage', 'mpg', 'engineSize').show(5)

filtering the column

In [0]:
from pyspark.sql.functions import col, lit

# Keep the rows with 20 <= mpg <= 45. Both bounds must hold at once, so the two
# conditions are combined with & -- joining them with | would match every
# non-null mpg value and filter nothing.
data.filter((col('mpg') >= lit(20)) & (col('mpg') <= lit(45))).show(5)

filtering the data by using the between condition

In [0]:
## fetch the rows whose year lies between 2016 and 2019 (both ends included)

data.where(data['year'].between(2016, 2019)).show(2)

Deriving a conditional column by using the when/otherwise condition

In [0]:
from pyspark.sql import functions as f
# Show model, price and transmission next to a 0/1 flag that is 1 when
# engineSize >= 2.0 and 0 otherwise. The flag is given an explicit alias so the
# output column does not fall back to the auto-generated CASE WHEN expression
# text as its name.
data.select('model', 'price','transmission', 
            f.when(data.engineSize >= 2.0, 1).otherwise(0).alias('engine_2l_or_more')
           ).show(5)

Matching the data against a regular expression by using the rlike condition

In [0]:
# For each distinct (model, engineSize) pair, show whether the engineSize
# value matches the regular expression '^[3]', i.e. begins with the digit 3.
(
    data
    .select(
        'model',
        'engineSize',
        data['engineSize'].rlike('^[3]').alias('Engine size that with 3'),
    )
    .distinct()
    .show(5)
)