In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session for this notebook (or reuse one that already
# exists) under the application name 'Learning'.
spark = (
    SparkSession.builder
    .appName('Learning')
    .getOrCreate()
)

In [4]:
# Evaluate the session object as the cell's last expression so the notebook
# can display it.
spark

In [5]:
# Plain read with no options: the header line comes back as an ordinary data
# row and the columns get the placeholder names _c0/_c1 (visible in the
# output of the next cell).
df_spark = spark.read.csv(path='data.csv')

In [7]:
# Print the rows of the header-less read as a text table.
df_spark.show()

+-------+---+
|    _c0|_c1|
+-------+---+
|   Name|age|
| George| 23|
|Githiri| 54|
+-------+---+



In [9]:
# Read the file again, this time telling the reader that the first line
# holds the column names.
df_spark = (
    spark.read
    .option('header', 'true')
    .csv('data.csv')
)

In [10]:
# Name/age now appear as column names instead of as the first data row.
df_spark.show()

+-------+---+
|   Name|age|
+-------+---+
| George| 23|
|Githiri| 54|
+-------+---+



In [11]:
# Check the Python type of the object returned by the CSV reader.
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [12]:
# Print column names and types. No schema inference was requested for this
# read, and both columns (including age) are reported as string.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)



In [18]:
# Read the dataset again, this time asking Spark to infer the column types.
# NOTE: `Df_spark` (capital D) is a separate variable from `df_spark`;
# several later cells refer to this exact name, so it is kept as is.
Df_spark = (
    spark.read
    .option('header', 'true')
    .csv('data.csv', inferSchema=True)
)

In [19]:
# Print the rows of the schema-inferred DataFrame.
Df_spark.show()

+-------+---+----------+
|   Name|age|Experience|
+-------+---+----------+
| George| 23|         4|
|Githiri| 54|         2|
|   John| 56|        10|
+-------+---+----------+



In [20]:
# With inferSchema=True the age and Experience columns are reported as
# integer; Name stays string.
Df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [21]:
# Same read expressed purely through keyword arguments of csv(), followed by
# a printout of the rows.
df_spark = spark.read.csv(
    'data.csv',
    header=True,
    inferSchema=True,
)
df_spark.show()

+-------+---+----------+
|   Name|age|Experience|
+-------+---+----------+
| George| 23|         4|
|Githiri| 54|         2|
|   John| 56|        10|
+-------+---+----------+



In [22]:
# Confirm the keyword-argument read also yields a Spark DataFrame.
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [24]:
# Project a single column and print it.
(
    Df_spark
    .select('Name')
    .show()
)

+-------+
|   Name|
+-------+
| George|
|Githiri|
|   John|
+-------+



In [25]:
# Project two columns; select() takes the names either as a list or as
# separate positional arguments.
Df_spark.select('Name', 'Experience').show()

+-------+----------+
|   Name|Experience|
+-------+----------+
| George|         4|
|Githiri|         2|
|   John|        10|
+-------+----------+



In [26]:
# List the (column name, type name) pairs of the schema-inferred DataFrame.
Df_spark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int')]

In [27]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# mean/stddev come back as null for the string column Name.
df_spark.describe().show()

+-------+------+------------------+-----------------+
|summary|  Name|               age|       Experience|
+-------+------+------------------+-----------------+
|  count|     3|                 3|                3|
|   mean|  null|44.333333333333336|5.333333333333333|
| stddev|  null|18.502252115170556|4.163331998932265|
|    min|George|                23|                2|
|    max|  John|                56|               10|
+-------+------+------------------+-----------------+



In [29]:
# Adding a column to the DataFrame.
# The result is only displayed, not assigned, so df_spark itself keeps its
# original three columns.
df_spark.withColumn(
    'Experience After 2 years',
    df_spark['Experience'] + 2,
).show()

+-------+---+----------+------------------------+
|   Name|age|Experience|Experience After 2 years|
+-------+---+----------+------------------------+
| George| 23|         4|                       6|
|Githiri| 54|         2|                       4|
|   John| 56|        10|                      12|
+-------+---+----------+------------------------+



In [30]:
# Dropping a column.
# df_spark never kept 'Experience After 2 years' (the withColumn result was
# only displayed, not assigned), so dropping that name from df_spark directly
# is a silent no-op. Build the column first so the drop actually has
# something to remove. Nothing is assigned here, so df_spark is left
# untouched and the displayed result is still a DataFrame with the Name, age
# and Experience columns.
df_spark.withColumn(
    'Experience After 2 years', df_spark['Experience'] + 2
).drop('Experience After 2 years')

DataFrame[Name: string, age: int, Experience: int]

In [31]:
# df_spark still has only Name, age and Experience: the results of the
# earlier withColumn/drop calls were never assigned back to it.
df_spark.show()

+-------+---+----------+
|   Name|age|Experience|
+-------+---+----------+
| George| 23|         4|
|Githiri| 54|         2|
|   John| 56|        10|
+-------+---+----------+



In [32]:
# Show the data with the Name column relabelled as 'New Name'. The result is
# not assigned, so df_spark keeps the original column name.
df_spark.withColumnRenamed(
    'Name',
    'New Name',
).show()

+--------+---+----------+
|New Name|age|Experience|
+--------+---+----------+
|  George| 23|         4|
| Githiri| 54|         2|
|    John| 56|        10|
+--------+---+----------+



In [33]:
# Reload the file with only the header option set (no inferSchema here) and
# print it; this version of the data contains missing values.
# NOTE(review): without inferSchema the numeric-looking columns are
# presumably read as strings, as in the earlier printSchema output; confirm
# before relying on numeric behaviour.
df_spark = (
    spark.read
    .option('header', 'true')
    .csv('data.csv')
)
df_spark.show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
| George|  23|         4| 30000|
|Githiri|  54|         2| 25000|
|   John|  56|        10| 20000|
|   Paul|  24|         3| 20000|
| Harsha|  21|         1| 15000|
|Shubham|  23|         2| 18000|
| Mahesh|null|      null| 40000|
|   null|  34|        10| 38000|
|   null|  36|      null|  null|
+-------+----+----------+------+



In [34]:
# Dropping a column: display the data without the Name column. The result is
# not assigned, so df_spark keeps all of its columns.
(
    df_spark
    .drop('Name')
    .show()
)

+----+----------+------+
| age|Experience|Salary|
+----+----------+------+
|  23|         4| 30000|
|  54|         2| 25000|
|  56|        10| 20000|
|  24|         3| 20000|
|  21|         1| 15000|
|  23|         2| 18000|
|null|      null| 40000|
|  34|        10| 38000|
|  36|      null|  null|
+----+----------+------+



In [35]:
# The Name column is still present: the previous drop was display-only.
df_spark.show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
| George|  23|         4| 30000|
|Githiri|  54|         2| 25000|
|   John|  56|        10| 20000|
|   Paul|  24|         3| 20000|
| Harsha|  21|         1| 15000|
|Shubham|  23|         2| 18000|
| Mahesh|null|      null| 40000|
|   null|  34|        10| 38000|
|   null|  36|      null|  null|
+-------+----+----------+------+



In [36]:
# Remove every row that contains at least one null. how='any' is the
# default of na.drop(); it is spelled out here to make the rule explicit.
df_spark.na.drop(how='any').show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
| George| 23|         4| 30000|
|Githiri| 54|         2| 25000|
|   John| 56|        10| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [39]:
# Keep only rows that have at least 2 non-null values.
# `thresh` overrides `how`, so the former how='any' argument had no effect
# and wrongly suggested that any row containing a null would be removed.
# It is dropped here; the result is unchanged (rows with nulls but >= 2
# populated fields are still kept).
df_spark.na.drop(thresh=2).show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
| George|  23|         4| 30000|
|Githiri|  54|         2| 25000|
|   John|  56|        10| 20000|
|   Paul|  24|         3| 20000|
| Harsha|  21|         1| 15000|
|Shubham|  23|         2| 18000|
| Mahesh|null|      null| 40000|
|   null|  34|        10| 38000|
+-------+----+----------+------+



In [40]:
# Remove rows only when the Name column is null; nulls in the other columns
# do not cause a row to be dropped.
df_spark.na.drop(
    how='any',
    subset=['Name'],
).show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
| George|  23|         4| 30000|
|Githiri|  54|         2| 25000|
|   John|  56|        10| 20000|
|   Paul|  24|         3| 20000|
| Harsha|  21|         1| 15000|
|Shubham|  23|         2| 18000|
| Mahesh|null|      null| 40000|
+-------+----+----------+------+



In [41]:
# Filling missing values: replace nulls with the literal text 'Missing'.
# In the output every column gets the placeholder.
# NOTE(review): presumably this reaches age/Experience/Salary only because
# the file was loaded without inferSchema (string columns); confirm.
(
    df_spark
    .na
    .fill('Missing')
    .show()
)

+-------+-------+----------+-------+
|   Name|    age|Experience| Salary|
+-------+-------+----------+-------+
| George|     23|         4|  30000|
|Githiri|     54|         2|  25000|
|   John|     56|        10|  20000|
|   Paul|     24|         3|  20000|
| Harsha|     21|         1|  15000|
|Shubham|     23|         2|  18000|
| Mahesh|Missing|   Missing|  40000|
|Missing|     34|        10|  38000|
|Missing|     36|   Missing|Missing|
+-------+-------+----------+-------+



In [42]:
# Filter operations
# Salary less than or equal to 20000 (the predicate is `<=`, so rows with a
# Salary of exactly 20000 are included), written as a SQL expression string.
df_spark.filter("Salary<=20000").show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|   John| 56|        10| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [43]:
# Apply the same salary filter, then keep just the Name and age columns.
(
    df_spark
    .filter("Salary<=20000")
    .select('Name', 'age')
    .show()
)

+-------+---+
|   Name|age|
+-------+---+
|   John| 56|
|   Paul| 24|
| Harsha| 21|
|Shubham| 23|
+-------+---+



In [44]:
# Same filter as the SQL-string form, expressed as a Column comparison.
df_spark.filter(
    df_spark['Salary'] <= 20000
).show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|   John| 56|        10| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [45]:
# Keep rows whose Salary lies in the inclusive range 15000..20000.
# Each comparison needs its own parentheses because `&` binds tighter than
# the comparison operators.
df_spark.filter(
    (df_spark['Salary'] <= 20000)
    & (df_spark['Salary'] >= 15000)
).show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|   John| 56|        10| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [47]:
# Load the CSV (header row as column names) into a separate DataFrame named
# `training`.
training = (
    spark.read
    .option('header', 'true')
    .csv('data.csv')
)

In [48]:
# Print the rows of the training DataFrame.
training.show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
| George| 23|         4| 30000|
|Githiri| 54|         2| 25000|
|   John| 56|        10| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+

