## PySpark Practice

In [9]:
from pyspark.sql import SparkSession

In [10]:
saprk = SparkSession.builder.appName('Practice').getOrCreate()

In [38]:
df = saprk.read.csv('test.csv')

In [56]:
df = saprk.read.csv('test.csv', header=True, inferSchema=True)

In [57]:
df.show()

+----------+-----+------+
|      Name|Marks|Branch|
+----------+-----+------+
|Karthikeya|   70|   CSE|
|    Subash|   80|    IT|
|     Manas|   75|   CSE|
|      NULL|   77|  AIML|
|    Anvith|   95|   CSE|
|      NULL| NULL|  NULL|
+----------+-----+------+



In [17]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Marks: integer (nullable = true)



## Part1

In [18]:
df.columns

['Name', 'Marks']

In [21]:
df.head(3)

[Row(Name='Karthikeya', Marks=70),
 Row(Name='Subash', Marks=80),
 Row(Name='Manas', Marks=75)]

In [23]:
df.select('Name').show()

+----------+
|      Name|
+----------+
|Karthikeya|
|    Subash|
|     Manas|
+----------+



In [25]:
df.select(['Name', 'Marks']).show()

+----------+-----+
|      Name|Marks|
+----------+-----+
|Karthikeya|   70|
|    Subash|   80|
|     Manas|   75|
+----------+-----+



In [26]:
df.dtypes

[('Name', 'string'), ('Marks', 'int')]

In [28]:
df.describe().show()

+-------+----------+-----+
|summary|      Name|Marks|
+-------+----------+-----+
|  count|         3|    3|
|   mean|      NULL| 75.0|
| stddev|      NULL|  5.0|
|    min|Karthikeya|   70|
|    max|    Subash|   80|
+-------+----------+-----+



In [31]:
df = df.withColumn('Marks', df['Marks']+2)

In [32]:
df.show()

+----------+-----+
|      Name|Marks|
+----------+-----+
|Karthikeya|   72|
|    Subash|   82|
|     Manas|   77|
+----------+-----+



In [33]:
df.drop('Marks').show()

+----------+
|      Name|
+----------+
|Karthikeya|
|    Subash|
|     Manas|
+----------+



In [35]:
df.withColumnRenamed('Marks', 'New Marks').show()

+----------+---------+
|      Name|New Marks|
+----------+---------+
|Karthikeya|       72|
|    Subash|       82|
|     Manas|       77|
+----------+---------+



In [43]:
df.na.drop().show()

+----------+-----+
|      Name|Marks|
+----------+-----+
|Karthikeya|   70|
|    Subash|   80|
|     Manas|   75|
+----------+-----+



In [45]:
df.na.drop('all').show()

+----------+-----+
|      Name|Marks|
+----------+-----+
|Karthikeya|   70|
|    Subash|   80|
|     Manas|   75|
|      NULL|   77|
+----------+-----+



In [46]:
# tresh, subset parameters also exist

In [48]:
df.na.fill('Missing..', ['Name']).show()

+----------+-----+
|      Name|Marks|
+----------+-----+
|Karthikeya|   70|
|    Subash|   80|
|     Manas|   75|
| Missing..|   77|
| Missing..| NULL|
+----------+-----+



In [50]:
df.filter('Marks>=75').show()

+------+-----+
|  Name|Marks|
+------+-----+
|Subash|   80|
| Manas|   75|
|  NULL|   77|
+------+-----+



In [52]:
df.filter(df['Marks'] >= 75).show()

+------+-----+
|  Name|Marks|
+------+-----+
|Subash|   80|
| Manas|   75|
|  NULL|   77|
+------+-----+



In [58]:
df.filter((df['Marks'] >= 75) & (df['Marks'] <= 85)).show()

+------+-----+------+
|  Name|Marks|Branch|
+------+-----+------+
|Subash|   80|    IT|
| Manas|   75|   CSE|
|  NULL|   77|  AIML|
+------+-----+------+



In [61]:
df.groupBy('Branch').sum().show()

+------+----------+
|Branch|sum(Marks)|
+------+----------+
|  AIML|        77|
|  NULL|      NULL|
|    IT|        80|
|   CSE|       240|
+------+----------+



In [62]:
df.groupBy('Branch').mean().show()

+------+----------+
|Branch|avg(Marks)|
+------+----------+
|  AIML|      77.0|
|  NULL|      NULL|
|    IT|      80.0|
|   CSE|      80.0|
+------+----------+



In [63]:
df.groupBy('Branch').count().show()

+------+-----+
|Branch|count|
+------+-----+
|  AIML|    1|
|  NULL|    1|
|    IT|    1|
|   CSE|    3|
+------+-----+



In [66]:
df.groupBy('Branch').agg({'Marks':'sum', 'Marks':'mean'}).show()

+------+----------+
|Branch|avg(Marks)|
+------+----------+
|  AIML|      77.0|
|  NULL|      NULL|
|    IT|      80.0|
|   CSE|      80.0|
+------+----------+



In [70]:
a,b = df.randomSplit([0.50, 0.50])

In [71]:
a.show(), b.show()

+----------+-----+------+
|      Name|Marks|Branch|
+----------+-----+------+
|    Anvith|   95|   CSE|
|Karthikeya|   70|   CSE|
|    Subash|   80|    IT|
+----------+-----+------+

+-----+-----+------+
| Name|Marks|Branch|
+-----+-----+------+
| NULL| NULL|  NULL|
| NULL|   77|  AIML|
|Manas|   75|   CSE|
+-----+-----+------+



(None, None)

In [72]:
saprk.stop()