In [0]:
import pyspark

In [0]:
df = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/dataraiz91@gmail.com/sales_info.csv")
df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|  200|
|   GOOG|Charlie|  120|
|   GOOG|  Frank|  340|
|   MSFT|   Tina|  600|
|   MSFT|    Amy|  124|
|   MSFT|Vanessa|  243|
|     FB|   Carl|  870|
|     FB|  Sarah|  350|
|   APPL|   John|  250|
|   APPL|  Linda|  130|
|   APPL|   Mike|  750|
|   APPL|  Chris|  350|
+-------+-------+-----+



In [0]:
df.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: string (nullable = true)



In [0]:
from pyspark.sql.types import StructField, IntegerType, StructType, StringType


In [0]:
data_schema = [StructField('Company', StringType(), True), StructField('Person', StringType(), True), StructField('Sales', IntegerType(), True)]
final_struc = StructType(fields=data_schema)
df = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/dataraiz91@gmail.com/sales_info.csv", schema=final_struc)

In [0]:
df.groupBy("Company").mean("Sales").show()

+-------+-----------------+
|Company|       avg(Sales)|
+-------+-----------------+
|   APPL|            250.0|
|   GOOG|            220.0|
|     FB|            610.0|
|   MSFT|322.3333333333333|
+-------+-----------------+



In [0]:
df.groupBy("Company").count().show()

+-------+-----+
|Company|count|
+-------+-----+
|   APPL|    4|
|   GOOG|    3|
|     FB|    2|
|   MSFT|    3|
+-------+-----+



In [0]:
df.agg({'Sales': 'sum'}).show()

+----------+
|sum(Sales)|
+----------+
|      3097|
+----------+



In [0]:
group_data = df.groupBy('Company')
group_data.agg({'Sales': 'max'}).show()

+-------+----------+
|Company|max(Sales)|
+-------+----------+
|   APPL|       250|
|   GOOG|       340|
|     FB|       870|
|   MSFT|       600|
+-------+----------+



In [0]:
from pyspark.sql.functions import countDistinct, avg, stddev

In [0]:
df.select(avg('Sales').alias('Average Sales')).show()

+-----------------+
|    Average Sales|
+-----------------+
|344.1111111111111|
+-----------------+



In [0]:
df.select(stddev('Sales')).show()

+------------------+
|stddev_samp(Sales)|
+------------------+
|245.10530616678028|
+------------------+



In [0]:
from pyspark.sql.functions import format_number

In [0]:
sales_std = df.select(stddev('Sales').alias('std'))
sales_std.show()

+------------------+
|               std|
+------------------+
|245.10530616678028|
+------------------+



In [0]:
sales_std.select(format_number('std', 2).alias('round_std')).show()

+---------+
|round_std|
+---------+
|   245.11|
+---------+



In [0]:
df.orderBy(df['sales'].desc()).show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|     FB|   Carl|  870|
|   MSFT|   Tina|  600|
|     FB|  Sarah|  350|
|   GOOG|  Frank|  340|
|   APPL|   John|  250|
|   MSFT|Vanessa|  243|
|   GOOG|    Sam|  200|
|   MSFT|    Amy|  124|
|   GOOG|Charlie|  120|
|   APPL|  Linda| null|
|   APPL|   Mike| null|
|   APPL|  Chris| null|
+-------+-------+-----+

