In [4]:
from pyspark.sql import SparkSession

In [5]:
spark=SparkSession.builder.appName('basics').getOrCreate()

In [4]:
df=spark.read.csv('E:\\ML\\abc\\Notes\\Loadcsv.csv',header=True)

In [5]:
df.show()

+-----------+-------+--------+--------------+
|Client Name|Country| Product|Purchase Price|
+-----------+-------+--------+--------------+
|  Jon Smith|  Japan|Computer|           800|
|Bill Martin|   null|  Tablet|           450|
| Maria Blue| Canada| Printer|           150|
|    Rita Yu| Brazil|  Laptop|         1,200|
|    Jack Mo|     UK| Monitor|           300|
|  Ron Green|   null|  Laptop|         1,200|
|  Jeff Long|  China|  Laptop|         1,200|
| Carrie Lan|  Italy|Computer|           800|
|  Marry Sig|   null|Computer|           800|
|  Ben Baker| Russia| Printer|           150|
+-----------+-------+--------+--------------+



In [22]:
df.schema

StructType(List(StructField(Client Name,StringType,true),StructField(Country,StringType,true),StructField(Product,StringType,true),StructField(Purchase Price,StringType,true)))

In [6]:
df.printSchema()

root
 |-- Client Name: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Purchase Price: string (nullable = true)



In [7]:
df.columns

['Client Name', 'Country', 'Product', 'Purchase Price']

In [8]:
df.describe()

DataFrame[summary: string, Client Name: string, Country: string, Product: string, Purchase Price: string]

In [9]:
df.describe().show()

+-------+-----------+-------+--------+------------------+
|summary|Client Name|Country| Product|    Purchase Price|
+-------+-----------+-------+--------+------------------+
|  count|         10|      7|      10|                10|
|   mean|       null|   null|    null|492.85714285714283|
| stddev|       null|   null|    null|  304.724700110022|
|    min|  Ben Baker| Brazil|Computer|             1,200|
|    max|  Ron Green|     UK|  Tablet|               800|
+-------+-----------+-------+--------+------------------+



In [10]:
from pyspark.sql.types import (StructField,IntegerType,StructType,StringType)

In [60]:
data_schema=[StructField('Client_Name',StringType(),True),
             StructField('Country',StringType(),True),
             StructField('Product',StringType(),True),
             StructField('Purchase_Price',IntegerType(),True)]

In [61]:
final=StructType(fields=data_schema)

In [62]:
df=spark.read.csv('E:\\ML\\abc\\Notes\\Loadcsv.csv',schema=final,header=True)

In [63]:
df.show()

+-----------+-------+--------+--------------+
|Client_Name|Country| Product|Purchase_Price|
+-----------+-------+--------+--------------+
|  Jon Smith|  Japan|Computer|           800|
|Bill Martin|   null|  Tablet|           450|
| Maria Blue| Canada| Printer|           150|
|    Rita Yu| Brazil|  Laptop|          null|
|    Jack Mo|     UK| Monitor|           300|
|  Ron Green|   null|  Laptop|          null|
|  Jeff Long|  China|  Laptop|          null|
| Carrie Lan|  Italy|Computer|           800|
|  Marry Sig|   null|Computer|           800|
|  Ben Baker| Russia| Printer|           150|
+-----------+-------+--------+--------------+



In [64]:
df.printSchema()

root
 |-- Client_Name: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Purchase_Price: integer (nullable = true)



In [66]:
df['Client_Name']

Column<b'Client_Name'>

In [67]:
type(df['Client_Name'])

pyspark.sql.column.Column

In [68]:
df.select('Client_Name')

DataFrame[Client_Name: string]

In [69]:
df.select('Client_Name').show()

+-----------+
|Client_Name|
+-----------+
|  Jon Smith|
|Bill Martin|
| Maria Blue|
|    Rita Yu|
|    Jack Mo|
|  Ron Green|
|  Jeff Long|
| Carrie Lan|
|  Marry Sig|
|  Ben Baker|
+-----------+



In [70]:
type(df.select('Client_Name'))

pyspark.sql.dataframe.DataFrame

In [41]:
df.head(2)[0]

Row(Client Name='Jon Smith', Country='Japan', Product='Computer', Purchase Price=800)

In [45]:
type(df.head(2))

list

In [44]:
type(df.head(2)[0])

pyspark.sql.types.Row

In [71]:
df.select(['Client_Name','Purchase_Price'])

DataFrame[Client_Name: string, Purchase_Price: int]

In [72]:
df.select(['Client_Name','Purchase_Price']).show()

+-----------+--------------+
|Client_Name|Purchase_Price|
+-----------+--------------+
|  Jon Smith|           800|
|Bill Martin|           450|
| Maria Blue|           150|
|    Rita Yu|          null|
|    Jack Mo|           300|
|  Ron Green|          null|
|  Jeff Long|          null|
| Carrie Lan|           800|
|  Marry Sig|           800|
|  Ben Baker|           150|
+-----------+--------------+



In [73]:
df.withColumn('Discount_price',df['Purchase_Price']/2).show()

+-----------+-------+--------+--------------+--------------+
|Client_Name|Country| Product|Purchase_Price|Discount_price|
+-----------+-------+--------+--------------+--------------+
|  Jon Smith|  Japan|Computer|           800|         400.0|
|Bill Martin|   null|  Tablet|           450|         225.0|
| Maria Blue| Canada| Printer|           150|          75.0|
|    Rita Yu| Brazil|  Laptop|          null|          null|
|    Jack Mo|     UK| Monitor|           300|         150.0|
|  Ron Green|   null|  Laptop|          null|          null|
|  Jeff Long|  China|  Laptop|          null|          null|
| Carrie Lan|  Italy|Computer|           800|         400.0|
|  Marry Sig|   null|Computer|           800|         400.0|
|  Ben Baker| Russia| Printer|           150|          75.0|
+-----------+-------+--------+--------------+--------------+



In [49]:
df.show()

+-----------+-------+--------+--------------+
|Client Name|Country| Product|Purchase Price|
+-----------+-------+--------+--------------+
|  Jon Smith|  Japan|Computer|           800|
|Bill Martin|   null|  Tablet|           450|
| Maria Blue| Canada| Printer|           150|
|    Rita Yu| Brazil|  Laptop|          null|
|    Jack Mo|     UK| Monitor|           300|
|  Ron Green|   null|  Laptop|          null|
|  Jeff Long|  China|  Laptop|          null|
| Carrie Lan|  Italy|Computer|           800|
|  Marry Sig|   null|Computer|           800|
|  Ben Baker| Russia| Printer|           150|
+-----------+-------+--------+--------------+



In [82]:
df.withColumnRenamed('Purchase_Price','Purchase_Price').show()

+-----------+-------+--------+--------------+
|Client_Name|Country| Product|Purchase_Price|
+-----------+-------+--------+--------------+
|  Jon Smith|  Japan|Computer|           800|
|Bill Martin|   null|  Tablet|           450|
| Maria Blue| Canada| Printer|           150|
|    Rita Yu| Brazil|  Laptop|          null|
|    Jack Mo|     UK| Monitor|           300|
|  Ron Green|   null|  Laptop|          null|
|  Jeff Long|  China|  Laptop|          null|
| Carrie Lan|  Italy|Computer|           800|
|  Marry Sig|   null|Computer|           800|
|  Ben Baker| Russia| Printer|           150|
+-----------+-------+--------+--------------+



In [83]:
df.createOrReplaceTempView('Loadcsv')

In [84]:
results=spark.sql("SELECT * FROM Loadcsv").show()

+-----------+-------+--------+--------------+
|Client_Name|Country| Product|Purchase_Price|
+-----------+-------+--------+--------------+
|  Jon Smith|  Japan|Computer|           800|
|Bill Martin|   null|  Tablet|           450|
| Maria Blue| Canada| Printer|           150|
|    Rita Yu| Brazil|  Laptop|          null|
|    Jack Mo|     UK| Monitor|           300|
|  Ron Green|   null|  Laptop|          null|
|  Jeff Long|  China|  Laptop|          null|
| Carrie Lan|  Italy|Computer|           800|
|  Marry Sig|   null|Computer|           800|
|  Ben Baker| Russia| Printer|           150|
+-----------+-------+--------+--------------+



In [85]:
new1=spark.sql("select * from Loadcsv where Purchase_Price>400")

In [87]:
new1.show()

+-----------+-------+--------+--------------+
|Client_Name|Country| Product|Purchase_Price|
+-----------+-------+--------+--------------+
|  Jon Smith|  Japan|Computer|           800|
|Bill Martin|   null|  Tablet|           450|
| Carrie Lan|  Italy|Computer|           800|
|  Marry Sig|   null|Computer|           800|
+-----------+-------+--------+--------------+



In [9]:
df=spark.read.csv("C:\\Users\YogeshR\Downloads\Python-and-Spark-for-Big-Data-master\Spark_DataFrames",header=True,inferSchema=True)

In [10]:
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Adj Close: string (nullable = true)



In [11]:
df.show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
|2010-01-11|212.79999700000002|        213.000002|      

In [13]:
df.head(3)[0]

Row(Date='2010-01-04', Open='213.429998', High='214.499996', Low='212.38000099999996', Close='214.009998', Volume='123432400', Adj Close='27.727039')

In [17]:
df.filter("close<500").select(['Open','Close']).show()

+------------------+------------------+
|              Open|             Close|
+------------------+------------------+
|        213.429998|        214.009998|
|        214.599998|        214.379993|
|        214.379993|        210.969995|
|            211.75|            210.58|
|        210.299994|211.98000499999998|
|212.79999700000002|210.11000299999998|
|209.18999499999998|        207.720001|
|        207.870005|        210.650002|
|210.11000299999998|            209.43|
|210.92999500000002|            205.93|
|        208.330002|        215.039995|
|        214.910006|            211.73|
|        212.079994|        208.069996|
|206.78000600000001|            197.75|
|202.51000200000001|        203.070002|
|205.95000100000001|        205.940001|
|        206.849995|        207.880005|
|        204.930004|        199.289995|
|        201.079996|        192.060003|
|192.36999699999998|        194.729998|
+------------------+------------------+
only showing top 20 rows



In [18]:
df.filter(df['close']<500).select('Volume').show()

+---------+
|   Volume|
+---------+
|123432400|
|150476200|
|138040000|
|119282800|
|111902700|
|115557400|
|148614900|
|151473000|
|108223500|
|148516900|
|182501900|
|153038200|
|152038600|
|220441900|
|266424900|
|466777500|
|430642100|
|293375600|
|311488100|
|187469100|
+---------+
only showing top 20 rows



In [23]:
df.filter((df['close']<200) & (df['open']<200)).show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-02-01|192.36999699999998|             196.0|191.29999899999999|        194.729998|187469100|         25.229131|
|2010-02-02|        195.909998|        196.319994|193.37999299999998|        195.859997|174585600|25.375532999999997|
|2010-02-03|        195.169994|        200.200003|        194.420004|        199.229994|153832000|25.812148999999998|
|2010-02-04|        196.730003|        198.370001|        191.570005|        192.050003|189413000|         24.881912|
|2010-02-05|192.63000300000002|             196.0|        190.850002|        195.460001|212576700|25.323710000000002|
|2010-02-08|        195.690006|197.88000300000002|      

In [35]:
result=df.filter(df['Low']==197.16).collect()

In [36]:
result

[Row(Date='2010-01-22', Open='206.78000600000001', High='207.499996', Low='197.16', Close='197.75', Volume='220441900', Adj Close='25.620401')]

In [38]:
row=result[0]

In [40]:
row.asDict()['Volume']

'220441900'

In [44]:
df=spark.read.csv("C:\\Users\YogeshR\Downloads\Python-and-Spark-for-Big-Data-master\Spark_DataFrames\sales_info.csv",header=True,inferSchema=True)

In [45]:
df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [46]:
df.groupBy("Company")

<pyspark.sql.group.GroupedData at 0x2381555e670>

In [48]:
df.groupBy("Company").mean().show()

+-------+-----------------+
|Company|       avg(Sales)|
+-------+-----------------+
|   APPL|            370.0|
|   GOOG|            220.0|
|     FB|            610.0|
|   MSFT|322.3333333333333|
+-------+-----------------+



In [50]:
df.groupBy("Company").sum().show()

+-------+----------+
|Company|sum(Sales)|
+-------+----------+
|   APPL|    1480.0|
|   GOOG|     660.0|
|     FB|    1220.0|
|   MSFT|     967.0|
+-------+----------+



In [51]:
df.groupBy("Company").count().show()

+-------+-----+
|Company|count|
+-------+-----+
|   APPL|    4|
|   GOOG|    3|
|     FB|    2|
|   MSFT|    3|
+-------+-----+



In [52]:
df.agg({'sales':'sum'}).show()

+----------+
|sum(sales)|
+----------+
|    4327.0|
+----------+



In [53]:
df.agg({'Sales':'max'}).show()

+----------+
|max(Sales)|
+----------+
|     870.0|
+----------+



In [54]:
group_data=df.groupBy("Company")

In [55]:
group_data.agg({'Sales':'max'}).show()

+-------+----------+
|Company|max(Sales)|
+-------+----------+
|   APPL|     750.0|
|   GOOG|     340.0|
|     FB|     870.0|
|   MSFT|     600.0|
+-------+----------+



In [58]:
from pyspark.sql.functions import count,countDistinct,stddev,avg

In [61]:
df.select(avg('Sales').alias('Avg')).show()

+-----------------+
|              Avg|
+-----------------+
|360.5833333333333|
+-----------------+



In [63]:
df.select(stddev('Sales').alias('Stddev_Sales')).show()

+------------------+
|      Stddev_Sales|
+------------------+
|250.08742410799007|
+------------------+



In [64]:
from pyspark.sql.functions import format_number

In [65]:
sales_std=df.select(stddev('Sales').alias('stddev_Sales'))

In [68]:
sales_std.select(format_number('stddev_Sales',2).alias('final')).show()

+------+
| final|
+------+
|250.09|
+------+



In [69]:
df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [70]:
df.orderBy('Sales').show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|Charlie|120.0|
|   MSFT|    Amy|124.0|
|   APPL|  Linda|130.0|
|   GOOG|    Sam|200.0|
|   MSFT|Vanessa|243.0|
|   APPL|   John|250.0|
|   GOOG|  Frank|340.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   MSFT|   Tina|600.0|
|   APPL|   Mike|750.0|
|     FB|   Carl|870.0|
+-------+-------+-----+



In [71]:
df.orderBy(df['Sales'].desc()).show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|     FB|   Carl|870.0|
|   APPL|   Mike|750.0|
|   MSFT|   Tina|600.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   GOOG|  Frank|340.0|
|   APPL|   John|250.0|
|   MSFT|Vanessa|243.0|
|   GOOG|    Sam|200.0|
|   APPL|  Linda|130.0|
|   MSFT|    Amy|124.0|
|   GOOG|Charlie|120.0|
+-------+-------+-----+



In [72]:
df=spark.read.csv("C:\\Users\YogeshR\Downloads\Python-and-Spark-for-Big-Data-master\Spark_DataFrames\ContainsNull.csv",header=True,inferSchema=True)

In [73]:
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [75]:
df.na.drop(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [76]:
df.na.drop(how='any').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [77]:
df.na.drop(how='all').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [78]:
df.na.drop(subset='Sales').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [79]:
df.na.fill('Fill Value').show()

+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John| null|
|emp2|Fill Value| null|
|emp3|Fill Value|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



In [80]:
df.na.fill(0).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [81]:
df.na.fill('No Name',subset=['Name']).show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|No Name| null|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [82]:
from pyspark.sql.functions import mean

In [88]:
mean_val=df.select(mean(df['sales'])).collect()

In [89]:
mean_val

[Row(avg(sales)=400.5)]

In [90]:
mean_val[0]

Row(avg(sales)=400.5)

In [92]:
mean_sales=mean_val[0][0]

In [93]:
df.na.fill(mean_sales,['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [94]:
df.na.fill(df.select(mean(df['Sales'])).collect()[0][0],['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [6]:
df=spark.read.csv("E:\\ML\\abc\\Notes\\New folder\\Python-and-Spark-for-Big-Data-master\\Python-and-Spark-for-Big-Data-master\\Spark_DataFrames\\appl_stock.csv",header=True,inferSchema=True)

In [7]:
df.show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
|2010-01-11|212.79999700000002|        213.000002|      

In [8]:
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [9]:
df.head(1)

[Row(Date='2010-01-04', Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039)]

In [10]:
df.show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
|2010-01-11|212.79999700000002|        213.000002|      

In [176]:
df.select(['Date','Open']).show()

+----------+----------+
|      Date|      Open|
+----------+----------+
|04-01-2010|213.429998|
|05-01-2010|214.599998|
|06-01-2010|214.379993|
|07-01-2010|    211.75|
|08-01-2010|210.299994|
|11-01-2010|212.799997|
|12-01-2010|209.189995|
|13-01-2010|207.870005|
|14-01-2010|210.110003|
|15-01-2010|210.929995|
|19-01-2010|208.330002|
|20-01-2010|214.910006|
|21-01-2010|212.079994|
|22-01-2010|206.780006|
|25-01-2010|202.510002|
|26-01-2010|205.950001|
|27-01-2010|206.849995|
|28-01-2010|204.930004|
|29-01-2010|201.079996|
|01-02-2010|192.369997|
+----------+----------+
only showing top 20 rows



In [11]:
from pyspark.sql.functions import (dayofmonth,hour,dayofyear,month,year,weekofyear,
                                   format_number,date_format,avg)

In [12]:
df.select(dayofmonth('Date')).show()

+----------------+
|dayofmonth(Date)|
+----------------+
|               4|
|               5|
|               6|
|               7|
|               8|
|              11|
|              12|
|              13|
|              14|
|              15|
|              19|
|              20|
|              21|
|              22|
|              25|
|              26|
|              27|
|              28|
|              29|
|               1|
+----------------+
only showing top 20 rows



In [13]:
df.select(hour('Date')).show()

+----------+
|hour(Date)|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
+----------+
only showing top 20 rows



In [15]:
df.select(month('Date')).show()

+-----------+
|month(Date)|
+-----------+
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          2|
+-----------+
only showing top 20 rows



In [17]:
#df.select(year(df['Date'])).show()
newf=df.withColumn("Year",year(df["Date"]))

In [18]:
#newdf.groupBy("Year").mean().show()
newf.show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+----+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|Year|
+----------+------------------+------------------+------------------+------------------+---------+------------------+----+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|2010|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|2010|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|2010|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|2010|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|2010|
|2010-01-11|212.

In [21]:
result=newf.groupBy("Year").mean().select('Year','avg(Close)')

In [23]:
abc=result.withColumnRenamed("avg(Close)","Average Year For Closing")

In [27]:
abc.select(['Year',format_number("Average Year For Closing",2).alias("Avg Year Closing")]).show()

+----+----------------+
|Year|Avg Year Closing|
+----+----------------+
|2015|          120.04|
|2013|          472.63|
|2014|          295.40|
|2012|          576.05|
|2016|          104.60|
|2010|          259.84|
|2011|          364.00|
+----+----------------+

