In [199]:
## Import 

import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql.functions import format_number
from pyspark.sql.functions import max, min
from pyspark.sql.functions import mean
from pyspark.sql.functions import corr
from pyspark.sql.functions import year
from pyspark.sql.functions import month

In [200]:
## 1) Start a simple Spark Session

spark = SparkSession.builder.appName('walmart_stock').getOrCreate()
sc = spark.sparkContext # Lancement de la session Spark

In [201]:
## 2) Load the Walmart Stock CSV File

df = spark.read.option("header", True).csv(r"C:\Users\Hosti\Desktop\walmart_stock.csv")
df.show(3) # On charge la base de données et on lance les trois premières lignes pour être sûre que tout fonctionne

df.createOrReplaceTempView('Table')

spark.sql("""SELECT * FROM Table""").show() 

+----------+------------------+---------+---------+------------------+--------+------------------+
|      Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|
+----------+------------------+---------+---------+------------------+--------+------------------+
|2012-01-03|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|
+----------+------------------+---------+---------+------------------+--------+------------------+
only showing top 3 rows

+----------+------------------+------------------+------------------+------------------+--------+------------------+
|      Date|              Open|              High|               Low|             Close|  Volume|         Adj Close|
+----------+------------------+-----------------

In [202]:
## 3) What are the column names ?

print("Les noms des colonnes sont :", df.columns) # On cherche les noms de colonnes


Les noms des colonnes sont : ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']


In [203]:
## 4) What does the Schema look like ?

df.printSchema() 

root
 |-- Date: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Adj Close: string (nullable = true)



In [241]:
## 5) Create a new dataframe with a column called HV_Ratio that is the ratio of the High Priceversus volume of stock traded for a day

df_ratio = df.withColumn('HV_Ratio', df['High']/df['Volume']).select(['HV_Ratio'])
df_ratio.show()

spark.sql("""SELECT High/Volume as HV_Ratio FROM Table""").show() 

+--------------------+
|            HV_Ratio|
+--------------------+
|4.819714653321546E-6|
|6.290848613094555E-6|
|4.669412994783916E-6|
|7.367338463826307E-6|
|8.915604778943901E-6|
|8.644477436914568E-6|
|9.351828421515645E-6|
| 8.29141562102703E-6|
|7.712212102001476E-6|
|7.071764823529412E-6|
|1.015495466386981E-5|
|6.576354146362592...|
| 5.90145296180676E-6|
|8.547679455011844E-6|
|8.420709512685392E-6|
|1.041448341728929...|
|8.316075414862431E-6|
|9.721183814992126E-6|
|8.029436027707578E-6|
|6.307432259386365E-6|
+--------------------+
only showing top 20 rows

+--------------------+
|            HV_Ratio|
+--------------------+
|4.819714653321546E-6|
|6.290848613094555E-6|
|4.669412994783916E-6|
|7.367338463826307E-6|
|8.915604778943901E-6|
|8.644477436914568E-6|
|9.351828421515645E-6|
| 8.29141562102703E-6|
|7.712212102001476E-6|
|7.071764823529412E-6|
|1.015495466386981E-5|
|6.576354146362592...|
| 5.90145296180676E-6|
|8.547679455011844E-6|
|8.420709512685392E-6|
|1.04144

In [205]:
## 6) What day had the Peak High in Price?

print(u"Le jour avec le prix le plus élevé est le :", df.orderBy(df['High'].desc()).select(['Date']).head(1)[0]['Date'])

spark.sql("""SELECT MAX(High) FROM Table""").show()


Le jour avec le prix le plus élevé est le : 2015-01-13
+---------+
|max(High)|
+---------+
|90.970001|
+---------+



In [206]:
## 7) What is the mean of the Close column?

df.select(mean('Close')).show()

spark.sql("""SELECT MEAN(Close) as Moyenne FROM Table""").show()

+-----------------+
|       avg(Close)|
+-----------------+
|72.38844998012726|
+-----------------+

+-----------------+
|          Moyenne|
+-----------------+
|72.38844998012726|
+-----------------+



In [207]:
## 8) What is the max and min of the Volume column?

print("What is the max and min of the Volume column?")
df.select(max('Volume'),min('Volume')).show()

spark.sql("""SELECT MAX(HIGH) as MAX, MIN(Volume) as MIN FROM Table""").show()

What is the max and min of the Volume column?
+-----------+-----------+
|max(Volume)|min(Volume)|
+-----------+-----------+
|    9994400|   10010500|
+-----------+-----------+

+---------+--------+
|      MAX|     MIN|
+---------+--------+
|90.970001|10010500|
+---------+--------+



In [208]:
## 9) How many days was the Close lower than 60 dollars?

days = df.filter(df['Close'] < 60).count()
print("Il y avait", days ,"jours durant lesquels la valeur Close était inférieur à 60.")

spark.sql("""SELECT COUNT(Date) FROM Table WHERE Close < 60""").show()

Il y avait 81 jours durant lesquels la valeur Close était inférieur à 60.
+-----------+
|count(Date)|
+-----------+
|         81|
+-----------+



In [238]:
## 10) What percentage of the time was the High greater than 80 dollars ?

percent = df.filter('High > 80').count() * 100/df.count()
print("Il y avait", round(percent, 2), "% de temps durant lequel la valeur High était supérieur à 80 dollars.")

spark.sql("""SELECT High_sum = (SELECT COUNT(High) FROM Table WHERE High > 80) , High_sum/COUNT(High) as High2 FROM Table""").show()

Il y avait 8.43 % de temps durant lequel la valeur High était supérieur à 80 dollars.


AnalysisException: cannot resolve '`High_sum`' given input columns: [table.Adj Close, table.Close, table.Date, table.High, table.Low, table.Open, table.Volume]; line 1 pos 7;
'Aggregate [unresolvedalias(('High_sum = scalar-subquery#3381 []), None), ('High_sum / count(High#2589)) AS High2#3382]
:  +- Aggregate [count(High#2589) AS count(High)#3385L]
:     +- Filter (cast(High#2589 as int) > 80)
:        +- SubqueryAlias table
:           +- Relation[Date#2587,Open#2588,High#2589,Low#2590,Close#2591,Volume#2592,Adj Close#2593] csv
+- SubqueryAlias table
   +- Relation[Date#2587,Open#2588,High#2589,Low#2590,Close#2591,Volume#2592,Adj Close#2593] csv


In [212]:
## 11) What is the max High per year?

spark.sql("""SELECT YEAR(Date) as Year, MAX(High) as Max_High FROM Table GROUP BY Year ORDER BY Year""").show()

+----+---------+
|Year| Max_High|
+----+---------+
|2012|77.599998|
|2013|81.370003|
|2014|88.089996|
|2015|90.970001|
|2016|75.190002|
+----+---------+



In [213]:
## 12) What is the average Close for each Calendar Month?

spark.sql("""SELECT MONTH(Date), AVG(Close) as AVG_Close FROM Table GROUP BY MONTH(Date) ORDER BY MONTH(Date)""").show()

+-------------------------+-----------------+
|month(CAST(Date AS DATE))|        AVG_Close|
+-------------------------+-----------------+
|                        1|71.44801958415842|
|                        2|  71.306804443299|
|                        3|71.77794377570092|
|                        4|72.97361900952382|
|                        5|72.30971688679247|
|                        6| 72.4953774245283|
|                        7|74.43971943925233|
|                        8|73.02981855454546|
|                        9|72.18411785294116|
|                       10|71.57854545454543|
|                       11| 72.1110893069307|
|                       12|72.84792478301885|
+-------------------------+-----------------+

