In [11]:
import findspark
import py4j
# Raw string: "\l" and "\s" are not valid escape sequences, so the plain
# literal triggers an invalid-escape warning on recent Python versions.
# The resulting path is unchanged.
findspark.init(r"D:\logiciels\spark")

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

In [2]:
# Create the Spark entry points from a single configuration.
# Previously a SparkContext was built first with master='local' and
# appName='myAppName'; the builder below then reused that context, so its
# own master/appName settings never took effect. Constructing SparkContext
# directly also fails when the cell is re-run in the same kernel.
# getOrCreate() is safe to re-run and returns the existing session.
conf = SparkConf().set('spark.driver.host', '127.0.0.1')
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("CreateTable")
    .config(conf=conf)
    .getOrCreate()
)
# Low-level context used by the cells below for sc.parallelize(...).
sc = spark.sparkContext

In [3]:
# Build the Adresse DataFrame (two string columns: id, adress), display it,
# and register it for Spark SQL under the temp view name "Adresse_sql".
adresse_rows = [
    ["75", "paris"],
    ["51", "reims"],
    ["06", "nice"],
]
adresse_columns = ("id", "adress")
Adresse = sc.parallelize(adresse_rows).toDF(adresse_columns)
Adresse.show()
Adresse.createOrReplaceTempView("Adresse_sql")

+---+------+
| id|adress|
+---+------+
| 75| paris|
| 51| reims|
| 06|  nice|
+---+------+



In [4]:
# Build the Prix DataFrame (string columns: IDM, Prix), display it, and
# register it for Spark SQL under the temp view name "Prix_sql".
prix_rows = [
    ["1", "10"],
    ["2", "20"],
    ["3", "70"],
]
prix_columns = ("IDM", "Prix")
Prix = sc.parallelize(prix_rows).toDF(prix_columns)
Prix.show()
Prix.createOrReplaceTempView("Prix_sql")

+---+----+
|IDM|Prix|
+---+----+
|  1|  10|
|  2|  20|
|  3|  70|
+---+----+



In [5]:
# Build the Produit DataFrame (string columns: IDM, Marque, Produit),
# display it, and register it for Spark SQL as the temp view "Produit_sql".
produit_rows = [
    ["1", "Sony", "souris"],
    ["2", "Samsung", "clavier"],
    ["3", "Apple", "ecran"],
]
produit_columns = ("IDM", "Marque", "Produit")
Produit = sc.parallelize(produit_rows).toDF(produit_columns)
Produit.show()
Produit.createOrReplaceTempView("Produit_sql")

+---+-------+-------+
|IDM| Marque|Produit|
+---+-------+-------+
|  1|   Sony| souris|
|  2|Samsung|clavier|
|  3|  Apple|  ecran|
+---+-------+-------+



In [67]:
# Build the Vente DataFrame, one row per sale line. Every field is a string:
# Article, Agence, Client, Quantite, Annee. Display it and register it for
# Spark SQL as the temp view "Vente_sql".
vente_rows = [
    ["3", "75", "toto", "2", "2021"],
    ["2", "75", "toto", "1", "2020"],
    ["1", "51", "toto", "5", "2021"],
    ["2", "51", "fofo", "6", "2021"],
]
vente_columns = ("Article", "Agence", "Client", "Quantite", "Annee")
Vente = sc.parallelize(vente_rows).toDF(vente_columns)
Vente.show()
Vente.createOrReplaceTempView("Vente_sql")

+-------+------+------+--------+-----+
|Article|Agence|Client|Quantite|Annee|
+-------+------+------+--------+-----+
|      3|    75|  toto|       2| 2021|
|      2|    75|  toto|       1| 2020|
|      1|    51|  toto|       5| 2021|
|      2|    51|  fofo|       6| 2021|
+-------+------+------+--------+-----+



# Déterminer les prix de chaque produit

In [7]:
#SQL
# Price of each product: every product row is kept (LEFT JOIN), with the
# price attached when a matching IDM exists in Prix_sql.
spark.sql("""SELECT Marque, Produit, Prix from Produit_sql
    left join Prix_sql
    on Produit_sql.IDM=Prix_sql.IDM""").show()

#DSL
# Same query with the DataFrame API. The join type is set to "left" so it
# matches the SQL above (DataFrame.join defaults to an inner join), and the
# output columns are selected explicitly instead of dropping the join key.
Produit.join(Prix, on="IDM", how="left")\
       .select("Marque", "Produit", "Prix")\
       .show()

+-------+-------+----+
| Marque|Produit|Prix|
+-------+-------+----+
|  Apple|  ecran|  70|
|   Sony| souris|  10|
|Samsung|clavier|  20|
+-------+-------+----+

+-------+-------+----+
| Marque|Produit|Prix|
+-------+-------+----+
|  Apple|  ecran|  70|
|   Sony| souris|  10|
|Samsung|clavier|  20|
+-------+-------+----+



# Déterminer les articles que toto a achetés en 2021

In [14]:
# SQL
# Sales rows for client 'toto' in year '2021', each left-joined to the
# product whose Idm equals the sold Article.
toto_2021_query = """select  Client,Idm, Annee, Marque from vente_sql 
    left join produit_sql 
    on vente_sql.Article = produit_sql.Idm 
    where client = 'toto' 
    and annee = '2021' """
spark.sql(toto_2021_query).show()

+---+------+-----+------+
|Idm|Client|Annee|Marque|
+---+------+-----+------+
|  3|  toto| 2021| Apple|
|  1|  toto| 2021|  Sony|
+---+------+-----+------+



In [13]:
# DSL
# Join each sale to its product, keep the four displayed columns, then
# retain only the rows for client "toto" in year "2021".
is_toto_in_2021 = (col("Annee") == "2021") & (col("Client") == "toto")
(
    Vente.join(Produit, col("Article") == col("Idm"))
    .select("Client", "Annee", "Idm", "Marque")
    .filter(is_toto_in_2021)
    .show()
)

+------+-----+---+------+
|Client|Annee|Idm|Marque|
+------+-----+---+------+
|  toto| 2021|  3| Apple|
|  toto| 2021|  1|  Sony|
+------+-----+---+------+



# Montant dépensé par toto par agence

In [8]:
#SQL
# Amount spent by toto per agency: sum of price * quantity over his sales.
# - Prix and Quantite are stored as strings, so they are cast explicitly
#   rather than relying on implicit coercion in the multiplication.
# - The client name uses a single-quoted SQL string literal; double quotes
#   can be interpreted as an identifier depending on the SQL dialect settings.
spark.sql("""SELECT sum(CAST(prix AS DOUBLE) * CAST(Quantite AS DOUBLE)) as prixtot, Agence, Client
        from Vente_sql
        left join Prix_sql
        on Vente_sql.Article=Prix_sql.IDM
        WHERE Vente_sql.client = 'toto'
        GROUP BY Vente_sql.Agence, vente_sql.Client""").show()

+-------+------+------+
|prixtot|Agence|Client|
+-------+------+------+
|   50.0|    51|  toto|
|  160.0|    75|  toto|
+-------+------+------+



In [12]:
#DSL
# Amount spent by toto per agency, with the DataFrame API.
# The result is stored in `df` and displayed in a separate statement:
# .show() returns None, so chaining it into the assignment left `df`
# bound to None instead of the aggregated DataFrame.
df = (
    Vente.join(Prix, col("Article") == col("Idm"))
    .filter(col("client") == "toto")
    # .cast() binds to Prix only; Quantite is used as the string column
    # defined in Vente.
    .withColumn("PrixTotal", col("Quantite") * col("Prix").cast(IntegerType()))
    .groupBy("Agence")
    .sum("PrixTotal")
)
df.show()

+------+--------------+
|Agence|sum(PrixTotal)|
+------+--------------+
|    51|          50.0|
|    75|         160.0|
+------+--------------+



## Déterminer le CA pour chaque agence en 2021

In [17]:
#SQL
# 2021 revenue per agency: sum of price * quantity over the 2021 sales.
# - Prix and Quantite are strings, so both are cast explicitly.
# - Annee is a string column and is compared to a single-quoted SQL string
#   literal; double quotes can be read as an identifier depending on the
#   SQL dialect settings.
spark.sql("""SELECT Agence, sum(CAST(prix AS DOUBLE) * CAST(quantite AS DOUBLE)) as prixtot
        from Vente_sql
        left join Prix_sql
        on Vente_sql.Article=Prix_sql.IDM
        WHERE Vente_sql.annee = '2021'
        GROUP BY Vente_sql.Agence """).show()

+------+-------+
|Agence|prixtot|
+------+-------+
|    51|  170.0|
|    75|  140.0|
+------+-------+



In [16]:
#DSL
# 2021 revenue per agency, with the DataFrame API.
# - Annee is a string column, so it is compared to the string "2021".
# - The alias is applied once, on the aggregate; the previous inner
#   .alias("Tot") was attached to the Prix operand only and had no effect.
# - Both string operands are cast explicitly before multiplying.
Vente.join(Prix, col("Article") == col("Idm"))\
    .filter(col("Annee") == "2021")\
    .groupBy(col("agence"))\
    .agg(F.sum(col("quantite").cast("double") * col("Prix").cast("double")).alias("Tot"))\
    .show()

+------+-----+
|agence|  Tot|
+------+-----+
|    51|170.0|
|    75|140.0|
+------+-----+



## 8- Quel est le produit le plus acheté en 2021 ? 

In [18]:
# SQL
# Most purchased product in 2021: total the quantities sold per product
# within 2021 and keep the product with the largest total.
# The previous version compared each sale line to MAX(Quantite) computed
# over ALL years and on the string column, so:
#   - a larger quantity sold in another year produced an empty result;
#   - the maximum was lexicographic (e.g. '6' > '10');
#   - several smaller sales of the same product were never added up.
spark.sql("""SELECT Produit, SUM(CAST(Quantite AS INT)) AS Quantite_totale
        FROM Vente_sql LEFT JOIN Produit_sql
        ON Vente_sql.Article = Produit_sql.Idm
        WHERE Annee = '2021'
        GROUP BY Produit
        ORDER BY Quantite_totale DESC
        LIMIT 1""").show()

+-------+--------+
|Produit|Quantite|
+-------+--------+
|clavier|       6|
+-------+--------+



In [83]:
#DSL
# Most purchased product in 2021, with the DataFrame API.
# Quantite is a string column: it is cast to an integer and summed per
# product so the ranking is numeric and covers all 2021 sales of a product.
# The previous max/sort on the raw column ordered values lexicographically
# and only looked at the largest single sale line.
Vente.join(Produit, col("Article") == col("Idm"))\
    .filter(col("Annee") == "2021")\
    .groupBy(col("Produit"))\
    .agg(F.sum(col("Quantite").cast(IntegerType())).alias("Quantite_totale"))\
    .sort(col("Quantite_totale"), ascending=False)\
    .limit(1)\
    .show()

+-------+-------------+
|Produit|max(Quantite)|
+-------+-------------+
|clavier|            6|
+-------+-------------+



## 9- Quel est le client qui a acheté le plus d’articles ?

In [87]:
# SQL
# Client with the largest total quantity: group the sales by client, order
# by the summed quantity (cast to DOUBLE) in descending order, keep the first.
top_buyer_query = """Select Client from Vente_SQL group by Client order by sum(CAST(Quantite AS DOUBLE)) desc LIMIT 1"""
spark.sql(top_buyer_query).show()

+------+
|Client|
+------+
|  toto|
+------+



In [85]:
#DSL
# Client with the largest total quantity, with the DataFrame API.
# - Quantite is a string column, so it is cast to double before summing.
# - The alias is applied once, on the aggregate; the previous inner
#   .alias("Nb_Article") on the column inside sum() had no effect.
Vente.groupBy(col("Client"))\
    .agg(F.sum(col("quantite").cast("double")).alias("Nb_Article"))\
    .sort(col("Nb_Article"), ascending=False)\
    .limit(1)\
    .show()

+------+----------+
|Client|Nb_Article|
+------+----------+
|  toto|       8.0|
+------+----------+



## 10- Quel est le client qui a dépensé le plus d’argent ?

In [88]:
#SQL
# Client with the highest spend: join each sale to its price, sum
# prix*Quantite per client, order by that total descending, keep the first.
top_spender_query = """SELECT Client, sum(prix*Quantite) as depense
        from Vente_sql
        left join Prix_sql
        on Vente_sql.Article=Prix_sql.IDM 
        GROUP BY vente_sql.Client
        ORDER BY depense desc LIMIT 1"""
spark.sql(top_spender_query).show()

+------+-------+
|Client|depense|
+------+-------+
|  toto|  210.0|
+------+-------+



In [94]:
#DSL
# Client with the highest spend, with the DataFrame API.
# - The alias is applied once, on the aggregate; the previous inner
#   .alias("dépense") was attached to the Prix operand only and had no effect.
# - Quantite and Prix are string columns, so both are cast explicitly
#   before multiplying.
Vente.join(Prix, col("Article") == col("Idm"))\
    .groupBy(col("Client"))\
    .agg(F.sum(col("quantite").cast("double") * col("Prix").cast("double")).alias("dépense"))\
    .sort(col("dépense"), ascending=False)\
    .limit(1)\
    .show()

+------+-------+
|Client|dépense|
+------+-------+
|  toto|  210.0|
+------+-------+

