In [1]:
# Evaluate `sqlContext` so the notebook displays its repr. The name is not defined
# anywhere in this notebook; presumably it is injected by the PySpark kernel/shell — TODO confirm.
sqlContext

<pyspark.sql.context.SQLContext at 0x7fb83a8a80f0>

In [3]:
# Read a plain-text file into a DataFrame (the output below shows one row per line,
# stored in a single `value` column)
df = sqlContext.read.text("/home/vagrant/book.txt")

In [6]:
# Fetch only the first 3 rows as a list of Row objects (this does NOT return every line)
df.take(3)


[Row(value='The Project Gutenberg eBook of Beware, The Usurpers!, by Geoff St. Reynard'),
 Row(value=''),
 Row(value='This eBook is for the use of anyone anywhere in the United States and')]

In [7]:
# Read a structured JSON file into a DataFrame.
# Uses `sqlContext`, the same entry point every other cell in this notebook relies on,
# instead of the otherwise-unreferenced `spark` name (which may be undefined in the kernel).
people = sqlContext.read.json("/usr/local/spark/examples/src/main/resources/people.json")

In [9]:
# Display the DataFrame's contents, then its schema
people.show()
people.printSchema()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [14]:
# Display all columns
people.select("*").show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [9]:
# Display only the selected column (`age`)
people.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [21]:
# Transform a column: keep `name` and add 10 to `age` (null ages stay null, as the
# output below shows).
# `select` returns a new DataFrame rather than modifying `people`, so the result must be
# bound to a name; otherwise it is discarded and `df.show()` displays an unrelated DataFrame.
df = people.select(people.name, (people.age + 10).alias('age'))
df.show()

+-------+----+
|   name| age|
+-------+----+
|Michael|null|
|   Andy|  40|
| Justin|  29|
+-------+----+



In [22]:
# Build a plain Python list of (name, age) tuples
data = [
    ('Alice', 1),
    ('Bob', 2),
]

In [25]:
# Build a DataFrame from the list, supplying the column names explicitly,
# then display its contents and its schema
df=sqlContext.createDataFrame(data,['name','age'])
df.show()
df.printSchema()

+-----+---+
| name|age|
+-----+---+
|Alice|  1|
|  Bob|  2|
+-----+---+

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [26]:
# Create a UDF (user-defined function)
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf

# Doubles an integer column value. Null inputs are passed through as null instead of
# raising a TypeError (`None * 2`) inside the Python worker.
doubled = udf(lambda s: None if s is None else s * 2, IntegerType())

# Apply the UDF to `age`, keeping the original column name via the alias.
df2 = df.select(df.name, doubled(df.age).alias('age'))
df2.show()

+-----+---+
| name|age|
+-----+---+
|Alice|  2|
|  Bob|  4|
+-----+---+



In [24]:
# Apply a filter: keep only the rows whose `age` is greater than 3
df3=df2.filter(df2.age>3)
df3.show()

+----+---+
|name|age|
+----+---+
| Bob|  4|
+----+---+



In [27]:
# Remove duplicate rows from a DataFrame
data2 = [
    ('Alice', 1),
    ('Bob', 2),
    ('Bob', 2),
]
df = sqlContext.createDataFrame(data2, ['name', 'age'])
df.show()  # still contains the duplicated ('Bob', 2) row
df2 = df.distinct()
df2.show()  # each distinct row appears once

+-----+---+
| name|age|
+-----+---+
|Alice|  1|
|  Bob|  2|
|  Bob|  2|
+-----+---+

+-----+---+
| name|age|
+-----+---+
|  Bob|  2|
|Alice|  1|
+-----+---+



In [28]:
# Sort the DataFrame by the `age` column
df3=df2.sort('age')
df3.show()

+-----+---+
| name|age|
+-----+---+
|Alice|  1|
|  Bob|  2|
+-----+---+



In [34]:
# Group by city and average the numeric columns
data = [
    ('Alice', 'Paris', 30),
    ('Elie', 'Lille', 28),
    ('Joe', 'Lyon', 39),
    ('Franc', 'Lyon', 47),
]
df = sqlContext.createDataFrame(data, ['nom', 'ville', 'age'])
df.show()
df.groupBy(df.ville).avg().show()

+-----+-----+---+
|  nom|ville|age|
+-----+-----+---+
|Alice|Paris| 30|
| Elie|Lille| 28|
|  Joe| Lyon| 39|
|Franc| Lyon| 47|
+-----+-----+---+

+-----+--------+
|ville|avg(age)|
+-----+--------+
|Lille|    28.0|
|Paris|    30.0|
| Lyon|    43.0|
+-----+--------+



In [37]:
# Build a (name, age, grade) DataFrame and average every numeric column per name
data = [
    ('Alice', 1, 6),
    ('Bob', 2, 8),
    ('Alice', 3, 9),
    ('Bob', 4, 7),
]
df = sqlContext.createDataFrame(data, ['name', 'age', 'grade'])
df.groupBy('name').avg().show()

+-----+--------+----------+
| name|avg(age)|avg(grade)|
+-----+--------+----------+
|  Bob|     3.0|       7.5|
|Alice|     2.0|       7.5|
+-----+--------+----------+



In [38]:
# Average of the named columns per `name` group, collected to the driver as a list of Row objects
df.groupBy('name').avg('age','grade').collect()

[Row(name=u'Bob', avg(age)=3.0, avg(grade)=7.5),
 Row(name=u'Alice', avg(age)=2.0, avg(grade)=7.5)]

In [25]:
# Average of a single column (`age`) per `name` group
df.groupBy('name').avg('age').collect()

[Row(name=u'Bob', avg(age)=3.0), Row(name=u'Alice', avg(age)=2.0)]

In [26]:
# groupBy() with no keys aggregates over the whole DataFrame (a single result row)
df.groupBy().avg('age','grade').collect()

[Row(avg(age)=2.5, avg(grade)=7.5)]

In [27]:
# Whole-DataFrame average of a single column (`age`)
df.groupBy().avg('age').collect()

[Row(avg(age)=2.5)]

In [28]:
# avg() with no arguments: the output shows averages for `age` and `grade` (the string column `name` is absent)
df.groupBy().avg().collect()

[Row(avg(age)=2.5, avg(grade)=7.5)]

In [29]:
# Minimum of the named columns per `name` group
df.groupBy('name').min('age','grade').collect()

[Row(name=u'Bob', min(age)=2, min(grade)=7),
 Row(name=u'Alice', min(age)=1, min(grade)=6)]