# Práctica Spark SQL

In [1]:
import os

import findspark

# findspark must run before pyspark is imported so the Spark libraries are on
# sys.path. SPARK_HOME is honoured first so the notebook is not tied to one
# machine; the original local installation path is kept as the fallback.
findspark.init(os.environ.get("SPARK_HOME") or "/home/usuario/Escritorio/spark-2.0.2")

import pyspark
from pyspark.sql import SparkSession

# Build (or reuse) a local session that uses every available core, and expose
# the underlying SparkContext for the RDD-based cells further down.
spark = (SparkSession.builder
    .master("local[*]")
    .config("spark.driver.cores", 1)
    .appName("understanding_sparksession")
    .getOrCreate() )
sc = spark.sparkContext
print(spark)
print(sc)

<pyspark.sql.session.SparkSession object at 0x7f9bd1b18be0>
<pyspark.context.SparkContext object at 0x7f9bf8d632b0>


## Lectura de eventos

En primer lugar, definimos los tipos de datos específicos para cada campo con un esquema personalizado.

In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Explicit schema for the events CSV: one (column name, Spark type) pair per
# field, in file order. Every column is declared nullable.
customSchema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("Id", LongType()),
        ("Parent_sys_id", StringType()),
        ("Source", StringType()),
        ("Mentions", StringType()),
        ("Target", StringType()),
        ("Name_source", StringType()),
        ("Body", StringType()),
        ("Pub_date", TimestampType()),
        ("URLs", StringType()),
        ("Tipe_action", StringType()),
        ("Link", StringType()),
        ("Has_link", ByteType()),
        ("Has_picture", ByteType()),
        ("Website", StringType()),
        ("Country", StringType()),
        ("Activity", LongType()),
        ("Followers", LongType()),
        ("Following", LongType()),
        ("Location", StringType()),
    ]
])

In [3]:
# Load the events CSV with the custom schema. The first line of the file is
# treated as a header, and timestamps are parsed as day/month/year hour:minute.
events = (
    spark.read
    .schema(customSchema)
    .option("header", True)
    .option("timestampFormat", "dd/MM/yyyy HH:mm")
    .csv("data/DATASETMotoGP-Qatar.csv")
)

In [4]:
# Print the schema of the loaded DataFrame to check the column names and types.
events.printSchema()

root
 |-- Id: long (nullable = true)
 |-- Parent_sys_id: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Mentions: string (nullable = true)
 |-- Target: string (nullable = true)
 |-- Name_source: string (nullable = true)
 |-- Body: string (nullable = true)
 |-- Pub_date: timestamp (nullable = true)
 |-- URLs: string (nullable = true)
 |-- Tipe_action: string (nullable = true)
 |-- Link: string (nullable = true)
 |-- Has_link: byte (nullable = true)
 |-- Has_picture: byte (nullable = true)
 |-- Website: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Activity: long (nullable = true)
 |-- Followers: long (nullable = true)
 |-- Following: long (nullable = true)
 |-- Location: string (nullable = true)



## A) Contabilizar el número total de menciones a los pilotos Marc Márquez, Valentino Rossi y Dani Pedrosa.

In [5]:
# Rows whose Mentions field matches Marc Márquez's handle (regular-expression
# search), followed by the number of such rows.
# NOTE(review): a regex search presumably also matches longer handles that
# contain this text; confirm against the format of the Mentions field.
marquez = events.where(events["Mentions"].rlike("marcmarquez93"))
marquez.count()

58117

In [6]:
# Rows whose Mentions field matches Dani Pedrosa's handle (regular-expression
# search), followed by the number of such rows.
pedrosa = events.where(events["Mentions"].rlike("26_danipedrosa"))
pedrosa.count()

12342

In [7]:
# Rows whose Mentions field matches Valentino Rossi's handle (regular-expression
# search), followed by the number of such rows.
rossi = events.where(events["Mentions"].rlike("valeyellow46"))
rossi.count()

61121

## B) Contabilizar los 5 países que más tweets han publicado (considerando los tweets que contengan dicha información).

In [8]:
# Five countries with the most tweets. Rows whose Country is the "not public"
# placeholder are excluded, and tweets are counted through the Id column.
(events.where(col("Country") != "not public")
 .groupBy("Country")
 .agg(count("Id").alias("tweets"))
 .orderBy(col("tweets").desc())
 .limit(5)
 .show())

+-------+------+
|Country|tweets|
+-------+------+
|     es|172577|
|     us| 12722|
|     gb| 12588|
|     id|  8725|
|     it|  1843|
+-------+------+



## C) Contabilizar los 3 hashtags más utilizados (que aparezcan el mayor número de veces) en el cuerpo de los tweets (campo "Body").

In [9]:
# Three most frequent hashtags in the tweet bodies.
# Null bodies are dropped with isNotNull(): comparing a column to None with
# `!=` becomes a SQL comparison against NULL, which is never true, so it cannot
# be used as a null check.
# Each body is split on single spaces, tokens starting with '#' are kept, and
# occurrences are summed per hashtag (case-sensitive, exact token match).
(events.filter(events.Body.isNotNull())
     .select("Body")
     .rdd.flatMap(lambda row: row.Body.split(" "))
     .filter(lambda token: token.startswith('#'))
     .map(lambda hashtag: (hashtag, 1))
     .reduceByKey(lambda left, right: left + right)
     .takeOrdered(3, key=lambda pair: -pair[1]))

[('#motogp', 51961), ('#qatar', 9977), ('#moto3', 5797)]

In [10]:
# Shut down through the session rather than the bare context: this stops the
# underlying SparkContext as well.
# NOTE(review): the session-level stop is expected to also clear the cached
# session, so re-running the first cell in the same kernel builds a fresh one.
# Confirm against the installed Spark version.
spark.stop()