In [1]:
import findspark
findspark.init()

In [8]:
import pyspark
sc = pyspark.SparkContext()

 #  RDD API Examples

## Word Count
In this example, we use a few transformations to build a dataset of (String, Int) pairs called counts and then save it to a file.
```
sc.textFile(name, minPartitions=None, use_unicode=True)
Read a text file from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI, and return it as an RDD of Strings.
```


In [10]:
import os


text_file = sc.textFile(os.getcwd()+"/../datasets/quijote.txt")

counts = text_file.flatMap(lambda line: line.split(" ")) \
             .map(lambda word: (word, 1)) \
             .reduceByKey(lambda a, b: a + b)
counts = counts.sortBy(lambda a: a[1], ascending=False)
#NOTE: sortBy is not as efficient as sortByKey since it involves keying by the values,
#sorting by the keys, and then grabbing the values 
counts.take(50)

#counts.saveAsTextFile(os.path.join("/notebooks/","quixote-counts.txt"))


[('que', 19429),
 ('de', 17985),
 ('y', 15894),
 ('la', 10200),
 ('a', 9519),
 ('', 9195),
 ('el', 7957),
 ('en', 7898),
 ('no', 5603),
 ('se', 4690),
 ('los', 4680),
 ('con', 4047),
 ('por', 3758),
 ('las', 3423),
 ('lo', 3387),
 ('le', 3382),
 ('su', 3319),
 ('don', 2533),
 ('del', 2464),
 ('me', 2344),
 ('como', 2226),
 ('es', 1990),
 ('un', 1927),
 ('más', 1823),
 ('si', 1779),
 ('yo', 1703),
 ('al', 1696),
 ('mi', 1684),
 ('para', 1419),
 ('ni', 1350),
 ('una', 1300),
 ('y,', 1250),
 ('tan', 1217),
 ('porque', 1189),
 ('o', 1159),
 ('sin', 1139),
 ('que,', 1069),
 ('sus', 1047),
 ('ha', 1038),
 ('él', 1034),
 ('había', 1006),
 ('ser', 997),
 ('todo', 963),
 ('Sancho', 950),
 ('Quijote', 893),
 ('-dijo', 873),
 ('bien', 862),
 ('-respondió', 813),
 ('vuestra', 792),
 ('señor', 732)]

## Pi Estimation

Spark can also be used for compute-intensive tasks. This code estimates π by "throwing darts" at a circle. We pick random points in the unit square ((0, 0) to (1,1)) and see how many fall in the unit circle. The fraction should be π / 4, so we use this to get our estimate.

In [11]:
import random

NUM_SAMPLES=12000000

def inside(p):
    x, y = random.random(), random.random()
    return x*x + y*y < 1

count = sc.parallelize(range(0, NUM_SAMPLES)) \
             .filter(inside).count()
print ("Pi is roughly {}".format(4.0 * count / NUM_SAMPLES))

Pi is roughly 3.1407213333333335


# DataFrame API Examples

In this example, we count al quijote lines mentioning Dulcinea.

In [4]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col

spark = SparkSession(sc)

text_file = sc.textFile(os.getcwd()+"/../datasets/quijote.txt")
# Creates a DataFrame having a single column named "line"
df = text_file.map(lambda r: Row(r)).toDF(["line"])
dulcinea_lines = df.filter(col("line").like("%Dulcinea%"))
# Counts all the Dulcinea lines
print("There are {} lines with 'Dulcinea'".format(dulcinea_lines.count()))
# Counts lines mentioning Dulcinea and Quijote
print("There are {} lines with 'Dulcinea' and 'Quijote'".format(
    dulcinea_lines.filter(col("line").like("%Quijote%")).count()))
# Fetches the lines as an array of strings
dulcinea_lines.filter(col("line").like("%Quijote%")).collect()

There are 282 lines with 'Dulcinea'
There are 12 lines with 'Dulcinea' and 'Quijote'


[Row(line='aquella noche no durmió don Quijote, pensando en su señora Dulcinea, por'),
 Row(line='Quijote pedía, y sin preguntar quién Dulcinea fuese, le prometió que el'),
 Row(line='Carta de don Quijote a Dulcinea del Toboso'),
 Row(line='don Quijote que si, en nombrando a Dulcinea, no decía también del Toboso,'),
 Row(line='Don Quijote, que tales blasfemias oyó decir contra su señora Dulcinea, no'),
 Row(line='-Si no fue la que llevaste a la señora Dulcinea -replicó don Quijote-, yo'),
 Row(line='a don Quijote que qué nuevas tenía de la señora Dulcinea, y que si le había'),
 Row(line='-A eso puedo decir -respondió don Quijote- que Dulcinea es hija de sus'),
 Row(line='Dulcinea del Toboso, ni le llevó la carta del señor don Quijote, porque se'),
 Row(line='encantada Dulcinea; en don Quijote, por no poder asegurarse si era verdad o'),
 Row(line='que pinta a don Quijote ya desenamorado de Dulcinea del Toboso.'),
 Row(line='-No quiero saber más -dijo don Quijote-; que como yo vea a Dulc

### Exploring the superheroes dataset

In [8]:
superhero_df = spark.read.csv(os.getcwd()+"/../datasets/superheroes_info.csv", inferSchema = True, header = True)
superhero_df.show(10)

+---+---------------+--------+--------+------+---------+------+------+------+--------+---------+---------+-------------+------+-----------+-------------------+--------------------+
|_c0|           Name|Identity|  Status|Gender|Alignment|  Race|Height|Weight|EyeColor|HairColor|SkinColor|    Publisher|  Year|Appearances|    FirstAppearance|      AdditionalData|
+---+---------------+--------+--------+------+---------+------+------+------+--------+---------+---------+-------------+------+-----------+-------------------+--------------------+
|  0|     Spider-Man|  Secret|  Living|  Male|     Good| Human| 178.0|  74.0|   Hazel|    Brown|     null|Marvel Comics|1962.0|     4043.0|1962-08-01 00:00:00|        Peter Parker|
|  1|     Spider-Man|  Secret|  Living|  Male|     Good| Human| 178.0|  77.0|     Red|    Brown|     null|Marvel Comics|1962.0|     4043.0|1962-08-01 00:00:00|        Peter Parker|
|  2|     Spider-Man|  Secret|  Living|  Male|     Good| Human| 157.0|  56.0|   Brown|    Black

In [12]:
publisher_df = superhero_df.groupby("Publisher").count().show()

+-----------------+-----+
|        Publisher|count|
+-----------------+-----+
|        Rebellion|    1|
|               DC| 6808|
|    HarperCollins|    7|
| J. R. R. Tolkien|    1|
|        Star Trek|    6|
|    Marvel Comics|  482|
|        Wildstorm|    3|
|             null|   13|
|       South Park|    1|
|    Sony Pictures|    2|
|      Titan Books|    1|
|      ABC Studios|    4|
|             SyFy|    5|
|     Image Comics|   15|
|Universal Studios|    1|
|   IDW Publishing|    4|
|           Marvel|16109|
|     NBC - Heroes|   19|
|    Hanna-Barbera|    1|
|        DC Comics|  241|
+-----------------+-----+
only showing top 20 rows



### Spark SQL Example

In [17]:
superhero_df.createOrReplaceTempView("superhero_table")
spark.sql("select Name,Gender,Status from superhero_table").show()

+---------------+------+--------+
|           Name|Gender|  Status|
+---------------+------+--------+
|     Spider-Man|  Male|  Living|
|     Spider-Man|  Male|  Living|
|     Spider-Man|  Male|  Living|
|Captain America|  Male|  Living|
|Captain America|  Male|  Living|
|Captain America|  Male|  Living|
|      Wolverine|  Male|  Living|
|      Wolverine|  Male|  Living|
|       Iron Man|  Male|  Living|
|       Iron Man|  Male|Deceased|
|       Iron Man|  Male|  Living|
|           Thor|  Male|  Living|
|           Thor|  Male|  Living|
|           Thor|  Male|  Living|
|           Thor|  Male|  Living|
| Benjamin Grimm|  Male|  Living|
| Benjamin Grimm|  Male|  Living|
| Benjamin Grimm|  Male|Deceased|
| Benjamin Grimm|  Male|  Living|
| Benjamin Grimm|  Male|  Living|
+---------------+------+--------+
only showing top 20 rows



# Machine Learning Examples

## Prediction with Logistic Regression

In [5]:
# Every record of this DataFrame contains the label and
# features represented by a vector.
df = spark.createDataFrame(data, ["label", "features"])

# Set parameters for the algorithm.
# Here, we limit the number of iterations to 10.
lr = LogisticRegression(maxIter=10)

# Fit the model to the data.
model = lr.fit(df)

# Given a dataset, predict each point's label, and show the results.
model.transform(df).show()

NameError: name 'data' is not defined

In [12]:

sc.stop()
