In [1]:
import os
import sys

# Working directory for this notebook. Set NOTEBOOK_WORKDIR to override it;
# the default keeps the location the notebook was originally written for.
WORK_DIR = os.environ.get("NOTEBOOK_WORKDIR", "C:/dataanalytics/python")
os.chdir(WORK_DIR)
# os.curdir is only the constant "."; os.getcwd() reports the directory actually in use.
print(os.getcwd())

# Configure the environment. Point SPARK_HOME at the directory where Spark is
# installed; a SPARK_HOME already present in the environment takes precedence.
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = 'C:\\spark'

# Root path of the Spark installation
SPARK_HOME = os.environ['SPARK_HOME']

# The py4j source zip carries its version in the file name, which changes
# between Spark releases. Look for whatever py4j-*-src.zip is installed and
# fall back to the original hard-coded name when none can be found.
spark_python_lib = os.path.join(SPARK_HOME, "python", "lib")
py4j_candidates = []
if os.path.isdir(spark_python_lib):
    py4j_candidates = sorted(
        entry for entry in os.listdir(spark_python_lib)
        if entry.startswith("py4j-") and entry.endswith("-src.zip")
    )
py4j_zip = py4j_candidates[-1] if py4j_candidates else "py4j-0.10.6-src.zip"

# Put the PySpark sources and bundled zips at the front of the module search path
sys.path.insert(0, os.path.join(SPARK_HOME, "python"))
sys.path.insert(0, spark_python_lib)
sys.path.insert(0, os.path.join(spark_python_lib, py4j_zip))
sys.path.insert(0, os.path.join(spark_python_lib, "pyspark.zip"))
 
#Initialize a spark context
from pyspark import SparkContext
from pyspark import SparkConf

# Optional Spark configuration: application name, local master with two
# worker threads, and 1 GB of executor memory.
conf = (
    SparkConf()
    .setAppName("V2Maestros")
    .setMaster("local[2]")
    .set("spark.executor.memory", "1g")
)

# Initialize the Spark context. Only one SparkContext can be active at a time,
# so calling SparkContext(conf=conf) again when this cell is re-run raises an
# error. getOrCreate() returns the running context instead.
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import dataframe
from pyspark.sql import Row
from pyspark.sql import SparkSession 
from pyspark.sql import DataFrame 
from pyspark.sql import Column 
from pyspark.sql import Row 
from pyspark.sql import GroupedData 
from pyspark.sql import DataFrameNaFunctions 
from pyspark.sql import DataFrameStatFunctions
from pyspark.sql import functions 
from pyspark.sql import types 
from pyspark.sql import Window 
from pyspark.sql import HiveContext
from pyspark.sql import SQLContext

In [4]:
# Session entry point used below for createDataFrame(), sql() and read.
# getOrCreate() hands back an existing session when one is already running.
spark = (
    SparkSession.builder
    .appName("data")
    .master("local[2]")
    .config(conf=conf)
    .getOrCreate()
)

In [9]:
# Sample DataFrame with three people, built from an RDD of Row objects.
d1 = spark.createDataFrame(
    sc.parallelize([
        Row(name=person, age=years, height=tall)
        for person, years, tall in [("Geoffrey", 28, 80), ("Sagini", 29, 80), ("Sagini", 30, 80)]
    ])
)

In [10]:
# Print the rows of d1 as a text table.
d1.show()

+---+------+--------+
|age|height|    name|
+---+------+--------+
| 28|    80|Geoffrey|
| 29|    80|  Sagini|
| 30|    80|  Sagini|
+---+------+--------+



In [11]:
# Drop rows that are identical in every column. The three sample rows all
# differ in age, so nothing is removed here.
d1.dropDuplicates().show()

+---+------+--------+
|age|height|    name|
+---+------+--------+
| 28|    80|Geoffrey|
| 29|    80|  Sagini|
| 30|    80|  Sagini|
+---+------+--------+



In [None]:
# De-duplicate considering only the (name, age) pair rather than every column.
d1.dropDuplicates(["name", "age"]).show()

In [None]:
# Drop the "name" column and bring the remaining rows back to the driver.
# (The original referenced `df1`, which is never defined in this notebook.)
d1.drop("name").collect()

In [12]:
# RDD of plain (name, age, height) tuples; it gets a schema in a later cell.
data = sc.parallelize(
    [
        ("Geoffrey", 28, 30),
        ("Sagini", 40, 40),
        ("Sagini", 30, 80),
    ]
)

In [16]:
from pyspark.sql.types import *

In [18]:
# Explicit schema: one nullable field per column, in column order.
coltype = StructType([
    StructField(col_name, col_type, True)
    for col_name, col_type in (
        ("name", StringType()),
        ("age", IntegerType()),
        ("height", IntegerType()),
    )
])

In [19]:
# Build d2 from the tuple RDD, applying the explicit schema held in coltype.
d2 = spark.createDataFrame(data, coltype)

In [20]:
# Print the rows of d2 as a text table.
d2.show()

+--------+---+------+
|    name|age|height|
+--------+---+------+
|Geoffrey| 28|    30|
|  Sagini| 40|    40|
|  Sagini| 30|    80|
+--------+---+------+



In [21]:
# describe() returns a new DataFrame of summary statistics; without .show()
# the cell only displays that DataFrame's column listing, not the values.
d2.describe()

DataFrame[summary: string, name: string, age: string, height: string]

In [22]:
# Print column names, types and nullability as a tree.
d2.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- height: integer (nullable = true)



In [23]:
# Select a single column using attribute access on the DataFrame.
d2.select(d2.name).show()

+--------+
|    name|
+--------+
|Geoffrey|
|  Sagini|
|  Sagini|
+--------+



In [24]:
# Select the same column by its string name.
d2.select("name").show()

+--------+
|    name|
+--------+
|Geoffrey|
|  Sagini|
|  Sagini|
+--------+



In [None]:
# Keep d1 rows with age > 28, join to d2 on equal height, then take the
# minimum d1 age and height per d1 name.
# groupBy() takes columns, not a DataFrame, and after the join the bare names
# "age"/"height"/"name" exist on both sides, so every column is referenced
# explicitly through d1 to stay unambiguous.
(
    d1.filter(d1.age > 28)
    .join(d2, d1.height == d2.height)
    .groupBy(d1["name"])
    .agg(functions.min(d1["age"]), functions.min(d1["height"]))
)

In [26]:
# Inner join on age: only rows whose age occurs in both d1 and d2 are returned.
d1.join(d2, d1.age==d2.age, "inner").collect()

[Row(age=28, height=80, name='Geoffrey', name='Geoffrey', age=28, height=30),
 Row(age=30, height=80, name='Sagini', name='Sagini', age=30, height=80)]

In [27]:
# Left join on age: every d1 row is kept; d2 columns are None where no age matches.
d1.join(d2, d1.age==d2.age, "left").collect()

[Row(age=29, height=80, name='Sagini', name=None, age=None, height=None),
 Row(age=28, height=80, name='Geoffrey', name='Geoffrey', age=28, height=30),
 Row(age=30, height=80, name='Sagini', name='Sagini', age=30, height=80)]

In [28]:
# Right join on age: every d2 row is kept; d1 columns are None where no age matches.
d1.join(d2, d1.age==d2.age, "right").collect()

[Row(age=28, height=80, name='Geoffrey', name='Geoffrey', age=28, height=30),
 Row(age=30, height=80, name='Sagini', name='Sagini', age=30, height=80),
 Row(age=None, height=None, name=None, name='Sagini', age=40, height=40)]

In [29]:
# "cross" join type combined with an explicit join condition.
# NOTE(review): the recorded output is the same two rows as the inner join
# above; if the full Cartesian product was intended, confirm whether
# d1.crossJoin(d2) should be used instead.
d1.join(d2, d1.age==d2.age, "cross").collect()

[Row(age=28, height=80, name='Geoffrey', name='Geoffrey', age=28, height=30),
 Row(age=30, height=80, name='Sagini', name='Sagini', age=30, height=80)]

In [30]:
# Summary statistics (count, mean, stddev, min, max) for the age column only.
d1.describe("age").show()

+-------+----+
|summary| age|
+-------+----+
|  count|   3|
|   mean|29.0|
| stddev| 1.0|
|    min|  28|
|    max|  30|
+-------+----+



In [31]:
# Per name: tallest height and youngest age, using the {column: aggregate} dict form.
d2.groupBy("name").agg({"height":"max","age":"min"}).show()

+--------+--------+-----------+
|    name|min(age)|max(height)|
+--------+--------+-----------+
|  Sagini|      30|         80|
|Geoffrey|      28|         30|
+--------+--------+-----------+



In [32]:
# Register d1 as a temporary view so it can be queried with SQL.
# createTempView() raises if the view already exists, which breaks re-running
# this cell; createOrReplaceTempView() replaces the existing view instead.
d1.createOrReplaceTempView("table")

In [34]:
# Query the temporary view named "table" with plain SQL.
spark.sql("select name, height from table ").show()

+--------+------+
|    name|height|
+--------+------+
|Geoffrey|    80|
|  Sagini|    80|
|  Sagini|    80|
+--------+------+



In [35]:
from pyspark.ml.linalg import Vectors

In [36]:
# Build a dense ML vector; the integer inputs come back as floats.
Vectors.dense([0,1,2,3,4,5,6,7,8,9,10])

DenseVector([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0])

In [8]:
# Read the `account` table of the local MySQL `bank` database over JDBC.
#
# Credentials are taken from the environment (MYSQL_USER, MYSQL_PASSWORD) so
# that no password is stored in the notebook. The password that was previously
# committed here should be considered leaked and rotated.
#
# NOTE: the MySQL Connector/J jar has to be on Spark's classpath when the JVM
# is started (e.g. via spark.jars / spark.driver.extraClassPath or --jars).
# Without it, load() fails with
# java.lang.ClassNotFoundException: com.mysql.jdbc.Driver.
mySQLDF = (
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://localhost:3306/bank")
    .option("driver", "com.mysql.jdbc.Driver")
    .option("dbtable", "account")
    .option("user", os.environ.get("MYSQL_USER", "root"))
    .option("password", os.environ["MYSQL_PASSWORD"])
    .load()
)

Py4JJavaError: An error occurred while calling o150.load.
: java.lang.ClassNotFoundException: com.mysql.jdbc.Driver
	at java.net.URLClassLoader.findClass(Unknown Source)
	at java.lang.ClassLoader.loadClass(Unknown Source)
	at java.lang.ClassLoader.loadClass(Unknown Source)
	at org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(DriverRegistry.scala:45)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$6.apply(JDBCOptions.scala:79)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$6.apply(JDBCOptions.scala:79)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:79)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:35)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:34)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:340)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:164)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Unknown Source)
