In [1]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\Toshiba\anaconda3\python.exe -m pip install --upgrade pip' command.


In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Spark').getOrCreate()

In [None]:
spark

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

liste = [(1, "Joseph", "Forest", 32, 12000),
        (2, "Tommy", "Happy", 38, 13000),
        (3, "Thomas", "Horse", 30, 11000),
        (4, "Strato", "Wise", 37 , 15000)]

# columns = ["ID","Name","Surname","Age","Salary"]

# df = spark.createDataFrame(liste,columns)

list_schema = StructType([StructField("ID", IntegerType(), False),
                          StructField("Name", StringType(), True),
                          StructField("Surname", StringType(), True),
                          StructField("Age", IntegerType(), True),
                          StructField("Salary", IntegerType(), True)
                          ])

sample_df = spark.createDataFrame(data=liste, schema=list_schema)

In [None]:
sample_df.printSchema()

root
 |-- ID: integer (nullable = false)
 |-- Name: string (nullable = true)
 |-- Surname: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [None]:
sample_df.show()

+---+------+-------+---+------+
| ID|  Name|Surname|Age|Salary|
+---+------+-------+---+------+
|  1|Joseph| Forest| 32| 12000|
|  2| Tommy|  Happy| 38| 13000|
|  3|Thomas|  Horse| 30| 11000|
|  4|Strato|   Wise| 37| 15000|
+---+------+-------+---+------+



In [None]:
sample_df.head(2)

[Row(ID=1, Name='Joseph', Surname='Forest', Age=32, Salary=12000),
 Row(ID=2, Name='Tommy', Surname='Happy', Age=38, Salary=13000)]

In [None]:
sample_df.columns

['ID', 'Name', 'Surname', 'Age', 'Salary']

In [None]:
sample_df.describe().show()

+-------+------------------+------+-------+------------------+-----------------+
|summary|                ID|  Name|Surname|               Age|           Salary|
+-------+------------------+------+-------+------------------+-----------------+
|  count|                 4|     4|      4|                 4|                4|
|   mean|               2.5|  null|   null|             34.25|          12750.0|
| stddev|1.2909944487358056|  null|   null|3.8622100754188224|1707.825127659933|
|    min|                 1|Joseph| Forest|                30|            11000|
|    max|                 4| Tommy|   Wise|                38|            15000|
+-------+------------------+------+-------+------------------+-----------------+



In [None]:
sample_df.select(["Name","Salary"]).show()

+------+------+
|  Name|Salary|
+------+------+
|Joseph| 12000|
| Tommy| 13000|
|Thomas| 11000|
|Strato| 15000|
+------+------+



In [None]:
sample_df.head(2)[0].asDict()#["Last Name"]

{'Age': 32, 'ID': 1, 'Name': 'Joseph', 'Salary': 12000, 'Surname': 'Forest'}

In [None]:
sample_df.withColumn("new_salary", sample_df["Salary"]*1.2).show()

+---+------+-------+---+------+----------+
| ID|  Name|Surname|Age|Salary|new_salary|
+---+------+-------+---+------+----------+
|  1|Joseph| Forest| 32| 12000|   14400.0|
|  2| Tommy|  Happy| 38| 13000|   15600.0|
|  3|Thomas|  Horse| 30| 11000|   13200.0|
|  4|Strato|   Wise| 37| 15000|   18000.0|
+---+------+-------+---+------+----------+



In [None]:
from pyspark.sql.functions import col, lit
sample_df.withColumn("new_age", (col("Age")+2).cast("String")).show()

+---+------+-------+---+------+-------+
| ID|  Name|Surname|Age|Salary|new_age|
+---+------+-------+---+------+-------+
|  1|Joseph| Forest| 32| 12000|     34|
|  2| Tommy|  Happy| 38| 13000|     40|
|  3|Thomas|  Horse| 30| 11000|     32|
|  4|Strato|   Wise| 37| 15000|     39|
+---+------+-------+---+------+-------+



In [None]:
sample_df.withColumn("Country",lit("TR")).show()

+---+------+-------+---+------+-------+
| ID|  Name|Surname|Age|Salary|Country|
+---+------+-------+---+------+-------+
|  1|Joseph| Forest| 32| 12000|     TR|
|  2| Tommy|  Happy| 38| 13000|     TR|
|  3|Thomas|  Horse| 30| 11000|     TR|
|  4|Strato|   Wise| 37| 15000|     TR|
+---+------+-------+---+------+-------+



In [None]:
sample_df.select(col("Name"),col("Surname"),lit("TR").alias("Country")).show()

+------+-------+-------+
|  Name|Surname|Country|
+------+-------+-------+
|Joseph| Forest|     TR|
| Tommy|  Happy|     TR|
|Thomas|  Horse|     TR|
|Strato|   Wise|     TR|
+------+-------+-------+



In [None]:
from pyspark.sql.functions import when

sample_df.withColumn("Salary Level", when(col("Salary") >12000,lit("High")).otherwise(lit("Normal"))).show()



+---+------+-------+---+------+------------+
| ID|  Name|Surname|Age|Salary|Salary Level|
+---+------+-------+---+------+------------+
|  1|Joseph| Forest| 32| 12000|      Normal|
|  2| Tommy|  Happy| 38| 13000|        High|
|  3|Thomas|  Horse| 30| 11000|      Normal|
|  4|Strato|   Wise| 37| 15000|        High|
+---+------+-------+---+------+------------+



In [None]:
sample_df.show()

+---+------+-------+---+------+
| ID|  Name|Surname|Age|Salary|
+---+------+-------+---+------+
|  1|Joseph| Forest| 32| 12000|
|  2| Tommy|  Happy| 38| 13000|
|  3|Thomas|  Horse| 30| 11000|
|  4|Strato|   Wise| 37| 15000|
+---+------+-------+---+------+



In [None]:
sample_df.withColumnRenamed("Last Name","Surname").show()

+---+------+-------+---+------+
| ID|  Name|Surname|Age|Salary|
+---+------+-------+---+------+
|  1|Joseph| Forest| 32| 12000|
|  2| Tommy|  Happy| 38| 13000|
|  3|Thomas|  Horse| 30| 11000|
|  4|Strato|   Wise| 37| 15000|
+---+------+-------+---+------+



In [None]:
sample_df.withColumnRenamed("Last Name","Surname") \
         .withColumnRenamed("ID", "Number").show()

+------+------+-------+---+------+
|Number|  Name|Surname|Age|Salary|
+------+------+-------+---+------+
|     1|Joseph| Forest| 32| 12000|
|     2| Tommy|  Happy| 38| 13000|
|     3|Thomas|  Horse| 30| 11000|
|     4|Strato|   Wise| 37| 15000|
+------+------+-------+---+------+



In [None]:
sample_df.drop("ID").show()

+------+-------+---+------+
|  Name|Surname|Age|Salary|
+------+-------+---+------+
|Joseph| Forest| 32| 12000|
| Tommy|  Happy| 38| 13000|
|Thomas|  Horse| 30| 11000|
|Strato|   Wise| 37| 15000|
+------+-------+---+------+



In [None]:
df_collect = sample_df.collect()

In [None]:
df_collect#[1][2]

[Row(ID=1, Name='Joseph', Surname='Forest', Age=32, Salary=12000),
 Row(ID=2, Name='Tommy', Surname='Happy', Age=38, Salary=13000),
 Row(ID=3, Name='Thomas', Surname='Horse', Age=30, Salary=11000),
 Row(ID=4, Name='Strato', Surname='Wise', Age=37, Salary=15000)]

In [None]:
for row in df_collect:
  print(row["Name"] + " " + row["Surname"])

Joseph Forest
Tommy Happy
Thomas Horse
Strato Wise


In [None]:
sample_df.createOrReplaceTempView("personel")

In [None]:
query = spark.sql("SELECT * FROM personel WHERE Salary>12000")
query.show()

+---+------+-------+---+------+
| ID|  Name|Surname|Age|Salary|
+---+------+-------+---+------+
|  2| Tommy|  Happy| 38| 13000|
|  4|Strato|   Wise| 37| 15000|
+---+------+-------+---+------+



In [None]:
sample_df.filter("Age<38").show()

+---+------+-------+---+------+
| ID|  Name|Surname|Age|Salary|
+---+------+-------+---+------+
|  1|Joseph| Forest| 32| 12000|
|  3|Thomas|  Horse| 30| 11000|
|  4|Strato|   Wise| 37| 15000|
+---+------+-------+---+------+



In [None]:
sample_df.filter(sample_df["Age"]<38).select(["Name", "Salary"]).show()

+------+------+
|  Name|Salary|
+------+------+
|Joseph| 12000|
|Thomas| 11000|
|Strato| 15000|
+------+------+



In [None]:
sample_df.filter( (sample_df["Age"]<38) & (sample_df["Salary"]<15000) ).select(["Name", "Salary"]).show()

+------+------+
|  Name|Salary|
+------+------+
|Joseph| 12000|
|Thomas| 11000|
+------+------+



In [None]:
sales_df = spark.read.csv("sales.csv",header=True, inferSchema=True)
sales_df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [None]:
sales_df.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: double (nullable = true)



In [None]:
sales_df.groupBy("Company").max().show()

+-------+----------+
|Company|max(Sales)|
+-------+----------+
|   APPL|     750.0|
|   GOOG|     340.0|
|     FB|     870.0|
|   MSFT|     600.0|
+-------+----------+



In [None]:
sales_df.agg({"sales":"mean"}).show()

+-----------------+
|       avg(sales)|
+-----------------+
|360.5833333333333|
+-----------------+



In [None]:
groupdf = sales_df.groupBy("Company")
groupdf.agg({"sales":"max"}).show()

+-------+----------+
|Company|max(sales)|
+-------+----------+
|   APPL|     750.0|
|   GOOG|     340.0|
|     FB|     870.0|
|   MSFT|     600.0|
+-------+----------+



In [None]:
sales_df.groupBy("Company").count().orderBy("count").show() ##  value_counts

+-------+-----+
|Company|count|
+-------+-----+
|     FB|    2|
|   GOOG|    3|
|   MSFT|    3|
|   APPL|    4|
+-------+-----+



In [None]:
from pyspark.sql.functions import countDistinct, avg, stddev, format_number
sales_df.select(avg("sales").alias("Average Sales")).show()


+-----------------+
|    Average Sales|
+-----------------+
|360.5833333333333|
+-----------------+



In [None]:
avg_sale = sales_df.select(avg("sales").alias("Average Sales"))
avg_sale.select(format_number("Average Sales",2).alias("Averga Sales")).show()

+------------+
|Averga Sales|
+------------+
|      360.58|
+------------+



In [None]:
sales_df.orderBy("Sales").show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|Charlie|120.0|
|   MSFT|    Amy|124.0|
|   APPL|  Linda|130.0|
|   GOOG|    Sam|200.0|
|   MSFT|Vanessa|243.0|
|   APPL|   John|250.0|
|   GOOG|  Frank|340.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   MSFT|   Tina|600.0|
|   APPL|   Mike|750.0|
|     FB|   Carl|870.0|
+-------+-------+-----+



In [None]:
sales_df.orderBy(sales_df["Sales"].desc()).show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|     FB|   Carl|870.0|
|   APPL|   Mike|750.0|
|   MSFT|   Tina|600.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   GOOG|  Frank|340.0|
|   APPL|   John|250.0|
|   MSFT|Vanessa|243.0|
|   GOOG|    Sam|200.0|
|   APPL|  Linda|130.0|
|   MSFT|    Amy|124.0|
|   GOOG|Charlie|120.0|
+-------+-------+-----+



In [None]:
liste2 = [(1, "Joseph", "Forest", 32, None),
          (2, "Tommy", None,  38, 13000),
          (3, "Thomas", None, None, None),
          (4, "Strato", "Wise", 37 , 15000)]

columns2 = ["ID","Name","Surname","Age","Salary"]

missing_df = spark.createDataFrame(liste2,columns2)

In [None]:
missing_df.show()

+---+------+-------+----+------+
| ID|  Name|Surname| Age|Salary|
+---+------+-------+----+------+
|  1|Joseph| Forest|  32|  null|
|  2| Tommy|   null|  38| 13000|
|  3|Thomas|   null|null|  null|
|  4|Strato|   Wise|  37| 15000|
+---+------+-------+----+------+



In [None]:
missing_df.na.drop().show()

+---+------+-------+---+------+
| ID|  Name|Surname|Age|Salary|
+---+------+-------+---+------+
|  4|Strato|   Wise| 37| 15000|
+---+------+-------+---+------+



In [None]:
missing_df.na.drop(subset=["Salary"]).show()

+---+------+-------+---+------+
| ID|  Name|Surname|Age|Salary|
+---+------+-------+---+------+
|  2| Tommy|   null| 38| 13000|
|  4|Strato|   Wise| 37| 15000|
+---+------+-------+---+------+



In [None]:
missing_df.na.fill("fill value").show()

+---+------+----------+----+------+
| ID|  Name|   Surname| Age|Salary|
+---+------+----------+----+------+
|  1|Joseph|    Forest|  32|  null|
|  2| Tommy|fill value|  38| 13000|
|  3|Thomas|fill value|null|  null|
|  4|Strato|      Wise|  37| 15000|
+---+------+----------+----+------+



In [None]:
missing_df.na.fill(0).show()

+---+------+-------+---+------+
| ID|  Name|Surname|Age|Salary|
+---+------+-------+---+------+
|  1|Joseph| Forest| 32|     0|
|  2| Tommy|   null| 38| 13000|
|  3|Thomas|   null|  0|     0|
|  4|Strato|   Wise| 37| 15000|
+---+------+-------+---+------+



In [None]:
from pyspark.sql.functions import mean
avg_sal = missing_df.select(mean("Salary")).collect()
mean_sal = avg_sal[0][0]

In [None]:
missing_df.na.fill(mean_sal, subset="Salary").show()

+---+------+-------+----+------+
| ID|  Name|Surname| Age|Salary|
+---+------+-------+----+------+
|  1|Joseph| Forest|  32| 14000|
|  2| Tommy|   null|  38| 13000|
|  3|Thomas|   null|null| 14000|
|  4|Strato|   Wise|  37| 15000|
+---+------+-------+----+------+



In [None]:
stock_df = spark.read.csv("apple.csv", header=True, inferSchema=True )
stock_df.show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
|2010-01-11|212.79999700000002|        213.000002|      

In [None]:
from pyspark.sql.functions import year

stock_df.select(year("Date")).show()

+----------+
|year(Date)|
+----------+
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
+----------+
only showing top 20 rows



In [None]:
new_stock = stock_df.withColumn("Year", year("Date")).select(["Year", "Close"])

In [None]:
new_stock.groupBy("Year").mean().select(["Year", "avg(Close)"]).withColumnRenamed("avg(Close)", "Average Close").orderBy("Year").show()

+----+------------------+
|Year|     Average Close|
+----+------------------+
|2010| 259.8424600000002|
|2011|364.00432532142867|
|2012| 576.0497195640002|
|2013| 472.6348802857143|
|2014| 295.4023416507935|
|2015|120.03999980555547|
|2016|104.60400786904763|
+----+------------------+

