In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession \
.builder \
.appName("Python Spark create RDD example") \
.config("spark.some.config.option", "some-value") \
.getOrCreate()

In [4]:
df = spark.sparkContext.parallelize([(1, 2, 3, 'a b c'),
(4, 5, 6, 'd e f'),
(7, 8, 9, 'g h i')]).toDF(['col1', 'col2', 'col3','col4'])


In [5]:
df.show()

+----+----+----+-----+
|col1|col2|col3| col4|
+----+----+----+-----+
|   1|   2|   3|a b c|
|   4|   5|   6|d e f|
|   7|   8|   9|g h i|
+----+----+----+-----+



In [6]:
#Reading from the file and creating dataframe
df = spark.read.csv("data/Artists.csv",header = "true")

In [7]:
df.show(5)

+-------------+---------------+-------------------+-----------+------+---------+-------+--------+---------+
|ConstituentID|    DisplayName|          ArtistBio|Nationality|Gender|BeginDate|EndDate|Wiki QID|     ULAN|
+-------------+---------------+-------------------+-----------+------+---------+-------+--------+---------+
|            1| Robert Arneson|American, 1930–1992|   American|  Male|     1930|   1992|    null|     null|
|            2| Doroteo Arnaiz| Spanish, born 1936|    Spanish|  Male|     1936|      0|    null|     null|
|            3|    Bill Arnold|American, born 1941|   American|  Male|     1941|      0|    null|     null|
|            4|Charles Arnoldi|American, born 1946|   American|  Male|     1946|      0|Q1063584|500027998|
|            5|    Per Arnoldi|  Danish, born 1941|     Danish|  Male|     1941|      0|    null|     null|
+-------------+---------------+-------------------+-----------+------+---------+-------+--------+---------+
only showing top 5 rows



In [8]:
df.printSchema()

root
 |-- ConstituentID: string (nullable = true)
 |-- DisplayName: string (nullable = true)
 |-- ArtistBio: string (nullable = true)
 |-- Nationality: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- BeginDate: string (nullable = true)
 |-- EndDate: string (nullable = true)
 |-- Wiki QID: string (nullable = true)
 |-- ULAN: string (nullable = true)



In [9]:
mapping = {'DisplayName':"ShowName"}
newNames = [mapping.get(col,col) for col in df.columns]
df.toDF(*newNames).show(5)

+-------------+---------------+-------------------+-----------+------+---------+-------+--------+---------+
|ConstituentID|       ShowName|          ArtistBio|Nationality|Gender|BeginDate|EndDate|Wiki QID|     ULAN|
+-------------+---------------+-------------------+-----------+------+---------+-------+--------+---------+
|            1| Robert Arneson|American, 1930–1992|   American|  Male|     1930|   1992|    null|     null|
|            2| Doroteo Arnaiz| Spanish, born 1936|    Spanish|  Male|     1936|      0|    null|     null|
|            3|    Bill Arnold|American, born 1941|   American|  Male|     1941|      0|    null|     null|
|            4|Charles Arnoldi|American, born 1946|   American|  Male|     1946|      0|Q1063584|500027998|
|            5|    Per Arnoldi|  Danish, born 1941|     Danish|  Male|     1941|      0|    null|     null|
+-------------+---------------+-------------------+-----------+------+---------+-------+--------+---------+
only showing top 5 rows



In [10]:
df[df.Nationality=="Indian"].show()

+-------------+--------------------+-------------------+-----------+------+---------+-------+--------+---------+
|ConstituentID|         DisplayName|          ArtistBio|Nationality|Gender|BeginDate|EndDate|Wiki QID|     ULAN|
+-------------+--------------------+-------------------+-----------+------+---------+-------+--------+---------+
|          544|         Jyoti Bhatt|  Indian, born 1934|     Indian|  Male|     1934|      0|    null|     null|
|         2051| Vasudeo S. Gaitonde|  Indian, 1924–2001|     Indian|  Male|     1924|   2001|    null|     null|
|         2411|       Satish Gujral|  Indian, born 1925|     Indian|  Male|     1925|      0|Q7426282|500091206|
|         3078|   Bhupen P. Khakhar|  Indian, 1934–2003|     Indian|  Male|     1934|   2003|    null|     null|
|         3079|      Krishen Khanna|  Indian, born 1925|     Indian|  Male|     1925|      0|    null|     null|
|         4836|    N. Krishna Reddy|Indian, 1925 - 2018|     Indian|  Male|     1925|   2018|   

In [11]:
import pyspark.sql.functions as F
df.withColumn("EndDateNorm",df.EndDate/df.groupBy().agg(F.sum("EndDate")).collect()[0][0]).show(4)

+-------------+---------------+-------------------+-----------+------+---------+-------+--------+---------+--------------------+
|ConstituentID|    DisplayName|          ArtistBio|Nationality|Gender|BeginDate|EndDate|Wiki QID|     ULAN|         EndDateNorm|
+-------------+---------------+-------------------+-----------+------+---------+-------+--------+---------+--------------------+
|            1| Robert Arneson|American, 1930–1992|   American|  Male|     1930|   1992|    null|     null|2.009788030925310...|
|            2| Doroteo Arnaiz| Spanish, born 1936|    Spanish|  Male|     1936|      0|    null|     null|                 0.0|
|            3|    Bill Arnold|American, born 1941|   American|  Male|     1941|      0|    null|     null|                 0.0|
|            4|Charles Arnoldi|American, born 1946|   American|  Male|     1946|      0|Q1063584|500027998|                 0.0|
+-------------+---------------+-------------------+-----------+------+---------+-------+--------+

In [12]:
df.groupBy(['Gender']).agg({'Nationality': 'count'}).show()

+------+------------------+
|Gender|count(Nationality)|
+------+------------------+
|  null|              1355|
|Female|              2203|
|female|                 1|
|  male|                15|
|  Male|              9747|
+------+------------------+



In [19]:
german = spark.read.csv("Data 2/german_credit.csv",header = True)

df.columns

In [21]:
german.columns

['Creditability',
 'Account Balance',
 'Duration of Credit (month)',
 'Payment Status of Previous Credit',
 'Purpose',
 'Credit Amount',
 'Value Savings/Stocks',
 'Length of current employment',
 'Instalment per cent',
 'Sex & Marital Status',
 'Guarantors',
 'Duration in Current address',
 'Most valuable available asset',
 'Age (years)',
 'Concurrent Credits',
 'Type of apartment',
 'No of Credits at this Bank',
 'Occupation',
 'No of dependents',
 'Telephone',
 'Foreign Worker']

In [22]:
num_cols = ['Account Balance','No of dependents']
german.select(num_cols).describe().show()

+-------+------------------+-------------------+
|summary|   Account Balance|   No of dependents|
+-------+------------------+-------------------+
|  count|              1000|               1000|
|   mean|             2.577|              1.155|
| stddev|1.2576377271108936|0.36208577175319395|
|    min|                 1|                  1|
|    max|                 4|                  2|
+-------+------------------+-------------------+



In [33]:
from pyspark.sql.functions import skewness, kurtosis
german.select(skewness("Account Balance"), kurtosis("Account Balance")).show()

+-------------------------+-------------------------+
|skewness(Account Balance)|kurtosis(Account Balance)|
+-------------------------+-------------------------+
|     0.006946592744377805|      -1.6613901748002375|
+-------------------------+-------------------------+



In [31]:
from pyspark.sql import functions as F
from pyspark.sql.functions import rank,sum,col
pivot_by_age_amount = german.select(["Age (years)","Credit Amount"]) \
                            .groupBy("Age (years)") \
                            .agg(F.count("Credit Amount").alias("Credit_num"), \
                            F.mean('Credit Amount').alias('Credit_avg'), \
                            F.min('Credit Amount').alias('Credit_min'),
                            F.max('Credit Amount').alias('Credit_max'))
                        

In [48]:
pivot_by_age_amount.show()

+-----------+----------+------------------+----------+----------+
|Age (years)|Credit_num|        Credit_avg|Credit_min|Credit_max|
+-----------+----------+------------------+----------+----------+
|         51|         8|            3058.0|      1164|      7511|
|         54|        10|            2661.0|      1318|      7432|
|         29|        37| 3509.945945945946|      1103|       959|
|         42|        22| 4211.318181818182|     10366|      8318|
|         64|         5|            1948.4|      1364|       753|
|         30|        40|          3966.875|      1055|       960|
|         34|        33| 3387.242424242424|     11998|      6999|
|         59|         3|            4275.0|      1364|      6416|
|         22|        27|2109.5925925925926|      1007|       806|
|         28|        43|2934.6976744186045|      1068|      9572|
|         35|        40|           3216.95|      1050|       976|
|         52|         9| 2063.222222222222|      2133|       936|
|         