## RDD Example:

Read the simple list of [1,2,3,4,5] and compute summation  

In [4]:
from pyspark import SparkContext

In [5]:
sc = SparkContext.getOrCreate()

In [6]:
data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)

In [7]:
distData.collect()# fetch the entire rdd to a single machine 


[1, 2, 3, 4, 5]

In [8]:
distData.take(2) # fetch only the first two elements

[1, 2]

In [9]:
distData.reduce(lambda a, b: a + b)


15

## DataFrame Spark SQL 

Read a CSV data from Kaggle's <a href="https://www.kaggle.com/mlg-ulb/creditcardfraud/data"> Credit Card Fraud dataset </a> in to a spark DataFrame. 

In [10]:
import pyspark as spark
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
#For pandas like visualization - no need to use df.show()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) 

In [11]:
df = spark.read.csv('creditcard.csv', header=True)
#header=True says that the first line is column name not values

In [12]:


df

Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.3598071336738,-0.0727811733098497,2.53634673796914,1.37815522427443,-0.338320769942518,0.462387777762292,0.239598554061257,0.0986979012610507,0.363786969611213,0.0907941719789316,-0.551599533260813,-0.617800855762348,-0.991389847235408,-0.311169353699879,1.46817697209427,-0.470400525259478,0.207971241929242,0.0257905801985591,0.403992960255733,0.251412098239705,-0.018306777944153,0.277837575558899,-0.110473910188767,0.0669280749146731,0.128539358273528,-0.189114843888824,0.133558376740387,-0.0210530534538215,149.62,0
0,1.19185711131486,0.26615071205963,0.16648011335321,0.448154078460911,0.0600176492822243,-0.0823608088155687,-0.0788029833323113,0.0851016549148104,-0.255425128109186,-0.166974414004614,1.61272666105479,1.06523531137287,0.48909501589608,-0.143772296441519,0.635558093258208,0.463917041022171,-0.114804663102346,-0.183361270123994,-0.145783041325259,-0.0690831352230203,-0.225775248033138,-0.638671952771851,0.101288021253234,-0.339846475529127,0.167170404418143,0.125894532368176,-0.0089830991432281,0.0147241691924927,2.69,0
1,-1.35835406159823,-1.34016307473609,1.77320934263119,0.379779593034328,-0.503198133318193,1.80049938079263,0.791460956450422,0.247675786588991,-1.51465432260583,0.207642865216696,0.624501459424895,0.066083685268831,0.717292731410831,-0.165945922763554,2.34586494901581,-2.89008319444231,1.10996937869599,-0.121359313195888,-2.26185709530414,0.524979725224404,0.247998153469754,0.771679401917229,0.909412262347719,-0.689280956490685,-0.327641833735251,-0.139096571514147,-0.0553527940384261,-0.0597518405929204,378.66,0
1,-0.966271711572087,-0.185226008082898,1.79299333957872,-0.863291275036453,-0.0103088796030823,1.24720316752486,0.23760893977178,0.377435874652262,-1.38702406270197,-0.0549519224713749,-0.226487263835401,0.178228225877303,0.507756869957169,-0.28792374549456,-0.631418117709045,-1.0596472454325,-0.684092786345479,1.96577500349538,-1.2326219700892,-0.208037781160366,-0.108300452035545,0.0052735967825345,-0.190320518742841,-1.17557533186321,0.647376034602038,-0.221928844458407,0.0627228487293033,0.0614576285006353,123.5,0
2,-1.15823309349523,0.877736754848451,1.548717846511,0.403033933955121,-0.407193377311653,0.0959214624684256,0.592940745385545,-0.270532677192282,0.817739308235294,0.753074431976354,-0.822842877946363,0.53819555014995,1.3458515932154,-1.11966983471731,0.175121130008994,-0.451449182813529,-0.237033239362776,-0.0381947870352842,0.803486924960175,0.408542360392758,-0.0094306971323291,0.79827849458971,-0.137458079619063,0.141266983824769,-0.206009587619756,0.502292224181569,0.219422229513348,0.215153147499206,69.99,0
2,-0.425965884412454,0.960523044882985,1.14110934232219,-0.168252079760302,0.42098688077219,-0.0297275516639742,0.476200948720027,0.260314333074874,-0.56867137571251,-0.371407196834471,1.34126198001957,0.359893837038039,-0.358090652573631,-0.137133700217612,0.517616806555742,0.401725895589603,-0.0581328233640131,0.0686531494425432,-0.0331937877876282,0.0849676720682049,-0.208253514656728,-0.559824796253248,-0.0263976679795373,-0.371426583174346,-0.232793816737034,0.105914779097957,0.253844224739337,0.0810802569229443,3.67,0
4,1.22965763450793,0.141003507049326,0.0453707735899449,1.20261273673594,0.191880988597645,0.272708122899098,-0.0051590028825098,0.0812129398830894,0.464959994783886,-0.0992543211289237,-1.41690724314928,-0.153825826253651,-0.75106271556262,0.16737196252175,0.0501435942254188,-0.443586797916727,0.002820512472347,-0.61198733994012,-0.0455750446637976,-0.21963255278686,-0.167716265815783,-0.270709726172363,-0.154103786809305,-0.780055415004671,0.75013693580659,-0.257236845917139,0.0345074297438413,0.0051677689062491,4.99,0
7,-0.644269442348146,1.41796354547385,1.0743803763556,-0.492199018495015,0.948934094764157,0.428118462833089,1.12063135838353,-3.80786423873589,0.615374730667027,1.24937617815176,-0.619467796121913,0.291474353088705,1.75796421396042,-1.32386521970526,0.686132504394383,-0.0761269994382006,-1.2221273453247,-0.358221569869078,0.324504731321494,-0.156741852488285,1.94346533978412,-1.01545470979971,0.057503529867291,-0.649709005559993,-0.415266566234811,-0.0516342969262494,-1.20692108094258,-1.08533918832377,40.8,0
7,-0.89428608220282,0.286157196276544,-0.113192212729871,-0.271526130088604,2.6695986595986,3.72181806112751,0.370145127676916,0.851084443200905,-0.392047586798604,-0.410430432848439,-0.705116586646536,-0.110452261733098,-0.286253632470583,0.0743553603016731,-0.328783050303565,-0.210077268148783,-0.499767968800267,0.118764861004217,0.57032816746536,0.0527356691149697,-0.0734251001059225,-0.268091632235551,-0.204232669947878,1.0115918018785,0.373204680146282,-0.384157307702294,0.0117473564581996,0.14240432992147,93.2,0
9,-0.33826175242575,1.11959337641566,1.04436655157316,-0.222187276738296,0.49936080649727,-0.24676110061991,0.651583206489972,0.0695385865186387,-0.736727316364109,-0.366845639206541,1.01761446783262,0.836389570307029,1.00684351373408,-0.443522816876142,0.150219101422635,0.739452777052119,-0.540979921943059,0.47667726004282,0.451772964394125,0.203711454727929,-0.246913936910008,-0.633752642406113,-0.12079408408185,-0.385049925313426,-0.0697330460416923,0.0941988339514961,0.246219304619926,0.0830756493473326,3.68,0


In [13]:
df.printSchema() # shows us the columns name and their data type.


root
 |-- Time: string (nullable = true)
 |-- V1: string (nullable = true)
 |-- V2: string (nullable = true)
 |-- V3: string (nullable = true)
 |-- V4: string (nullable = true)
 |-- V5: string (nullable = true)
 |-- V6: string (nullable = true)
 |-- V7: string (nullable = true)
 |-- V8: string (nullable = true)
 |-- V9: string (nullable = true)
 |-- V10: string (nullable = true)
 |-- V11: string (nullable = true)
 |-- V12: string (nullable = true)
 |-- V13: string (nullable = true)
 |-- V14: string (nullable = true)
 |-- V15: string (nullable = true)
 |-- V16: string (nullable = true)
 |-- V17: string (nullable = true)
 |-- V18: string (nullable = true)
 |-- V19: string (nullable = true)
 |-- V20: string (nullable = true)
 |-- V21: string (nullable = true)
 |-- V22: string (nullable = true)
 |-- V23: string (nullable = true)
 |-- V24: string (nullable = true)
 |-- V25: string (nullable = true)
 |-- V26: string (nullable = true)
 |-- V27: string (nullable = true)
 |-- V28: string (nulla

In [14]:
df.head(2)


[Row(Time='0', V1='-1.3598071336738', V2='-0.0727811733098497', V3='2.53634673796914', V4='1.37815522427443', V5='-0.338320769942518', V6='0.462387777762292', V7='0.239598554061257', V8='0.0986979012610507', V9='0.363786969611213', V10='0.0907941719789316', V11='-0.551599533260813', V12='-0.617800855762348', V13='-0.991389847235408', V14='-0.311169353699879', V15='1.46817697209427', V16='-0.470400525259478', V17='0.207971241929242', V18='0.0257905801985591', V19='0.403992960255733', V20='0.251412098239705', V21='-0.018306777944153', V22='0.277837575558899', V23='-0.110473910188767', V24='0.0669280749146731', V25='0.128539358273528', V26='-0.189114843888824', V27='0.133558376740387', V28='-0.0210530534538215', Amount='149.62', Class='0'),
 Row(Time='0', V1='1.19185711131486', V2='0.26615071205963', V3='0.16648011335321', V4='0.448154078460911', V5='0.0600176492822243', V6='-0.0823608088155687', V7='-0.0788029833323113', V8='0.0851016549148104', V9='-0.255425128109186', V10='-0.166974414

In [15]:
df.count()


284807

**Statistical summary on numerical columns can be computed by describe():**

In [16]:
df.describe()


summary,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807,284807.0,284807.0
mean,94813.85957508069,2.235360406313924...,6.865749819392767...,-5.82471054445228...,2.011824365682531...,3.704311530463074...,1.140033807220101...,-1.14961392324716...,-2.9538691083434E-16,-2.08207854988096...,2.145945990061367...,-1.27734880360795...,-3.30833340134460...,1.034652530922445...,1.708454024825641...,5.933285192758958...,1.394705224939437...,-2.87403480811790...,1.749967860942900...,9.38053027649593E-16,3.81607955077877E-16,2.850084518050253...,-7.66409282164773...,2.9538691083434E-16,4.454953538333312...,1.034652530922445...,1.808246900107513...,-3.52368642620288...,-1.13364706320206...,88.3496192509521,0.00172748563062
stddev,47488.14595456632,1.9586958038574904,1.6513085794769997,1.5162550051777732,1.415868574940927,1.380246734031437,1.3322710897575714,1.2370935981826632,1.1943529026692048,1.0986320892243222,1.0888497654025215,1.0207130277115581,0.9992013895301415,0.9952742301251558,0.9585956112570617,0.9153160116104389,0.8762528873883704,0.84933706367439,0.8381762095288453,0.8140405007685797,0.7709250248871159,0.7345240143713125,0.7257015604409107,0.6244602955949898,0.6056470678271603,0.5212780705409427,0.48222701326105666,0.4036324949650313,0.33008326416025036,250.1201092401885,0.0415271896354649
min,0.0,-0.00012931370800...,-0.00010296722561...,-0.00010859127517...,-0.00011921826106...,-0.00010366562678...,-0.00010234903761...,-0.00010533581684...,-0.00010065655617...,-0.00010181309940...,-0.00010546343208...,-0.00010815155961...,-0.00010708756424...,-0.0001012565441717,-0.00011026584350728,-0.00010544999800...,-0.00010218720356...,-0.00010491602545428,-0.00010486089568...,-0.00010590824777...,-0.00010321507601...,-0.00010275802070302,-0.00011087523222...,-0.00010305543062...,-0.00010489042033...,-0.00010530735344...,-0.00011466656096...,-0.00010053247847...,-0.00010022573314...,0.0,0.0
max,99999.0,7.55406974741191e-05,9.99769856171626,9.67444968403876e-05,9.92501936512661,9.99846034625664e-05,9.91116576052911,9.97044721041161,9.90825458583455,9.96754672601408e-05,9.990175905094,9.93981974172569,9.98580333033583e-05,9.42995346153325e-05,9.42849461347088e-05,8.92751304924777e-05,9.62125212195424e-05,9.99380070915989e-05,9.72652650759061e-06,9.83848235865115e-06,9.94641486230186,9.97908705775956,9.8395225212225e-05,9.97423880847133,9.98525037704461e-05,9.7739127160934e-05,9.53790653973052e-05,9.94394238269967e-05,9.9271564256982e-06,999.9,1.0


In [17]:
df.dropna().count()

284807

## Map RDD to DataFrame using Row
Read <a href=" https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html"> California housing dataset </a> into an RDD then map it to the spark DataFrame. 

In [18]:
import pyspark as spark
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

#For pandas like visualization - no need to use df.show()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) 

sc =  spark.sparkContext

In [19]:
rdd = sc.textFile('cal_housing.data')
rdd = rdd.map(lambda line:line.split(",")) # split the lines
#rdd.collect() # to print all lines
rdd.take(2)    # to print only 2 first lines

[['-122.230000',
  '37.880000',
  '41.000000',
  '880.000000',
  '129.000000',
  '322.000000',
  '126.000000',
  '8.325200',
  '452600.000000'],
 ['-122.220000',
  '37.860000',
  '21.000000',
  '7099.000000',
  '1106.000000',
  '2401.000000',
  '1138.000000',
  '8.301400',
  '358500.000000']]

In [20]:
from pyspark.sql import Row
df =rdd.map(lambda line: Row(longitude=line[0],
                            lattitude=line[1],
                            housingMedAge=line[2],
                            totalRooms=line[3],
                            totalBedR=line[4],
                            population=line[5],
                            households=line[6],
                            medianIncome=line[7],
                            medianHouseValue=line[8])).toDF()

In [21]:
df.columns


['longitude',
 'lattitude',
 'housingMedAge',
 'totalRooms',
 'totalBedR',
 'population',
 'households',
 'medianIncome',
 'medianHouseValue']

In [22]:
df.describe()

summary,longitude,lattitude,housingMedAge,totalRooms,totalBedR,population,households,medianIncome,medianHouseValue
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-119.5697044573641,35.631861434108586,28.639486434108527,2635.7630813953488,537.8980135658915,1425.4767441860463,499.5396802325581,3.870671002906978,206855.81690891477
stddev,2.003531723502633,2.1359523974571237,12.585557612111613,2181.6152515827994,421.2479059431315,1132.4621217653385,382.32975283161136,1.8998217179452637,115395.6158744138
min,-114.31,32.54,1.0,100.0,1.0,100.0,1.0,0.4999,100000.0
max,-124.35,41.95,9.0,9998.0,999.0,999.0,999.0,9.9071,99900.0


In [23]:

df

longitude,lattitude,housingMedAge,totalRooms,totalBedR,population,households,medianIncome,medianHouseValue
-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0
-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0
-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0
-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0
-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0
-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0


## Access a column, adding a new column
Remember that operations in spark DataFrame are **lazy** in nature. However, in pandas we get the result as soon as we apply any operation.

In [24]:
df['households']
df[6]
df.households

Column<b'households'>

**Add a new column to the df:**

In [25]:
df.withColumn('housePop', df.households/df.population)

longitude,lattitude,housingMedAge,totalRooms,totalBedR,population,households,medianIncome,medianHouseValue,housePop
-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0.391304347826087
-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0.4739691795085381
-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0.3568548387096774
-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0.3924731182795699
-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0.4584070796460177
-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,0.4673123486682808
-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,0.469835466179159
-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,0.5592048401037165
-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,0.4933665008291873
-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,0.460348162475822


In [26]:
df.select('households','population')

households,population
126.0,322.0
1138.0,2401.0
177.0,496.0
219.0,558.0
259.0,565.0
193.0,413.0
514.0,1094.0
647.0,1157.0
595.0,1206.0
714.0,1551.0
