In [6]:
# --- imports (kept together so a fresh kernel resolves everything up front) ---

# standard python libraries
import numpy as np
import pandas as pd

# pyspark: the package itself plus the session entry point
import pyspark
from pyspark.sql import SparkSession

# pyspark column functions.
# NOTE(review): this wildcard import pulls every public name from
# pyspark.sql.functions into the notebook namespace, which includes names that
# collide with Python builtins (e.g. sum, min, max). It is left in place
# because other cells may rely on the bare names.
from pyspark.sql.functions import *

# practice datasets
from pydataset import data

# --- spark session ---

# `spark` is the SparkSession; getOrCreate() reuses an already-running session
# if there is one, otherwise it starts a new one
spark = SparkSession.builder.getOrCreate()

## Loading data and writing to DataFrame

> ### Pandas

In [5]:
# load the 'mpg' practice dataset from pydataset; `df` is the pandas DataFrame
# used by the pandas examples in the following sections
df = data('mpg')

# preview the first 5 rows
df.head(5)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


> ### Spark

In [9]:
# Convert the pandas 'mpg' DataFrame into a Spark DataFrame.
#
# The session is obtained via SparkSession.builder.getOrCreate() rather than
# through the `spark` variable because this cell rebinds `spark` from the
# SparkSession to the resulting DataFrame (the Spark examples in this notebook
# use `spark` as the DataFrame). Calling `spark.createDataFrame(...)` here
# would therefore only work the first time the cell is run; on a re-run
# `spark` is already a DataFrame, which has no `createDataFrame` method.
# getOrCreate() returns the existing session, so this cell can be re-run safely.
session = SparkSession.builder.getOrCreate()
spark = session.createDataFrame(data('mpg'))

# preview the first 5 rows
spark.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



## Renaming columns

> ### Pandas

In [13]:
# display the current column labels as a plain Python list
df.columns.tolist()

['manufacturer',
 'model',
 'displ',
 'year',
 'cyl',
 'trans',
 'drv',
 'cty',
 'hwy',
 'fl',
 'class']

In [14]:
# replacement labels, given positionally: one entry per existing column,
# in the same left-to-right order as the current columns
renamed_columns = [
    'manni_factor',
    'moh_del',
    'dis_place',
    'yr',
    'cyl_ly',
    'trans_missme',
    'dr_vv',
    'city',
    'hiiiii_weigh',
    'fl?',
    'type',
]

# assigning to .columns relabels every column of df at once
df.columns = renamed_columns

# preview the first 5 rows to confirm the new labels took effect
df.head()

Unnamed: 0,manni_factor,moh_del,dis_place,yr,cyl_ly,trans_missme,dr_vv,city,hiiiii_weigh,fl?,type
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


> ### Spark

In [19]:
# replacement names, given positionally: one entry per existing column,
# in the same left-to-right order as the current columns
spark_column_names = (
    'manni_factor',
    'moh_del',
    'dis_place',
    'yr',
    'cyl_ly',
    'trans_missme',
    'dr_vv',
    'city',
    'hiiiii_weigh',
    'fl?',
    'type',
)

# toDF returns a new DataFrame carrying the given names; rebind `spark` to it
spark = spark.toDF(*spark_column_names)

# preview the first 5 rows to confirm the new names took effect
spark.show(n=5)

+------------+-------+---------+----+------+------------+-----+----+------------+---+-------+
|manni_factor|moh_del|dis_place|  yr|cyl_ly|trans_missme|dr_vv|city|hiiiii_weigh|fl?|   type|
+------------+-------+---------+----+------+------------+-----+----+------------+---+-------+
|        audi|     a4|      1.8|1999|     4|    auto(l5)|    f|  18|          29|  p|compact|
|        audi|     a4|      1.8|1999|     4|  manual(m5)|    f|  21|          29|  p|compact|
|        audi|     a4|      2.0|2008|     4|  manual(m6)|    f|  20|          31|  p|compact|
|        audi|     a4|      2.0|2008|     4|    auto(av)|    f|  21|          30|  p|compact|
|        audi|     a4|      2.8|1999|     6|    auto(l5)|    f|  16|          26|  p|compact|
+------------+-------+---------+----+------+------------+-----+----+------------+---+-------+
only showing top 5 rows



## Dropping columns

> ### Pandas

In [21]:
# remove the 'fl?' column (originally 'fl') from the pandas DataFrame.
# NOTE(review): the meaning of 'fl' is not stated in this notebook; it is
# presumably the fuel-type code of the mpg dataset -- confirm against the
# dataset documentation before treating the column as disposable.
df = df.drop(columns='fl?')

# preview the first 5 rows to confirm the column is gone
df.head(5)

Unnamed: 0,manni_factor,moh_del,dis_place,yr,cyl_ly,trans_missme,dr_vv,city,hiiiii_weigh,type
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,compact
...,...,...,...,...,...,...,...,...,...,...
230,volkswagen,passat,2.0,2008,4,auto(s6),f,19,28,midsize
231,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,midsize
232,volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,midsize
233,volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,midsize


> ### Spark

In [23]:
# remove the 'fl?' column from the Spark DataFrame, mirroring the pandas
# example; drop returns a new DataFrame, so `spark` is rebound to the result
dropped_column = 'fl?'
spark = spark.drop(dropped_column)

# preview the first 5 rows to confirm the column is gone
spark.show(n=5)

+------------+-------+---------+----+------+------------+-----+----+------------+-------+
|manni_factor|moh_del|dis_place|  yr|cyl_ly|trans_missme|dr_vv|city|hiiiii_weigh|   type|
+------------+-------+---------+----+------+------------+-----+----+------------+-------+
|        audi|     a4|      1.8|1999|     4|    auto(l5)|    f|  18|          29|compact|
|        audi|     a4|      1.8|1999|     4|  manual(m5)|    f|  21|          29|compact|
|        audi|     a4|      2.0|2008|     4|  manual(m6)|    f|  20|          31|compact|
|        audi|     a4|      2.0|2008|     4|    auto(av)|    f|  21|          30|compact|
|        audi|     a4|      2.8|1999|     6|    auto(l5)|    f|  16|          26|compact|
+------------+-------+---------+----+------+------------+-----+----+------------+-------+
only showing top 5 rows

