In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyspark
import multiprocessing

In [2]:
numbers = [1,2,3,4,5]

In [3]:
def quadratic_mean(num_list):
    num_list = np.array(num_list)
    rms = np.sqrt(np.mean(num_list**2))
    print(rms)

In [4]:
quadratic_mean(numbers)

3.3166247903554


In [5]:
np.random.seed(13)

pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})

### 1. Spark Dataframe Basics

- Use the starter code above to create a pandas dataframe.

- Convert the pandas dataframe to a spark dataframe. From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.

In [6]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

- Show the first 3 rows of the dataframe.

In [7]:
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



- Show the first 7 rows of the dataframe.

In [8]:
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



- View a summary of the data using .describe.

In [9]:
df.describe().show()

+-------+-------------------+-----+
|summary|                  n|group|
+-------+-------------------+-----+
|  count|                 20|   20|
|   mean|0.36640264498852165| null|
| stddev| 0.8905322898155364| null|
|    min| -1.261605945319069|    x|
|    max| 2.1503829673811126|    z|
+-------+-------------------+-----+



- Use .select to create a new dataframe with just the n and abool columns. View the first 5 rows of this dataframe.

In [10]:
new_df = df['n','abool']
new_df.show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



- Use .select to create a new dataframe with just the group and abool columns. View the first 5 rows of this dataframe.

In [11]:
new_df2 = df.select('group','abool')
new_df2.show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



- Use .select to create a new dataframe with the group column and the abool column renamed to a_boolean_value. Show the first 3 rows of this dataframe.

In [12]:
rename = new_df2.select('group',new_df2.abool.alias('a_boolean_value'))
rename.show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



- Use .select to create a new dataframe with the group column and the n column renamed to a_numeric_value. Show the first 6 rows of this dataframe.

In [13]:
df.select('group',df.n.alias('a_numeric_value')).show(5)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
+-----+--------------------+
only showing top 5 rows



### 2. Column Manipulation

- Use the starter code above to re-create a spark dataframe. Store the spark dataframe in a varaible named df

- Use .select to add 4 to the n column. Show the results.

In [14]:
df.select(df.n + 4).show()

+------------------+
|           (n + 4)|
+------------------+
|3.2876093379494122|
| 4.753766378659703|
|3.9554969216619464|
|  4.45181233874579|
|5.3451017084510095|
| 4.532337888294546|
| 5.350187899722527|
|  4.86121137416932|
| 5.478685737435897|
| 2.954622869461466|
|3.2110109750484512|
| 2.738394054680931|
| 4.562846785281032|
|3.7566737481144377|
| 4.913740704859677|
| 4.317350922736336|
| 4.127303280206981|
| 6.150382967381113|
| 4.606288656896298|
|3.9732283500135592|
+------------------+



- Subtract 5 from the n column and view the results.

In [15]:
df.select(df.n - 5).show()

+-------------------+
|            (n - 5)|
+-------------------+
| -5.712390662050588|
| -4.246233621340297|
| -5.044503078338053|
|  -4.54818766125421|
|-3.6548982915489905|
| -4.467662111705454|
|-3.6498121002774733|
|  -4.13878862583068|
| -3.521314262564103|
| -6.045377130538534|
| -5.788989024951549|
| -6.261605945319069|
| -4.437153214718968|
| -5.243326251885563|
| -4.086259295140323|
| -4.682649077263664|
| -4.872696719793019|
|-2.8496170326188874|
| -4.393711343103702|
| -5.026771649986441|
+-------------------+



- Multiply the n column by 2. View the results along with the original numbers.

In [16]:
df.select('n',df.n * 2).show()

+--------------------+--------------------+
|                   n|             (n * 2)|
+--------------------+--------------------+
|  -0.712390662050588|  -1.424781324101176|
|   0.753766378659703|   1.507532757319406|
|-0.04450307833805...|-0.08900615667610691|
| 0.45181233874578974|  0.9036246774915795|
|  1.3451017084510097|  2.6902034169020195|
|  0.5323378882945463|  1.0646757765890926|
|  1.3501878997225267|  2.7003757994450535|
|  0.8612113741693206|  1.7224227483386412|
|  1.4786857374358966|   2.957371474871793|
| -1.0453771305385342| -2.0907542610770684|
| -0.7889890249515489| -1.5779780499030978|
|  -1.261605945319069|  -2.523211890638138|
|  0.5628467852810314|  1.1256935705620628|
|-0.24332625188556253|-0.48665250377112507|
|  0.9137407048596775|   1.827481409719355|
| 0.31735092273633597|  0.6347018454726719|
| 0.12730328020698067| 0.25460656041396135|
|  2.1503829673811126|   4.300765934762225|
|  0.6062886568962988|  1.2125773137925977|
|-0.02677164998644...|-0.0535432

- Add a new column named n2 that is the n value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original n value as well as n2.

In [17]:
df.select('n','group','abool',(df.n*2).alias('n2')).show(4)

+--------------------+-----+-----+--------------------+
|                   n|group|abool|                  n2|
+--------------------+-----+-----+--------------------+
|  -0.712390662050588|    z|false|  -1.424781324101176|
|   0.753766378659703|    x|false|   1.507532757319406|
|-0.04450307833805...|    z|false|-0.08900615667610691|
| 0.45181233874578974|    y|false|  0.9036246774915795|
+--------------------+-----+-----+--------------------+
only showing top 4 rows



- Add a new column named n3 that is the n value squared. Show the first 5 rows of your dataframe. You should see both n, n2, and n3.

In [18]:
df.select('n','group','abool',(df.n*2).alias('n2'),(df.n**2).alias('n3')).show(5)

+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|  -0.712390662050588|    z|false|  -1.424781324101176|   0.507500455376875|
|   0.753766378659703|    x|false|   1.507532757319406|  0.5681637535977627|
|-0.04450307833805...|    z|false|-0.08900615667610691|0.001980523981562...|
| 0.45181233874578974|    y|false|  0.9036246774915795| 0.20413438944294027|
|  1.3451017084510097|    z|false|  2.6902034169020195|  1.8092986060778251|
+--------------------+-----+-----+--------------------+--------------------+
only showing top 5 rows



- What happens when you run the code below?

    df.group + df.abool

- What happens when you run the code below? What is the difference between this and the previous code sample?
df.select(df.group + df.abool)

- Try adding various other columns together. What are the results of combining the different data types?

### 3. Type casting

- Use the starter code above to re-create a spark dataframe.

- Use .printSchema to view the datatypes in your dataframe.

In [19]:
df.printSchema()

root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)



- Use .dtypes to view the datatypes in your dataframe.

In [21]:
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

- What is the difference between the two code samples below?

    df.abool.cast('int')
    df.select(df.abool.cast('int')).show()
    

In [22]:
df.abool.cast('int')

Column<'CAST(abool AS INT)'>

In [23]:
df.select(df.abool.cast('int')).show()

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    0|
|    1|
|    0|
|    1|
+-----+



In [28]:
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



- Use .select and .cast to convert the abool column to an integer type. View the results.

In [30]:
from pyspark.sql.functions import asc, desc, col

In [33]:
df.select('*',df.abool.cast('int').alias('bool_int')).show()

+--------------------+-----+-----+--------+
|                   n|group|abool|bool_int|
+--------------------+-----+-----+--------+
|  -0.712390662050588|    z|false|       0|
|   0.753766378659703|    x|false|       0|
|-0.04450307833805...|    z|false|       0|
| 0.45181233874578974|    y|false|       0|
|  1.3451017084510097|    z|false|       0|
|  0.5323378882945463|    y|false|       0|
|  1.3501878997225267|    z|false|       0|
|  0.8612113741693206|    x|false|       0|
|  1.4786857374358966|    z| true|       1|
| -1.0453771305385342|    y| true|       1|
| -0.7889890249515489|    x|false|       0|
|  -1.261605945319069|    y|false|       0|
|  0.5628467852810314|    y| true|       1|
|-0.24332625188556253|    y| true|       1|
|  0.9137407048596775|    y|false|       0|
| 0.31735092273633597|    x|false|       0|
| 0.12730328020698067|    z|false|       0|
|  2.1503829673811126|    y| true|       1|
|  0.6062886568962988|    x|false|       0|
|-0.02677164998644...|    x| tru

- Convert the group column to a integer data type and view the results. What happens?

In [34]:
#string to null
df.select('*',df.abool.cast('int').alias('bool_int'),df.group.cast('int')).show()

+--------------------+-----+-----+--------+-----+
|                   n|group|abool|bool_int|group|
+--------------------+-----+-----+--------+-----+
|  -0.712390662050588|    z|false|       0| null|
|   0.753766378659703|    x|false|       0| null|
|-0.04450307833805...|    z|false|       0| null|
| 0.45181233874578974|    y|false|       0| null|
|  1.3451017084510097|    z|false|       0| null|
|  0.5323378882945463|    y|false|       0| null|
|  1.3501878997225267|    z|false|       0| null|
|  0.8612113741693206|    x|false|       0| null|
|  1.4786857374358966|    z| true|       1| null|
| -1.0453771305385342|    y| true|       1| null|
| -0.7889890249515489|    x|false|       0| null|
|  -1.261605945319069|    y|false|       0| null|
|  0.5628467852810314|    y| true|       1| null|
|-0.24332625188556253|    y| true|       1| null|
|  0.9137407048596775|    y|false|       0| null|
| 0.31735092273633597|    x|false|       0| null|
| 0.12730328020698067|    z|false|       0| null|


- Convert the n column to a integer data type and view the results. What happens?

In [35]:
#rounds to nearest into
df.select('*',df.abool.cast('int').alias('bool_int'),df.n.cast('int')).show()

+--------------------+-----+-----+--------+---+
|                   n|group|abool|bool_int|  n|
+--------------------+-----+-----+--------+---+
|  -0.712390662050588|    z|false|       0|  0|
|   0.753766378659703|    x|false|       0|  0|
|-0.04450307833805...|    z|false|       0|  0|
| 0.45181233874578974|    y|false|       0|  0|
|  1.3451017084510097|    z|false|       0|  1|
|  0.5323378882945463|    y|false|       0|  0|
|  1.3501878997225267|    z|false|       0|  1|
|  0.8612113741693206|    x|false|       0|  0|
|  1.4786857374358966|    z| true|       1|  1|
| -1.0453771305385342|    y| true|       1| -1|
| -0.7889890249515489|    x|false|       0|  0|
|  -1.261605945319069|    y|false|       0| -1|
|  0.5628467852810314|    y| true|       1|  0|
|-0.24332625188556253|    y| true|       1|  0|
|  0.9137407048596775|    y|false|       0|  0|
| 0.31735092273633597|    x|false|       0|  0|
| 0.12730328020698067|    z|false|       0|  0|
|  2.1503829673811126|    y| true|      

- Convert the abool column to a string data type and view the results. What happens?

In [39]:
df.select('*',df.abool.cast('string').alias('string_abool'),df.n.cast('int')).dtypes

[('n', 'double'),
 ('group', 'string'),
 ('abool', 'boolean'),
 ('string_abool', 'string'),
 ('n', 'int')]

In [40]:
df.select('*',df.abool.cast('string').alias('string_abool'),df.n.cast('int')).show(5)

+--------------------+-----+-----+------------+---+
|                   n|group|abool|string_abool|  n|
+--------------------+-----+-----+------------+---+
|  -0.712390662050588|    z|false|       false|  0|
|   0.753766378659703|    x|false|       false|  0|
|-0.04450307833805...|    z|false|       false|  0|
| 0.45181233874578974|    y|false|       false|  0|
|  1.3451017084510097|    z|false|       false|  1|
+--------------------+-----+-----+------------+---+
only showing top 5 rows



### 4. Built-in Functions

- Use the starter code above to re-create a spark dataframe.
- Import the necessary functions from pyspark.sql.functions
- Find the highest n value.
- Find the lowest n value.
- Find the average n value.
- Use concat to change the group column to say, e.g. "Group: x" or "Group: y"
- Use concat to combine the n and group columns to produce results that look like this: "x: -1.432" or "z: 2.352"

### 5. When / Otherwise

- Use the starter code above to re-create a spark dataframe.
- Use when and .otherwise to create a column that contains the text "It is true" when abool is true and "It is false"" when abool is false.
- Create a column that contains 0 if n is less than 0, otherwise, the original n value.

### 6. Filter / Where

- Use the starter code above to re-create a spark dataframe.
- Use .filter or .where to select just the rows where the group is y and view the results.
- Select just the columns where the abool column is false and view the results.
- Find the columns where the group column is not y.
- Find the columns where n is positive.
- Find the columns where abool is true and the group column is z.
- Find the columns where abool is true or the group column is z.
- Find the columns where abool is false and n is less than 1
- Find the columns where abool is false or n is less than 1

### 7. Sorting

- Use the starter code above to re-create a spark dataframe.
- Sort by the n value.
- Sort by the group value, both ascending and descending.
- Sort by the group value first, then, within each group, sort by n value.
- Sort by abool, group, and n. Does it matter in what order you specify the columns when sorting?