In [3]:
filepath = '../data/movieData.csv'

## What We're Used To

Dropping columns of data in `pandas` is a pretty trivial task.

In [7]:
import pandas as pd

df = pd.read_csv(filepath)
df.head()

Unnamed: 0,Rank,WeeklyGross,PctChangeWkGross,Theaters,DeltaTheaters,AvgRev,GrossToDate,Week,Thursday,name,year,Winner
0,17.0,967378,,14.0,,69098.0,967378,1,1990-11-18,dances with wolves,1990,True
1,9.0,3871641,300.0,14.0,,276546.0,4839019,2,1990-11-25,dances with wolves,1990,True
2,3.0,12547813,224.0,1048.0,1034.0,11973.0,17386832,3,1990-12-02,dances with wolves,1990,True
3,4.0,9246632,-26.3,1053.0,5.0,8781.0,26633464,4,1990-12-09,dances with wolves,1990,True
4,4.0,7272350,-21.4,1051.0,-2.0,6919.0,33905814,5,1990-12-16,dances with wolves,1990,True


We can either specify which columns we want to drop.

In [10]:
df.drop(['Rank', 'WeeklyGross'], axis=1).head()

Unnamed: 0,PctChangeWkGross,Theaters,DeltaTheaters,AvgRev,GrossToDate,Week,Thursday,name,year,Winner
0,,14.0,,69098.0,967378,1,1990-11-18,dances with wolves,1990,True
1,300.0,14.0,,276546.0,4839019,2,1990-11-25,dances with wolves,1990,True
2,224.0,1048.0,1034.0,11973.0,17386832,3,1990-12-02,dances with wolves,1990,True
3,-26.3,1053.0,5.0,8781.0,26633464,4,1990-12-09,dances with wolves,1990,True
4,-21.4,1051.0,-2.0,6919.0,33905814,5,1990-12-16,dances with wolves,1990,True


Or write some condition to filter on and pipe it into the `DataFrame` selector

In [38]:
wolfCount = df.apply(lambda x: x.map(str)
                                .str.contains('wolves')).sum()
colsWithWolves = wolfCount[wolfCount != 0].index

In [40]:
df[colsWithWolves].head()

Unnamed: 0,name
0,dances with wolves
1,dances with wolves
2,dances with wolves
3,dances with wolves
4,dances with wolves


Ez pz

## Now Spark

Similarly, if we want to read this in as a `Spark DataFrame`, we'd do the following.

In [None]:
import findspark
findspark.init()

import pyspark
sc = pyspark.SparkContext()

spark = pyspark.sql.SparkSession(sc)

In [46]:
df = spark.read.csv(filepath, header=True)

df.show(5)

+----+-----------+----------------+--------+-------------+--------+-----------+----+----------+------------------+----+------+
|Rank|WeeklyGross|PctChangeWkGross|Theaters|DeltaTheaters|  AvgRev|GrossToDate|Week|  Thursday|              name|year|Winner|
+----+-----------+----------------+--------+-------------+--------+-----------+----+----------+------------------+----+------+
|17.0|     967378|            null|    14.0|         null| 69098.0|     967378|   1|1990-11-18|dances with wolves|1990|  True|
| 9.0|    3871641|           300.0|    14.0|         null|276546.0|    4839019|   2|1990-11-25|dances with wolves|1990|  True|
| 3.0|   12547813|           224.0|  1048.0|       1034.0| 11973.0|   17386832|   3|1990-12-02|dances with wolves|1990|  True|
| 4.0|    9246632|           -26.3|  1053.0|          5.0|  8781.0|   26633464|   4|1990-12-09|dances with wolves|1990|  True|
| 4.0|    7272350|           -21.4|  1051.0|         -2.0|  6919.0|   33905814|   5|1990-12-16|dances with wolv

But trying to drop columns is a little involved.

In [47]:
from pyspark.sql import functions as F

In [51]:
counts = df.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in df.columns])

In [53]:
counts.show()

+----+-----------+----------------+--------+-------------+------+-----------+----+--------+----+----+------+
|Rank|WeeklyGross|PctChangeWkGross|Theaters|DeltaTheaters|AvgRev|GrossToDate|Week|Thursday|name|year|Winner|
+----+-----------+----------------+--------+-------------+------+-----------+----+--------+----+----+------+
|   9|          0|             220|       9|          456|     9|          0|   0|       0|   0|   0|     0|
+----+-----------+----------------+--------+-------------+------+-----------+----+--------+----+----+------+



In [70]:
df.withColumn('name', df['name'].contains('wolves')).show()

+----+-----------+----------------+--------+-------------+--------+-----------+----+----------+----+----+------+
|Rank|WeeklyGross|PctChangeWkGross|Theaters|DeltaTheaters|  AvgRev|GrossToDate|Week|  Thursday|name|year|Winner|
+----+-----------+----------------+--------+-------------+--------+-----------+----+----------+----+----+------+
|17.0|     967378|            null|    14.0|         null| 69098.0|     967378|   1|1990-11-18|true|1990|  True|
| 9.0|    3871641|           300.0|    14.0|         null|276546.0|    4839019|   2|1990-11-25|true|1990|  True|
| 3.0|   12547813|           224.0|  1048.0|       1034.0| 11973.0|   17386832|   3|1990-12-02|true|1990|  True|
| 4.0|    9246632|           -26.3|  1053.0|          5.0|  8781.0|   26633464|   4|1990-12-09|true|1990|  True|
| 4.0|    7272350|           -21.4|  1051.0|         -2.0|  6919.0|   33905814|   5|1990-12-16|true|1990|  True|
| 4.0|    6456867|           -11.2|  1196.0|        145.0|  5399.0|   40362681|   6|1990-12-23|t

In [77]:
from pyspark.sql.functions import isnan, when, count, col

counts = df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns])

In [104]:
df.select(count(col('name').contains('wolves')).alias('name')).show()

+----+
|name|
+----+
|true|
|true|
|true|
|true|
|true|
|true|
|true|
|true|
|true|
|true|
|true|
|true|
|true|
|true|
|true|
|true|
|true|
|true|
|true|
|true|
+----+
only showing top 20 rows

