In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [2]:
# Reuse the already-running SparkContext when this cell is executed again;
# constructing a second SparkContext in the same process raises an error.
sc = SparkContext.getOrCreate(conf=SparkConf())

In [3]:
# Create the session through the documented builder API. getOrCreate() attaches
# to the SparkContext that is already running (`sc`) and returns the existing
# session if this cell is re-run.
spark = SparkSession.builder.getOrCreate()

# Example datasets

In [4]:
# Load the mtcars CSV: the first line is the header, and column types are
# inferred from the data instead of defaulting to strings.
mtcars = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('data/mtcars.csv')
)

In [5]:
# The first column is named '_c0' after the CSV read (presumably because its
# header cell is blank -- verify against data/mtcars.csv); rename it to 'model'.
mtcars = mtcars.withColumnRenamed('_c0', 'model')

In [6]:
# Preview the first five rows.
mtcars.show(n=5)

+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|            model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|        Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|    Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|       Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|   Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|
|Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 5 rows



# Select Rows

## Select Rows by index

**Add index to each row.**

In [7]:
# Pair every Row with its 0-based position and peek at the first three pairs.
mtcars.rdd.zipWithIndex().take(3)

[(Row(model='Mazda RX4', mpg=21.0, cyl=6, disp=160.0, hp=110, drat=3.9, wt=2.62, qsec=16.46, vs=0, am=1, gear=4, carb=4),
  0),
 (Row(model='Mazda RX4 Wag', mpg=21.0, cyl=6, disp=160.0, hp=110, drat=3.9, wt=2.875, qsec=17.02, vs=0, am=1, gear=4, carb=4),
  1),
 (Row(model='Datsun 710', mpg=22.8, cyl=4, disp=108.0, hp=93, drat=3.85, wt=2.32, qsec=18.61, vs=1, am=1, gear=4, carb=1),
  2)]

Each element of the RDD is now a pair: the first item is a Row object and the second is the corresponding index value. Let's merge the index value into the Row object for each element.

In [10]:
from pyspark.sql import Row

In [11]:
# Adding two Rows concatenates them as plain tuples, so the index value is
# appended as the last field (field names are not carried over).
(
    mtcars.rdd
    .zipWithIndex()
    .map(lambda pair: pair[0] + Row(index=pair[1]))
    .take(5)
)

[('Mazda RX4', 21.0, 6, 160.0, 110, 3.9, 2.62, 16.46, 0, 1, 4, 4, 0),
 ('Mazda RX4 Wag', 21.0, 6, 160.0, 110, 3.9, 2.875, 17.02, 0, 1, 4, 4, 1),
 ('Datsun 710', 22.8, 4, 108.0, 93, 3.85, 2.32, 18.61, 1, 1, 4, 1, 2),
 ('Hornet 4 Drive', 21.4, 6, 258.0, 110, 3.08, 3.215, 19.44, 1, 0, 3, 1, 3),
 ('Hornet Sportabout', 18.7, 8, 360.0, 175, 3.15, 3.44, 17.02, 0, 0, 3, 2, 4)]

Now convert the RDD back to a DataFrame and assign a name to each column.

In [12]:
# Original column names followed by the name of the appended index field.
col_names = [*mtcars.columns, 'index']
col_names

['model',
 'mpg',
 'cyl',
 'disp',
 'hp',
 'drat',
 'wt',
 'qsec',
 'vs',
 'am',
 'gear',
 'carb',
 'index']

In [13]:
# Append the positional index to every record, then rebuild a DataFrame with
# explicit column names and display it.
(
    mtcars.rdd
    .zipWithIndex()
    .map(lambda pair: pair[0] + Row(index=pair[1]))
    .toDF(col_names)
    .show()
)

+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+-----+
|              model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|index|
+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+-----+
|          Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|    0|
|      Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|    1|
|         Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|    2|
|     Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|    3|
|  Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|    4|
|            Valiant|18.1|  6|225.0|105|2.76| 3.46|20.22|  1|  0|   3|   1|    5|
|         Duster 360|14.3|  8|360.0|245|3.21| 3.57|15.84|  0|  0|   3|   4|    6|
|          Merc 240D|24.4|  4|146.7| 62|3.69| 3.19| 20.0|  1|  0|   4|   2|    7|
|           Merc 230|22.8|  4|140.8| 95|3.92| 3.15| 22.9|  1|  0|   4|   2|    8|
|           Merc

Now we can filter the DataFrame by the **index** column.

In [14]:
# mtcars plus a trailing 'index' column holding each row's 0-based position.
mtcars_new = (
    mtcars.rdd
    .zipWithIndex()
    .map(lambda pair: pair[0] + Row(index=pair[1]))
    .toDF(col_names)
)

**Subset by specific rows**

In [15]:
# Keep only the rows whose index is one of the listed values.
mtcars_new.where(mtcars_new['index'].isin([1, 2, 3, 8])).show()

+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+-----+
|         model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|index|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+-----+
| Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|    1|
|    Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|    2|
|Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|    3|
|      Merc 230|22.8|  4|140.8| 95|3.92| 3.15| 22.9|  1|  0|   4|   2|    8|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+-----+



**Subset by a range**

In [16]:
# between() is inclusive on both ends: rows 2 through 8 are kept.
mtcars_new.where(mtcars_new['index'].between(2, 8)).show()

+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+-----+
|            model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|index|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+-----+
|       Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|    2|
|   Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|    3|
|Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|    4|
|          Valiant|18.1|  6|225.0|105|2.76| 3.46|20.22|  1|  0|   3|   1|    5|
|       Duster 360|14.3|  8|360.0|245|3.21| 3.57|15.84|  0|  0|   3|   4|    6|
|        Merc 240D|24.4|  4|146.7| 62|3.69| 3.19| 20.0|  1|  0|   4|   2|    7|
|         Merc 230|22.8|  4|140.8| 95|3.92| 3.15| 22.9|  1|  0|   4|   2|    8|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+-----+



**Subset by rows below or above an index**

In [17]:
# Rows located before index 4.
mtcars_new.where(mtcars_new['index'] < 4).show()

+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+-----+
|         model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|index|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+-----+
|     Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|    0|
| Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|    1|
|    Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|    2|
|Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|    3|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+-----+



In [18]:
# Rows located after index 26.
mtcars_new.where(mtcars_new['index'] > 26).show()

+--------------+----+---+-----+---+----+-----+----+---+---+----+----+-----+
|         model| mpg|cyl| disp| hp|drat|   wt|qsec| vs| am|gear|carb|index|
+--------------+----+---+-----+---+----+-----+----+---+---+----+----+-----+
|  Lotus Europa|30.4|  4| 95.1|113|3.77|1.513|16.9|  1|  1|   5|   2|   27|
|Ford Pantera L|15.8|  8|351.0|264|4.22| 3.17|14.5|  0|  1|   5|   4|   28|
|  Ferrari Dino|19.7|  6|145.0|175|3.62| 2.77|15.5|  0|  1|   5|   6|   29|
| Maserati Bora|15.0|  8|301.0|335|3.54| 3.57|14.6|  0|  1|   5|   8|   30|
|    Volvo 142E|21.4|  4|121.0|109|4.11| 2.78|18.6|  1|  1|   4|   2|   31|
+--------------+----+---+-----+---+----+-----+----+---+---+----+----+-----+



## Select Rows by logical criteria

**cyl = 4**

In [19]:
# Rows with exactly four cylinders.
mtcars_new.where(mtcars_new['cyl'] == 4).show()

+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+-----+
|         model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|index|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+-----+
|    Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|    2|
|     Merc 240D|24.4|  4|146.7| 62|3.69| 3.19| 20.0|  1|  0|   4|   2|    7|
|      Merc 230|22.8|  4|140.8| 95|3.92| 3.15| 22.9|  1|  0|   4|   2|    8|
|      Fiat 128|32.4|  4| 78.7| 66|4.08|  2.2|19.47|  1|  1|   4|   1|   17|
|   Honda Civic|30.4|  4| 75.7| 52|4.93|1.615|18.52|  1|  1|   4|   2|   18|
|Toyota Corolla|33.9|  4| 71.1| 65|4.22|1.835| 19.9|  1|  1|   4|   1|   19|
| Toyota Corona|21.5|  4|120.1| 97| 3.7|2.465|20.01|  1|  0|   3|   1|   20|
|     Fiat X1-9|27.3|  4| 79.0| 66|4.08|1.935| 18.9|  1|  1|   4|   1|   25|
| Porsche 914-2|26.0|  4|120.3| 91|4.43| 2.14| 16.7|  0|  1|   5|   2|   26|
|  Lotus Europa|30.4|  4| 95.1|113|3.77|1.513| 16.9|  1|  1|   5|   2|   27|

**vs = 1 and am = 1**

In [20]:
from pyspark.sql import functions as F

We add a new 'filter' column, built from the requested conditions, to our original DataFrame and then filter the DataFrame by that column. (The same Boolean condition could also be passed directly to `filter()`.)

**First, we need to create a list of *Column objects* that we can pass to the `select()` function.**

In [21]:
# Look each Column up by name with bracket indexing. Unlike
# eval('mtcars.' + name), this also works for column names that are not valid
# Python identifiers or that collide with DataFrame attributes, and it never
# executes text taken from the data file's header.
original_columns_all = [mtcars[name] for name in mtcars.columns]
original_columns_all

[Column<b'model'>,
 Column<b'mpg'>,
 Column<b'cyl'>,
 Column<b'disp'>,
 Column<b'hp'>,
 Column<b'drat'>,
 Column<b'wt'>,
 Column<b'qsec'>,
 Column<b'vs'>,
 Column<b'am'>,
 Column<b'gear'>,
 Column<b'carb'>]

**Second, build the 'filter' column.**

<span style="color:red">Warning: when combining multiple conditions inside the **`when()`** function, wrap each condition in its own pair of parentheses, because Python's `&` and `|` operators bind more tightly than comparisons such as `==`.</span>

In [22]:
# 1 where both vs == 1 and am == 1; there is no otherwise() branch, so every
# other row gets null.
filter_column = F.when(
    (mtcars['vs'] == 1) & (mtcars['am'] == 1),
    1,
).alias('filter')

**The columns we will select include:**

In [23]:
# The base columns followed by the derived 'filter' column.
[*original_columns_all, filter_column]

[Column<b'model'>,
 Column<b'mpg'>,
 Column<b'cyl'>,
 Column<b'disp'>,
 Column<b'hp'>,
 Column<b'drat'>,
 Column<b'wt'>,
 Column<b'qsec'>,
 Column<b'vs'>,
 Column<b'am'>,
 Column<b'gear'>,
 Column<b'carb'>,
 Column<b'CASE WHEN ((vs = 1) AND (am = 1)) THEN 1 END AS `filter`'>]

Selecting these columns will give us:

In [24]:
# Project every original column plus the derived 'filter' flag, then display up
# to 40 rows.
mtcars_filter = mtcars.select(*original_columns_all, filter_column)
mtcars_filter.show(n=40)

+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
|              model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|filter|
+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
|          Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|  null|
|      Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|  null|
|         Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|     1|
|     Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|  null|
|  Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|  null|
|            Valiant|18.1|  6|225.0|105|2.76| 3.46|20.22|  1|  0|   3|   1|  null|
|         Duster 360|14.3|  8|360.0|245|3.21| 3.57|15.84|  0|  0|   3|   4|  null|
|          Merc 240D|24.4|  4|146.7| 62|3.69| 3.19| 20.0|  1|  0|   4|   2|  null|
|           Merc 230|22.8|  4|140.8| 95|3.92| 3.15| 22.9|  1|  0|   4|   2|  null|
|   

Now we can select rows by **vs = 1 and am = 1**

In [25]:
# Bracket access is required here: attribute access (`mtcars_filter.filter`)
# would resolve to the DataFrame method rather than the column.
mtcars_filter.where(mtcars_filter['filter'] == 1).show(n=40)

+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
|         model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|filter|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
|    Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|     1|
|      Fiat 128|32.4|  4| 78.7| 66|4.08|  2.2|19.47|  1|  1|   4|   1|     1|
|   Honda Civic|30.4|  4| 75.7| 52|4.93|1.615|18.52|  1|  1|   4|   2|     1|
|Toyota Corolla|33.9|  4| 71.1| 65|4.22|1.835| 19.9|  1|  1|   4|   1|     1|
|     Fiat X1-9|27.3|  4| 79.0| 66|4.08|1.935| 18.9|  1|  1|   4|   1|     1|
|  Lotus Europa|30.4|  4| 95.1|113|3.77|1.513| 16.9|  1|  1|   5|   2|     1|
|    Volvo 142E|21.4|  4|121.0|109|4.11| 2.78| 18.6|  1|  1|   4|   2|     1|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+



And we can drop the 'filter' column we previously added.

In [26]:
# Keep the flagged rows, then discard the helper column before displaying.
(
    mtcars_filter
    .where(mtcars_filter['filter'] == 1)
    .drop('filter')
    .show(n=40)
)

+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|         model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|    Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|      Fiat 128|32.4|  4| 78.7| 66|4.08|  2.2|19.47|  1|  1|   4|   1|
|   Honda Civic|30.4|  4| 75.7| 52|4.93|1.615|18.52|  1|  1|   4|   2|
|Toyota Corolla|33.9|  4| 71.1| 65|4.22|1.835| 19.9|  1|  1|   4|   1|
|     Fiat X1-9|27.3|  4| 79.0| 66|4.08|1.935| 18.9|  1|  1|   4|   1|
|  Lotus Europa|30.4|  4| 95.1|113|3.77|1.513| 16.9|  1|  1|   5|   2|
|    Volvo 142E|21.4|  4|121.0|109|4.11| 2.78| 18.6|  1|  1|   4|   2|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+



Let's try a more complicated example: **cyl = 4 OR ( vs = 0 AND am = 0)**

In [27]:
# Display the base Column list again; it is reused for the second filter below.
original_columns_all

[Column<b'model'>,
 Column<b'mpg'>,
 Column<b'cyl'>,
 Column<b'disp'>,
 Column<b'hp'>,
 Column<b'drat'>,
 Column<b'wt'>,
 Column<b'qsec'>,
 Column<b'vs'>,
 Column<b'am'>,
 Column<b'gear'>,
 Column<b'carb'>]

Build filter column.

<span style="color:red">Warning: when combining multiple conditions inside the **`when()`** function, wrap each condition in its own pair of parentheses, because Python's `&` and `|` operators bind more tightly than comparisons such as `==`.</span>

In [28]:
# 1 where cyl == 4, or where both vs == 0 and am == 0; null for every other row.
filter_column_2 = F.when(
    (mtcars['cyl'] == 4) | ((mtcars['vs'] == 0) & (mtcars['am'] == 0)),
    1,
).alias('filter')
filter_column_2

Column<b'CASE WHEN ((cyl = 4) OR ((vs = 0) AND (am = 0))) THEN 1 END AS `filter`'>

In [29]:
# Project every original column plus the second 'filter' flag, then display up
# to 40 rows.
mtcars_filter_2 = mtcars.select(*original_columns_all, filter_column_2)
mtcars_filter_2.show(n=40)

+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
|              model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|filter|
+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
|          Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|  null|
|      Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|  null|
|         Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|     1|
|     Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|  null|
|  Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|     1|
|            Valiant|18.1|  6|225.0|105|2.76| 3.46|20.22|  1|  0|   3|   1|  null|
|         Duster 360|14.3|  8|360.0|245|3.21| 3.57|15.84|  0|  0|   3|   4|     1|
|          Merc 240D|24.4|  4|146.7| 62|3.69| 3.19| 20.0|  1|  0|   4|   2|     1|
|           Merc 230|22.8|  4|140.8| 95|3.92| 3.15| 22.9|  1|  0|   4|   2|     1|
|   

In [30]:
# Keep the flagged rows, then discard the helper column before displaying.
(
    mtcars_filter_2
    .where(mtcars_filter_2['filter'] == 1)
    .drop('filter')
    .show()
)

+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|              model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|         Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|  Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|
|         Duster 360|14.3|  8|360.0|245|3.21| 3.57|15.84|  0|  0|   3|   4|
|          Merc 240D|24.4|  4|146.7| 62|3.69| 3.19| 20.0|  1|  0|   4|   2|
|           Merc 230|22.8|  4|140.8| 95|3.92| 3.15| 22.9|  1|  0|   4|   2|
|         Merc 450SE|16.4|  8|275.8|180|3.07| 4.07| 17.4|  0|  0|   3|   3|
|         Merc 450SL|17.3|  8|275.8|180|3.07| 3.73| 17.6|  0|  0|   3|   3|
|        Merc 450SLC|15.2|  8|275.8|180|3.07| 3.78| 18.0|  0|  0|   3|   3|
| Cadillac Fleetwood|10.4|  8|472.0|205|2.93| 5.25|17.98|  0|  0|   3|   4|
|Lincoln Continental|10.4|  8|460.0|215| 3.0|5.424|17.82|  0|  0|   3|   4|
|  Chrysler 

# Select Columns

## Select columns by names

Selecting columns by names is very simple. Just use the `select()` function.

Example: select columns **['model', 'mpg', 'cyl', 'carb']**

In [31]:
# select() accepts column names directly, so there is no need to build Column
# objects through eval().
mtcars.select(['model', 'mpg', 'cyl', 'carb']).show()

+-------------------+----+---+----+
|              model| mpg|cyl|carb|
+-------------------+----+---+----+
|          Mazda RX4|21.0|  6|   4|
|      Mazda RX4 Wag|21.0|  6|   4|
|         Datsun 710|22.8|  4|   1|
|     Hornet 4 Drive|21.4|  6|   1|
|  Hornet Sportabout|18.7|  8|   2|
|            Valiant|18.1|  6|   1|
|         Duster 360|14.3|  8|   4|
|          Merc 240D|24.4|  4|   2|
|           Merc 230|22.8|  4|   2|
|           Merc 280|19.2|  6|   4|
|          Merc 280C|17.8|  6|   4|
|         Merc 450SE|16.4|  8|   3|
|         Merc 450SL|17.3|  8|   3|
|        Merc 450SLC|15.2|  8|   3|
| Cadillac Fleetwood|10.4|  8|   4|
|Lincoln Continental|10.4|  8|   4|
|  Chrysler Imperial|14.7|  8|   4|
|           Fiat 128|32.4|  4|   1|
|        Honda Civic|30.4|  4|   2|
|     Toyota Corolla|33.9|  4|   1|
+-------------------+----+---+----+
only showing top 20 rows



## Select columns by index

Example: select columns **[0, 1, 3, 5, 6, 7, 8]**

In [32]:
# Positions 0, 1 and 3, followed by 5 through 8 (range's upper bound is exclusive).
column_indices = [0, 1, 3, *range(5, 9)]
column_indices

[0, 1, 3, 5, 6, 7, 8]

In [33]:
# Translate the positional indices into column names.
[mtcars.columns[position] for position in column_indices]

['model', 'mpg', 'disp', 'drat', 'wt', 'qsec', 'vs']

In [34]:
# Map positions to column names and hand the names straight to select(); this
# avoids eval() and the shadowed loop variable of the nested comprehensions.
mtcars.select([mtcars.columns[position] for position in column_indices]).show()

+-------------------+----+-----+----+-----+-----+---+
|              model| mpg| disp|drat|   wt| qsec| vs|
+-------------------+----+-----+----+-----+-----+---+
|          Mazda RX4|21.0|160.0| 3.9| 2.62|16.46|  0|
|      Mazda RX4 Wag|21.0|160.0| 3.9|2.875|17.02|  0|
|         Datsun 710|22.8|108.0|3.85| 2.32|18.61|  1|
|     Hornet 4 Drive|21.4|258.0|3.08|3.215|19.44|  1|
|  Hornet Sportabout|18.7|360.0|3.15| 3.44|17.02|  0|
|            Valiant|18.1|225.0|2.76| 3.46|20.22|  1|
|         Duster 360|14.3|360.0|3.21| 3.57|15.84|  0|
|          Merc 240D|24.4|146.7|3.69| 3.19| 20.0|  1|
|           Merc 230|22.8|140.8|3.92| 3.15| 22.9|  1|
|           Merc 280|19.2|167.6|3.92| 3.44| 18.3|  1|
|          Merc 280C|17.8|167.6|3.92| 3.44| 18.9|  1|
|         Merc 450SE|16.4|275.8|3.07| 4.07| 17.4|  0|
|         Merc 450SL|17.3|275.8|3.07| 3.73| 17.6|  0|
|        Merc 450SLC|15.2|275.8|3.07| 3.78| 18.0|  0|
| Cadillac Fleetwood|10.4|472.0|2.93| 5.25|17.98|  0|
|Lincoln Continental|10.4|46

## Select columns by names that match a regex pattern

Example: columns whose names start with 'd'.

In [35]:
import re
# Keep the column names that match the pattern; re.match returns None when
# there is no match, which is falsy.
selected_column = [name for name in mtcars.columns if re.match('^d', name)]
selected_column

['disp', 'drat']

In [36]:
# Bracket indexing returns the same Column objects without going through eval().
[mtcars[name] for name in selected_column]

[Column<b'disp'>, Column<b'drat'>]

In [37]:
# select() accepts the list of matching column names directly; no eval() needed.
mtcars.select(selected_column).show()

+-----+----+
| disp|drat|
+-----+----+
|160.0| 3.9|
|160.0| 3.9|
|108.0|3.85|
|258.0|3.08|
|360.0|3.15|
|225.0|2.76|
|360.0|3.21|
|146.7|3.69|
|140.8|3.92|
|167.6|3.92|
|167.6|3.92|
|275.8|3.07|
|275.8|3.07|
|275.8|3.07|
|472.0|2.93|
|460.0| 3.0|
|440.0|3.23|
| 78.7|4.08|
| 75.7|4.93|
| 71.1|4.22|
+-----+----+
only showing top 20 rows

