In [0]:
from pyspark.sql import SparkSession
# Get (or create) a Spark session that runs locally on a single thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows, in the same field order as `columns` below.
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]
columns = ["firstname", "lastname", "country", "state"]

# Build the DataFrame, passing the list of column names as the schema.
df = spark.createDataFrame(data=data, schema=columns)
df.show()

# collect() returns every row as a list of Row objects.
print(df.collect())

from collections import OrderedDict

# Approach 1: go through the DataFrame's RDD and read each Row by position;
# index 3 is the "state" field (fourth entry in `columns`).
states1 = df.rdd.map(lambda row: row[3]).collect()
print(states1)
# ['CA', 'NY', 'CA', 'FL']

# Keys of an OrderedDict are unique and keep insertion order, so this drops
# repeated states while preserving the order in which they first appeared.
res = list(OrderedDict.fromkeys(states1))
print(res)
# ['CA', 'NY', 'FL']


# Example 2: further ways to pull column values out of the DataFrame.

# Through the RDD again, but reading the Row field by name rather than index.
states2 = df.rdd.map(lambda row: row.state).collect()
print(states2)
# ['CA', 'NY', 'CA', 'FL']

# select() + collect() keeps each value wrapped in a Row object.
states3 = df.select(df.state).collect()
print(states3)
# [Row(state='CA'), Row(state='NY'), Row(state='CA'), Row(state='FL')]

# flatMap() over the single-column rows yields the bare values instead.
states4 = df.select(df.state).rdd.flatMap(lambda row: row).collect()
print(states4)
# ['CA', 'NY', 'CA', 'FL']

# Convert the selected column to pandas; ['state'] picks out that column,
# and list() turns it into a plain Python list.
states5 = df.select(df.state).toPandas()['state']
states6 = list(states5)
print(states6)
# ['CA', 'NY', 'CA', 'FL']

# Several columns at once: convert to pandas, then list each column.
pandDF = df.select(df.state, df.firstname).toPandas()
print(list(pandDF['state']))
print(list(pandDF['firstname']))

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+

[Row(firstname='James', lastname='Smith', country='USA', state='CA'), Row(firstname='Michael', lastname='Rose', country='USA', state='NY'), Row(firstname='Robert', lastname='Williams', country='USA', state='CA'), Row(firstname='Maria', lastname='Jones', country='USA', state='FL')]
['CA', 'NY', 'CA', 'FL']
['CA', 'NY', 'FL']
['CA', 'NY', 'CA', 'FL']
[Row(state='CA'), Row(state='NY'), Row(state='CA'), Row(state='FL')]
['CA', 'NY', 'CA', 'FL']
['CA', 'NY', 'CA', 'FL']
['CA', 'NY', 'CA', 'FL']
['James', 'Michael', 'Robert', 'Maria']


In [0]:
#states1=df.rdd.map(lambda x: x[3]).collect()            #map over the DataFrame's RDD to take field 3 (state) of each Row, then collect into a list
#res = list(OrderedDict.fromkeys(states1))              #remove duplicates (keeping first-seen order) and convert back to a list
#states3=df.select(df.state).collect()                      #select only the state column, then collect it as a list of Row objects
#res = list(OrderedDict.fromkeys(states1))             #remove duplicates (keeping first-seen order) and convert back to a list
#states3=df.select(df.state).collect()                      #select only the state column, then collect it as a list of Row objects
#states5=df.select(df.state).toPandas()['state']      #convert the Spark DataFrame to a pandas DataFrame; ['state'] returns a pandas Series


In [0]:
#The code begins by importing the necessary libraries and creating a SparkSession with the specified configuration.

#The data list contains tuples representing rows of data, and the columns list defines the column names.

#The DataFrame df is created using spark.createDataFrame(data=data, schema=columns).

#The show() method is called on the DataFrame to display its contents.

#The collect() method is called on the DataFrame to retrieve all the rows as a list of Row objects, and it is printed.

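#The first approach to extract the "state" column uses RDD operations. The map() transformation is applied to the DataFrame's underlying RDD to pull the "state" value out of each row by position (x[3]), and the collect() action retrieves the values as a list, which is printed. Duplicates are then removed, preserving first-seen order, with OrderedDict.fromkeys(), and the de-duplicated list is printed.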

#The second approach is similar to the first one and still maps over the RDD, but instead of accessing the field by position (x[3]), it accesses each row's "state" field by name using dot notation (x.state). The resulting list is printed.

#The third approach uses the select() method to select the "state" column from the DataFrame. The collect() action retrieves the rows as a list of Row objects, and the resulting list is printed.

#The fourth approach is similar to the third one, but it converts the selected column to an RDD and applies the flatMap() transformation, which unpacks each single-field Row into its bare value. The resulting plain list of states is printed.

#The fifth approach selects the "state" column and converts the result into a Pandas DataFrame using toPandas(). That Pandas DataFrame is then indexed using dictionary-like notation (['state']), which yields the column as a Pandas Series (states5).

#The sixth approach builds on the fifth by converting that Pandas Series to a Python list directly using list(states5); the resulting list (states6) is printed.

#The last part of the code demonstrates the extraction of multiple columns into a Pandas DataFrame. The selected columns are converted to a Pandas DataFrame using toPandas(), and then individual columns are accessed using dictionary-like notation (pandDF['state'], pandDF['firstname']). The values of each column are converted to lists and printed.

#Overall, the code showcases various methods to extract specific columns from a PySpark DataFrame and convert them into Python lists or Pandas DataFrame columns. These approaches provide flexibility in working with specific columns or subsets of data in different formats.
