In [0]:
import pyspark
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start a new one
# under this application name.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample records: one (firstname, age) tuple per person.
simpleData = [
    ("James", 34),
    ("Ann", 34),
    ("Michael", 33),
    ("Scott", 53),
    ("Robert", 37),
    ("Chad", 27),
]

# Column names, listed in the same order as the tuple fields above.
columns = ["firstname", "age"]
df = spark.createDataFrame(simpleData, schema=columns)


# Print the DataFrame as a formatted text table.
df.show()

# take(n): action that returns the first n rows as a list of Row objects.
# Internally calls limit and collect.
print(df.take(2))

# tail(n): action that returns the last n rows as a list of Row objects.
# Running tail requires moving data into the application's driver process,
# and a very large n can crash the driver with an OutOfMemoryError.
print(df.tail(2))

# head(n): returns the first n rows as a list of Row objects when n is given;
# called with no argument, head() returns a single Row instead.
# Use it only when the result is expected to be small, because all returned
# rows are loaded into the driver's memory.
print(df.head(2))

# first(): returns the first row as a single Row object. This matches
# df.head() with no argument; df.head(1) would return a one-element list.
print(df.first())

# collect(): action that returns all the records as a list of Row objects.
print(df.collect())

# limit(n): limits the result count to n and returns a new DataFrame holding
# the first n rows; toPandas() then converts that result to a pandas DataFrame.
pandasDF = df.limit(3).toPandas()
print(pandasDF)


+---------+---+
|firstname|age|
+---------+---+
|    James| 34|
|      Ann| 34|
|  Michael| 33|
|    Scott| 53|
|   Robert| 37|
|     Chad| 27|
+---------+---+

[Row(firstname='James', age=34), Row(firstname='Ann', age=34)]
[Row(firstname='Robert', age=37), Row(firstname='Chad', age=27)]
[Row(firstname='James', age=34), Row(firstname='Ann', age=34)]
Row(firstname='James', age=34)
[Row(firstname='James', age=34), Row(firstname='Ann', age=34), Row(firstname='Michael', age=33), Row(firstname='Scott', age=53), Row(firstname='Robert', age=37), Row(firstname='Chad', age=27)]
  firstname  age
0     James   34
1       Ann   34
2   Michael   33


In [0]:
#take(n): Returns the first n rows of the DataFrame as a list of Rows. It internally calls the limit() and collect() methods.

#tail(n): Returns the last n rows of the DataFrame as a list of Rows. Retrieving the tail requires moving data into the driver's process, and retrieving a large number of rows can potentially cause memory issues.

#head(n): Returns the first n rows of the DataFrame as a list of Rows. When called without an argument, head() returns a single Row instead of a list.

#first(): Returns the first row of the DataFrame as a Row object.

#collect(): Returns all the rows of the DataFrame as a list of Rows. This method should be used cautiously as it loads all the data into the driver's memory, and it may cause memory issues if the DataFrame is large.

#limit(n): Limits the result count to the specified number n and returns a new DataFrame with the first n rows.

#toPandas(): Converts the DataFrame to a Pandas DataFrame. Like collect(), it retrieves all the rows of the DataFrame into the driver's memory, so it should only be used on small DataFrames or after limit(n).