In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [0]:
# Reuse the active SparkSession if one exists; otherwise start a new one
# under this notebook's application name.
spark = (
    SparkSession.builder
    .appName("Pandas vs PySpark DataFrame")
    .getOrCreate()
)


In [0]:
# Sample rows shared by the Spark and pandas DataFrames below.
# Each tuple is (match_id, team_name, score), matching `columns`.
data = [
    (1, "Mumbai Indians", 170),
    (2, "Delhi Capitals", 150),
    (3, "Royal Challengers Bangalore", 180),
    (4, "Kolkata Knight Riders", 160),
]

In [0]:
# Column names, listed in the same order as the fields of each tuple in `data`.
columns = [
    "match_id",
    "team_name",
    "score",
]

In [0]:
# Build the Spark DataFrame from the in-memory rows. Only column names are
# supplied, so Spark infers the column types from the Python values.
df_spark = spark.createDataFrame(data, schema=columns)

In [0]:
# Print up to 20 rows as an ASCII table. With truncate=True, long strings are
# cut to 20 characters, which is why the longer team names end in "...".
df_spark.show(n=20, truncate=True)

+--------+--------------------+-----+
|match_id|           team_name|score|
+--------+--------------------+-----+
|       1|      Mumbai Indians|  170|
|       2|      Delhi Capitals|  150|
|       3|Royal Challengers...|  180|
|       4|Kolkata Knight Ri...|  160|
+--------+--------------------+-----+



In [0]:
# "simple" mode prints only the physical plan.
print("Explanation (Simple):")
df_spark.explain(mode="simple")

Explanation (Simple):
== Physical Plan ==
*(1) Scan ExistingRDD[match_id#150L,team_name#151,score#152L]




In [0]:
# "extended" mode prints the parsed, analyzed and optimized logical plans
# followed by the physical plan.
print("\nExplanation (Extended):")
df_spark.explain(mode="extended")


Explanation (Extended):
== Parsed Logical Plan ==
LogicalRDD [match_id#150L, team_name#151, score#152L], false

== Analyzed Logical Plan ==
match_id: bigint, team_name: string, score: bigint
LogicalRDD [match_id#150L, team_name#151, score#152L], false

== Optimized Logical Plan ==
LogicalRDD [match_id#150L, team_name#151, score#152L], false

== Physical Plan ==
*(1) Scan ExistingRDD[match_id#150L,team_name#151,score#152L]



In [0]:
# "codegen" mode lists the WholeStageCodegen subtrees of the physical plan
# together with the code generated for each of them.
print("\nExplanation (Codegen):")
df_spark.explain(mode="codegen")


Explanation (Codegen):
Found 1 WholeStageCodegen subtrees.
== Subtree 1 / 1 (maxMethodCodeSize:252; maxConstantPoolSize:115(0.18% used); numInnerClasses:0) ==
*(1) Scan ExistingRDD[match_id#150L,team_name#151,score#152L]

Generated code:
/* 001 */ public Object generate(Object[] references) {
/* 002 */   return new GeneratedIteratorForCodegenStage1(references);
/* 003 */ }
/* 004 */
/* 005 */ // codegenStageId=1
/* 006 */ final class GeneratedIteratorForCodegenStage1 extends org.apache.spark.sql.execution.BufferedRowIterator {
/* 007 */   private Object[] references;
/* 008 */   private scala.collection.Iterator[] inputs;
/* 009 */   private scala.collection.Iterator rdd_input_0;
/* 010 */   private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] rdd_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[1];
/* 011 */
/* 012 */   public GeneratedIteratorForCodegenStage1(Object[] references) {
/* 013 */     this.references = refe

In [0]:
# "cost" mode prints the optimized logical plan annotated with statistics
# (e.g. sizeInBytes), followed by the physical plan.
print("\nExplanation (Cost):")
df_spark.explain(mode="cost")


Explanation (Cost):
== Optimized Logical Plan ==
LogicalRDD [match_id#150L, team_name#151, score#152L], false, Statistics(sizeInBytes=8.0 EiB)

== Physical Plan ==
*(1) Scan ExistingRDD[match_id#150L,team_name#151,score#152L]




In [0]:
# "formatted" mode prints a numbered outline of the physical plan, then a
# details section (output columns, arguments) for each numbered node.
print("\nExplanation (Formatted):")
df_spark.explain(mode="formatted")


Explanation (Formatted):
== Physical Plan ==
* Scan ExistingRDD (1)


(1) Scan ExistingRDD [codegen id : 1]
Output [3]: [match_id#150L, team_name#151, score#152L]
Arguments: [match_id#150L, team_name#151, score#152L], MapPartitionsRDD[12] at applySchemaToPythonRDD at NativeMethodAccessorImpl.java:0, ExistingRDD, UnknownPartitioning(0)




In [0]:
import pandas as pd

In [0]:
# Build the pandas counterpart from the same rows and column names that were
# used for the Spark DataFrame.
df_pandas = pd.DataFrame(data=data, columns=columns)


In [0]:
# Print the heading and the frame's plain-text rendering on separate lines.
print("\nPandas DataFrame:", df_pandas, sep="\n")


Pandas DataFrame:
   match_id                    team_name  score
0         1               Mumbai Indians    170
1         2               Delhi Capitals    150
2         3  Royal Challengers Bangalore    180
3         4        Kolkata Knight Riders    160
