In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("local").getOrCreate()

In [2]:
data = [("James","Smith","USA","CA"),("Michael","Rose","USA","NY"), \
    ("Robert","Williams","USA","CA"),("Maria","Jones","USA","FL") \
  ]

columns = ["firstname","lastname","country","state"]

In [3]:
df = spark.createDataFrame(data=data, schema=columns)
df.show()

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



In [4]:
df.collect()

[Row(firstname='James', lastname='Smith', country='USA', state='CA'),
 Row(firstname='Michael', lastname='Rose', country='USA', state='NY'),
 Row(firstname='Robert', lastname='Williams', country='USA', state='CA'),
 Row(firstname='Maria', lastname='Jones', country='USA', state='FL')]

In [5]:
spark.createDataFrame(df.collect()).show()

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



In [7]:
state1 = df.rdd.map(lambda x: x[3]).collect()
state1

['CA', 'NY', 'CA', 'FL']

In [11]:
df.rdd.map(lambda x : x.state).collect()

['CA', 'NY', 'CA', 'FL']

In [15]:
df.select('state').rdd.map(lambda x: x.state).collect()

['CA', 'NY', 'CA', 'FL']

In [17]:
df.select('state').toPandas()['state'].tolist()

['CA', 'NY', 'CA', 'FL']

In [43]:
from pyspark.sql.types import StringType, StructType, StructField, DateType
from pyspark.sql.functions import to_date
from pyspark.sql import Row

# Sample data with a string column representing dates
data = [("1/12/2023",), ("2/15/2023",), ("3/20/2023",)]
columns = ["date_str"]

df = spark.createDataFrame(data, columns)

# Convert the string column to a DateType
df = df.withColumn("date", to_date("date_str", "M/d/yyyy").cast(DateType()))

df.show()

+---------+----------+
| date_str|      date|
+---------+----------+
|1/12/2023|2023-01-12|
|2/15/2023|2023-02-15|
|3/20/2023|2023-03-20|
+---------+----------+



In [23]:
import pandas as pd

data = [['Scott', 50], ['Jeff', 45], ['Thomas', 54],['Ann',34]] 
  
# Create the pandas DataFrame 
pandasDF = pd.DataFrame(data, columns = ['Name', 'Age']) 

pandasDF.head()

Unnamed: 0,Name,Age
0,Scott,50
1,Jeff,45
2,Thomas,54
3,Ann,34


In [25]:
df = spark.createDataFrame(pandasDF)
df.show()

+------+---+
|  Name|Age|
+------+---+
| Scott| 50|
|  Jeff| 45|
|Thomas| 54|
|   Ann| 34|
+------+---+



In [27]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)



In [35]:
from pyspark.sql.types import StringType, StructField, IntegerType, DateType

schema = StructType([
    StructField('Name', StringType(), True),
    StructField('Age', IntegerType(), True)
])

df = spark.createDataFrame(pandasDF, schema=schema)
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)



In [30]:
pandasDF2=df.select("*").toPandas
print(pandasDF2)

<bound method PandasConversionMixin.toPandas of DataFrame[Name: string, Age: int]>


In [31]:
test=spark.conf.get("spark.sql.execution.arrow.enabled")
print(test)

test123=spark.conf.get("spark.sql.execution.arrow.pyspark.fallback.enabled")
print(test123)

false
true


In [34]:
spark.conf.set("spark.sql.execution.arrow.enabled","false")
spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled","true")

pandasDF2=df.select("*").toPandas
print(pandasDF2)

<bound method PandasConversionMixin.toPandas of DataFrame[Name: string, Age: int]>


In [58]:
from pyspark.sql.functions import col,expr
data=[("2019-01-23",1),("2019-06-24",2),("2019-09-20",3)]

In [63]:
spark.createDataFrame(data).toDF("date","increment").select(col('date'),col('increment'),expr("add_months(to_date(date,'yyyy-MM-dd'),cast(increment as int))").alias("inc_date")).show()

+----------+---------+----------+
|      date|increment|  inc_date|
+----------+---------+----------+
|2019-01-23|        1|2019-02-23|
|2019-06-24|        2|2019-08-24|
|2019-09-20|        3|2019-12-20|
+----------+---------+----------+



In [65]:
from pyspark.sql.functions import add_months, to_date, 

In [73]:
spark.createDataFrame(data,schema=["date","increment"]).select(['date','increment',add_months(to_date('date'),'increment').alias("inc_date")]).show()

+----------+---------+----------+
|      date|increment|  inc_date|
+----------+---------+----------+
|2019-01-23|        1|2019-02-23|
|2019-06-24|        2|2019-08-24|
|2019-09-20|        3|2019-12-20|
+----------+---------+----------+



In [74]:
from pyspark.sql.functions import add_months, to_date
data=[("2019-01-23",1),("2019-06-24",2),("2019-09-20",3)]
spark.createDataFrame(data,schema=["date","increment"]).select(['date','increment',add_months(to_date('date'),'increment').alias("inc_date")]).show()

+----------+---------+----------+
|      date|increment|  inc_date|
+----------+---------+----------+
|2019-01-23|        1|2019-02-23|
|2019-06-24|        2|2019-08-24|
|2019-09-20|        3|2019-12-20|
+----------+---------+----------+

