In [0]:
# Build a small RDD of (sector, services) pairs from an in-memory list.
dept = [
    ("mrketing", 40),
    ("promotion", 50),
]
rdd = spark.sparkContext.parallelize(dept)

In [0]:
# Convert the RDD to a DataFrame; with no column names given,
# the columns get the default names _1 and _2.
df = rdd.toDF()

In [0]:
# printSchema() writes the schema tree itself and returns None, so it must not
# be wrapped in print() (that only adds a stray "None" line to the output).
df.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)

None


In [0]:
# Rebuild the DataFrame, this time supplying explicit column names.
cols = ["sector", "services"]
df = rdd.toDF(cols)

In [0]:
# Show the schema after naming the columns (sector, services).
df.printSchema()

root
 |-- sector: string (nullable = true)
 |-- services: long (nullable = true)



In [0]:
# Print the DataFrame rows as a text table.
df.show()

+---------+--------+
|   sector|services|
+---------+--------+
| mrketing|      40|
|promotion|      50|
+---------+--------+



In [0]:
# Convert the Spark DataFrame into a pandas DataFrame.
df_pandas = df.toPandas()

In [0]:
# Plain-text dump of the pandas copy produced from the Spark DataFrame.
print(df_pandas)

      sector  services
0   mrketing        40
1  promotion        50


In [0]:
# df itself was not reassigned by the pandas conversion; show it again.
df.show()

+---------+--------+
|   sector|services|
+---------+--------+
| mrketing|      40|
|promotion|      50|
+---------+--------+



In [0]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,FloatType

In [0]:
# Sample rows: (id, first name, last name, percentage).
data = [
    (1, "satya", "komati", 86.8),
    (2, "adhi", "komati", 90.3),
]

In [0]:
# Explicit schema for the sample rows; every field is nullable.
schema = StructType(
    [
        StructField("id", IntegerType(), True),
        StructField("FirstName", StringType(), True),
        StructField("LastName", StringType(), True),
        StructField("percentage", FloatType(), True),
    ]
)

In [0]:
# Build the DataFrame from the sample rows using the explicit schema.
df = spark.createDataFrame(data, schema=schema)

In [0]:
# Confirm the column types match the explicit schema (integer, string, string, float).
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- percentage: float (nullable = true)



In [0]:
# Print the rows of the explicitly typed DataFrame.
df.show()

+---+---------+--------+----------+
| id|FirstName|LastName|percentage|
+---+---------+--------+----------+
|  1|    satya|  komati|      86.8|
|  2|     adhi|  komati|      90.3|
+---+---------+--------+----------+



In [0]:
from pyspark.sql.functions import col, when

# Add an "Eligibility" column: "No" when percentage is below 75, otherwise "Yes".
# The float column is compared directly; casting it to an integer first only
# truncated the value and added nothing to the comparison.
updated_df = df.withColumn(
    "Eligibility",
    when(col("percentage") < 75, "No").otherwise("Yes"),
)

In [0]:
# Render the DataFrame with the notebook's table viewer.
# NOTE(review): display() appears to be supplied by the notebook platform rather
# than core PySpark — confirm it exists wherever this notebook is run.
updated_df.display()

id,FirstName,LastName,percentage,Eligibility
1,satya,komati,86.8,Yes
2,adhi,komati,90.3,Yes


In [0]:
# Return all rows as a list of Row objects. The percentage column is a float
# type, which is why 86.8 is shown as 86.80000305175781 in the collected values.
updated_df.collect()

Out[132]: [Row(id=1, FirstName='satya', LastName='komati', percentage=86.80000305175781, Eligibility='Yes'),
 Row(id=2, FirstName='adhi', LastName='komati', percentage=90.30000305175781, Eligibility='Yes')]

In [0]:
# Print the first name of every collected row (Row fields are also attributes).
for row in updated_df.collect():
    print(row.FirstName)

satya
adhi


In [0]:
from pyspark.sql import Row

In [0]:
# Rows with a nested Row: `prop` becomes a struct column with hair/eye fields.
data = [
    Row(name="james", prop=Row(hair="black", eye="blue")),
    Row(name="Ann", prop=Row(hair="grey", eye="black")),
]

In [0]:
# No schema is passed, so it is derived from the Row objects.
df1 = spark.createDataFrame(data)

In [0]:
# Show the derived schema, including the nested `prop` struct.
df1.printSchema()

root
 |-- name: string (nullable = true)
 |-- prop: struct (nullable = true)
 |    |-- hair: string (nullable = true)
 |    |-- eye: string (nullable = true)



In [0]:
# Expand every field of the `prop` struct into its own top-level column.
df1.select("prop.*").show()

+-----+-----+
| hair|  eye|
+-----+-----+
|black| blue|
| grey|black|
+-----+-----+



In [0]:
# Return all rows as Row objects; the struct column comes back as a nested Row.
df1.collect()

Out[139]: [Row(name='james', prop=Row(hair='black', eye='blue')),
 Row(name='Ann', prop=Row(hair='grey', eye='black'))]

In [0]:
# Keep rows whose name lies between "A" and "y" (string comparison).
df1.filter(df1["name"].between("A", "y")).show()

+-----+-------------+
| name|         prop|
+-----+-------------+
|james|{black, blue}|
|  Ann|{grey, black}|
+-----+-------------+



In [0]:
# contains() is a substring test, so make the numeric-to-string conversion
# explicit instead of relying on implicit coercion of a float column and an
# int literal. NOTE: as a substring match this would also keep values such as
# 190.5; use a numeric range comparison if "in the 90s" is what is meant.
df.filter(df.percentage.cast("string").contains("90")).show()

+---+---------+--------+----------+
| id|FirstName|LastName|percentage|
+---+---------+--------+----------+
|  2|     adhi|  komati|      90.3|
+---+---------+--------+----------+



In [0]:
# Keep rows whose name begins with "A".
df1.filter(df1["name"].startswith("A")).show()

+----+-------------+
|name|         prop|
+----+-------------+
| Ann|{grey, black}|
+----+-------------+



In [0]:
# Keep rows whose name ends with "s".
df1.filter(df1["name"].endswith("s")).show()

+-----+-------------+
| name|         prop|
+-----+-------------+
|james|{black, blue}|
+-----+-------------+



In [0]:
# SQL LIKE pattern: "%" matches any run of characters, so "A%" means
# "starts with A". A single "%" is enough; it does not need doubling inside
# a plain Python string.
df1.select(df1.name).filter(df1.name.like("A%")).show()

+----+
|name|
+----+
| Ann|
+----+



In [0]:
# Show each first name with a derived label: "Eligible" when percentage is
# above 75, "Not eligible" otherwise.
(
    df.select(
        df.FirstName,
        when(df.percentage > 75, "Eligible")
        .otherwise("Not eligible")
        .alias("Eligibility"),
    )
    .show()
)

+---------+-----------+
|FirstName|Eligibility|
+---------+-----------+
|    satya|   Eligible|
|     adhi|   Eligible|
+---------+-----------+



In [0]:
# Show the first names of the rows whose id appears in the lookup list.
list1 = [5, 2]
(
    df.select(df.FirstName)
    .filter(df.id.isin(list1))
    .show()
)

+---------+
|FirstName|
+---------+
|     adhi|
+---------+



In [0]:
# Pull the `hair` field out of the `prop` struct column.
df1.select(df1["prop"].getField("hair")).show()

+---------+
|prop.hair|
+---------+
|    black|
|     grey|
+---------+



In [0]:
# Pull the `eye` field out of the `prop` struct column.
df1.select(df1["prop"].getField("eye")).show()

+--------+
|prop.eye|
+--------+
|    blue|
|   black|
+--------+



In [0]:
# getItem() on the struct column yields the same `prop.eye` column as getField().
df1.select(df1["prop"].getItem("eye")).show()

+--------+
|prop.eye|
+--------+
|    blue|
|   black|
+--------+



In [0]:
# Rename the struct column from `prop` to `properties`. df1 is rebound, so
# cells that refer to df1.prop must run before this one.
df1 = df1.withColumnRenamed(existing="prop", new="properties")

In [0]:
# Show the DataFrame after the column rename.
df1.show()

+-----+-------------+
| name|   properties|
+-----+-------------+
|james|{black, blue}|
|  Ann|{grey, black}|
+-----+-------------+



In [0]:
from pyspark.sql.types import StructType, StructField, StringType, LongType

# Schema for the person records: three nullable string columns and a
# nullable long salary. This rebinds the earlier `schema` variable.
schema = StructType(
    [
        StructField("name", StringType(), True),
        StructField("dob_year", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("salary", LongType(), True),
    ]
)

In [0]:
# Person records: (name, dob_year, gender, salary).
data = [
    ("James, A, Smith", "2018", "M", 3000),
    ("Michael, Rose, Jones", "2010", "M", 4000),
    ("Robert,K,Williams", "2010", "M", 4000),
    ("Maria,Anne,Jones", "2005", "F", 4000),
    ("Jen,Mary,Brown", "2010", "", -1),
]
# Kept for any later cell that refers to the plain column-name list.
columns = ["name", "dob_year", "gender", "salary"]

# Apply the explicit StructType defined in the previous cell rather than only
# passing column names and leaving the types to inference; the resulting
# schema is the same (string, string, string, long) but is now stated in code.
df = spark.createDataFrame(data, schema)
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- dob_year: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [0]:
# Print the person records as a text table.
df.show()

+--------------------+--------+------+------+
|                name|dob_year|gender|salary|
+--------------------+--------+------+------+
|     James, A, Smith|    2018|     M|  3000|
|Michael, Rose, Jones|    2010|     M|  4000|
|   Robert,K,Williams|    2010|     M|  4000|
|    Maria,Anne,Jones|    2005|     F|  4000|
|      Jen,Mary,Brown|    2010|      |    -1|
+--------------------+--------+------+------+



In [0]:
import os

# Read the PostgreSQL `sales` table into a DataFrame.
# Connection settings are taken from environment variables so that no
# credential is stored in the notebook. Host, port, database and user fall
# back to the values previously hardcoded here; the password has no fallback
# and a missing PGPASSWORD raises KeyError before any connection is attempted.
# NOTE(review): the 127.0.0.1 default only works if the database is reachable
# from the machine running Spark — confirm, or set PGHOST.
remote_table = (spark.read
  .format("postgresql")
  .option("dbtable", "sales") # if schema_name not provided, default to "public".
  .option("host", os.environ.get("PGHOST", "127.0.0.1"))
  .option("port", os.environ.get("PGPORT", "5432")) # Optional - will use default port 5432 if not specified.
  .option("database", os.environ.get("PGDATABASE", "Sales_Product"))
  .option("user", os.environ.get("PGUSER", "postgres"))
  .option("password", os.environ["PGPASSWORD"])
  .load()
)

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-4312650081617516>:1[0m
[0;32m----> 1[0m remote_table [38;5;241m=[39m ([43mspark[49m[38;5;241;43m.[39;49m[43mread[49m
[1;32m      2[0m [43m  [49m[38;5;241;43m.[39;49m[43mformat[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mpostgresql[39;49m[38;5;124;43m"[39;49m[43m)[49m
[1;32m      3[0m [43m  [49m[38;5;241;43m.[39;49m[43moption[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mdbtable[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m [49m[38;5;124;43m"[39;49m[38;5;124;43mpublic.sales[39;49m[38;5;124;43m"[39;49m[43m)[49m[43m [49m[38;5;66;43;03m# if schema_name not provided, default to "public".[39;49;00m
[1;32m      4[0m [43m  [49m[38;5;241;43m.[39;49m[43moption[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mhost[39;49m[38;5;124;43m"[

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-2070313248085852>:1[0m
[0;32m----> 1[0m df [38;5;241m=[39m [43mspark[49m[38;5;241;43m.[39;49m[43mread[49m[38;5;241;43m.[39;49m[43mformat[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mjdbc[39;49m[38;5;124;43m"[39;49m[43m)[49m[43m\[49m
[1;32m      2[0m [43m    [49m[38;5;241;43m.[39;49m[43moption[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43murl[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m [49m[38;5;124;43m"[39;49m[38;5;124;43mjdbc:postgresql://localhost:5432/Sales_Product[39;49m[38;5;124;43m"[39;49m[43m)[49m[43m\[49m
[1;32m      3[0m [43m    [49m[38;5;241;43m.[39;49m[43moption[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43muser[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m [49m[38;5;124;43m"[39;49m[38;5;124;43mpostgres[39;49m