#### Docs
* withColumn: https://spark.apache.org/docs/3.1.3/api/python/reference/api/pyspark.sql.DataFrame.withColumn.html?highlight=withcolumn#pyspark.sql.DataFrame.withColumn
* current_date: https://spark.apache.org/docs/3.1.3/api/python/reference/api/pyspark.sql.functions.current_date.html?highlight=current_date
* lit: https://spark.apache.org/docs/3.1.3/api/python/reference/api/pyspark.sql.functions.lit.html

In [0]:
# Read the countries CSV from DBFS into a DataFrame using an explicit schema
from pyspark.sql.types import IntegerType, StringType, DoubleType, StructField, StructType

country_df_path = 'dbfs:/FileStore/read_write_data/countries.csv'

# One (column name, Spark type, nullable) entry per column of the CSV.
country_df_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ("COUNTRY_ID", IntegerType(), False),
        ("NAME", StringType(), False),
        ("NATIONALITY", StringType(), False),
        ("COUNTRY_CODE", StringType(), False),
        ("ISO_ALPHA2", StringType(), False),
        ("CAPITAL", StringType(), False),
        ("POPULATION", DoubleType(), False),
        ("AREA_KM2", IntegerType(), False),
        ("REGION_ID", IntegerType(), True),
        ("SUB_REGION_ID", IntegerType(), True),
        ("INTERMEDIATE_REGION_ID", IntegerType(), True),
        ("ORGANIZATION_REGION_ID", IntegerType(), True),
    )
])

# The first line of the file is treated as the header; column types come from
# the schema declared above.
country_df = (
    spark.read
    .schema(country_df_schema)
    .option("header", True)
    .csv(country_df_path)
)

# Preview the first five rows
country_df.limit(5).display()

COUNTRY_ID,NAME,NATIONALITY,COUNTRY_CODE,ISO_ALPHA2,CAPITAL,POPULATION,AREA_KM2,REGION_ID,SUB_REGION_ID,INTERMEDIATE_REGION_ID,ORGANIZATION_REGION_ID
1,Afghanistan,Afghan,AFG,AF,Kabul,38041754.0,652230,30,30,,30
2,Albania,Albanian,ALB,AL,Tirana,2880917.0,28748,20,70,,20
3,Algeria,Algerian,DZA,DZ,Algiers,43053054.0,2381741,50,40,,20
4,American Samoa,American Samoan,ASM,AS,Pago Pago,55312.0,199,40,20,,30
5,Andorra,Andorran,AND,AD,Andorra la Vella,77142.0,468,20,70,,20


In [0]:
# Append a CURRENT_DATE column whose value comes from Spark's current_date()
# function, then preview the first five rows.
from pyspark.sql.functions import current_date

(
    country_df
    .withColumn(colName='CURRENT_DATE', col=current_date())
    .limit(5)
    .display()
)

COUNTRY_ID,NAME,NATIONALITY,COUNTRY_CODE,ISO_ALPHA2,CAPITAL,POPULATION,AREA_KM2,REGION_ID,SUB_REGION_ID,INTERMEDIATE_REGION_ID,ORGANIZATION_REGION_ID,CURRENT_DATE
1,Afghanistan,Afghan,AFG,AF,Kabul,38041754.0,652230,30,30,,30,2023-05-03
2,Albania,Albanian,ALB,AL,Tirana,2880917.0,28748,20,70,,20,2023-05-03
3,Algeria,Algerian,DZA,DZ,Algiers,43053054.0,2381741,50,40,,20,2023-05-03
4,American Samoa,American Samoan,ASM,AS,Pago Pago,55312.0,199,40,20,,30,2023-05-03
5,Andorra,Andorran,AND,AD,Andorra la Vella,77142.0,468,20,70,,20,2023-05-03


In [0]:
# Append an UPDATED_BY column holding the same constant string on every row.
# lit() wraps the Python value in a Column, which withColumn requires.
from pyspark.sql.functions import lit

(
    country_df
    .withColumn(colName='UPDATED_BY', col=lit('Hris'))
    .limit(5)
    .display()
)

COUNTRY_ID,NAME,NATIONALITY,COUNTRY_CODE,ISO_ALPHA2,CAPITAL,POPULATION,AREA_KM2,REGION_ID,SUB_REGION_ID,INTERMEDIATE_REGION_ID,ORGANIZATION_REGION_ID,UPDATED_BY
1,Afghanistan,Afghan,AFG,AF,Kabul,38041754.0,652230,30,30,,30,Hris
2,Albania,Albanian,ALB,AL,Tirana,2880917.0,28748,20,70,,20,Hris
3,Algeria,Algerian,DZA,DZ,Algiers,43053054.0,2381741,50,40,,20,Hris
4,American Samoa,American Samoan,ASM,AS,Pago Pago,55312.0,199,40,20,,30,Hris
5,Andorra,Andorran,AND,AD,Andorra la Vella,77142.0,468,20,70,,20,Hris


In [0]:
# Add ROUND_POPULATION_M: POPULATION expressed in millions, rounded to one
# decimal place, derived arithmetically from the existing POPULATION column.
#
# The functions module is imported under the alias F instead of importing
# `round` by name: a bare `from pyspark.sql.functions import round` replaces
# Python's built-in round() for the rest of the notebook session.
from pyspark.sql import functions as F

# Column name is written exactly as declared in the schema ('POPULATION') so
# the lookup does not depend on Spark's case-insensitive column resolution.
population_in_millions = country_df['POPULATION'] / 1_000_000

(
    country_df
    .withColumn('ROUND_POPULATION_M', F.round(population_in_millions, 1))
    .limit(5)
    .display()
)

COUNTRY_ID,NAME,NATIONALITY,COUNTRY_CODE,ISO_ALPHA2,CAPITAL,POPULATION,AREA_KM2,REGION_ID,SUB_REGION_ID,INTERMEDIATE_REGION_ID,ORGANIZATION_REGION_ID,ROUND_POPULATION_M
1,Afghanistan,Afghan,AFG,AF,Kabul,38041754.0,652230,30,30,,30,38.0
2,Albania,Albanian,ALB,AL,Tirana,2880917.0,28748,20,70,,20,2.9
3,Algeria,Algerian,DZA,DZ,Algiers,43053054.0,2381741,50,40,,20,43.1
4,American Samoa,American Samoan,ASM,AS,Pago Pago,55312.0,199,40,20,,30,0.1
5,Andorra,Andorran,AND,AD,Andorra la Vella,77142.0,468,20,70,,20,0.1
