#### Changing Data Types
* Data types in Spark: https://spark.apache.org/docs/latest/sql-ref-datatypes.html
* cast: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Column.cast.html?highlight=cast#pyspark.sql.Column.cast

In [0]:
# Load countries.csv with an explicit schema instead of letting Spark pick the types
from pyspark.sql.types import IntegerType, StringType, DoubleType, StructField, StructType

countries_path = 'dbfs:/FileStore/read_write_data/countries.csv'

# Each add() call is (column name, data type, nullable), listed in file column order
countries_schema = (
    StructType()
    .add("COUNTRY_ID", IntegerType(), False)
    .add("NAME", StringType(), False)
    .add("NATIONALITY", StringType(), False)
    .add("COUNTRY_CODE", StringType(), False)
    .add("ISO_ALPHA2", StringType(), False)
    .add("CAPITAL", StringType(), False)
    .add("POPULATION", DoubleType(), False)
    .add("AREA_KM2", IntegerType(), False)
    .add("REGION_ID", IntegerType(), True)
    .add("SUB_REGION_ID", IntegerType(), True)
    .add("INTERMEDIATE_REGION_ID", IntegerType(), True)
    .add("ORGANIZATION_REGION_ID", IntegerType(), True)
)

# The first line of the file is a header row, so it is not read as data
countries = (
    spark.read
    .schema(countries_schema)
    .option("header", True)
    .csv(countries_path)
)

In [0]:
# Use .dtypes to list each column's name together with its data type
countries.dtypes

In [0]:
# Load the same file into a second DataFrame, this time without supplying a schema
countries_dt = spark.read.option("header", True).csv(countries_path)

In [0]:
# With no schema supplied, every column is read as a string
countries_dt.dtypes

In [0]:
# Cast the string-typed population column to an integer and confirm the resulting type.
# IntegerType() was already imported in the first cell when the schema was created.
population_as_int = countries_dt['population'].cast(IntegerType())
countries_dt.select(population_as_int).dtypes

In [0]:
# Cast the population column of the schema-typed DataFrame to a string and confirm the resulting type.
# StringType() was already imported in the first cell when the schema was created.
population_as_string = countries['population'].cast(StringType())
countries.select(population_as_string).dtypes

#### Math Functions
* Math Functions: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#math-functions

In [0]:
# Simple arithmetic to return the population in millions.
# alias() names the derived column directly. Renaming it afterwards with
# withColumnRenamed() on Spark's auto-generated expression name is fragile:
# that call silently does nothing when the generated name does not match.
countries_2 = countries.select((countries['population'] / 1000000).alias('population_m'))
countries_2.limit(5).display()

# Using the round function to round to 2 decimal places.
# NOTE: this import shadows Python's built-in round() for the rest of the notebook;
# it is kept under this name because later cells call round() on Spark columns.
from pyspark.sql.functions import round
countries_2.select(round(countries_2['population_m'], 2).alias('population_m')).limit(5).display()

population_m
38.041754
2.880917
43.053054
0.055312
0.077142


population_m
38.04
2.88
43.05
0.06
0.08


In [0]:
# Add a population_m_r1 column to the countries DataFrame: population in millions, rounded to 1 decimal place.
# round here is the pyspark.sql.functions version imported in an earlier cell, not the Python built-in.
population_m_r1 = round(countries['population'] / 1000000, 1)
countries.withColumn('population_m_r1', population_m_r1).limit(5).display()

COUNTRY_ID,NAME,NATIONALITY,COUNTRY_CODE,ISO_ALPHA2,CAPITAL,POPULATION,AREA_KM2,REGION_ID,SUB_REGION_ID,INTERMEDIATE_REGION_ID,ORGANIZATION_REGION_ID,population_m_r1
1,Afghanistan,Afghan,AFG,AF,Kabul,38041754.0,652230,30,30,,30,38.0
2,Albania,Albanian,ALB,AL,Tirana,2880917.0,28748,20,70,,20,2.9
3,Algeria,Algerian,DZA,DZ,Algiers,43053054.0,2381741,50,40,,20,43.1
4,American Samoa,American Samoan,ASM,AS,Pago Pago,55312.0,199,40,20,,30,0.1
5,Andorra,Andorran,AND,AD,Andorra la Vella,77142.0,468,20,70,,20,0.1
