#### Resources for selecting and renaming columns in PySpark
* select: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.select.html?highlight=select#pyspark.sql.DataFrame.select
* col: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.col.html?highlight=col#pyspark.sql.functions.col
* alias: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Column.alias.html?highlight=alias#pyspark.sql.Column.alias
* withColumnRenamed: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumnRenamed.html?highlight=withcolumnrenamed#pyspark.sql.DataFrame.withColumnRenamed

In [0]:
# Path to countries.csv, kept in a variable so it can be reused
country_csv = 'dbfs:/FileStore/read_write_data/countries.csv'

# Data types plus StructField / StructType, which are needed to build a schema
from pyspark.sql.types import IntegerType, StringType, DoubleType, StructField, StructType

# One (column name, data type, nullable) entry per column;
# the third value is the `nullable` flag passed to StructField
country_columns = [
    ("COUNTRY_ID", IntegerType(), False),
    ("NAME", StringType(), False),
    ("NATIONALITY", StringType(), False),
    ("COUNTRY_CODE", StringType(), False),
    ("ISO_ALPHA2", StringType(), False),
    ("CAPITAL", StringType(), False),
    ("POPULATION", DoubleType(), False),
    ("AREA_KM2", IntegerType(), False),
    ("REGION_ID", IntegerType(), True),
    ("SUB_REGION_ID", IntegerType(), True),
    ("INTERMEDIATE_REGION_ID", IntegerType(), True),
    ("ORGANIZATION_REGION_ID", IntegerType(), True),
]
country_schema = StructType(
    [
        StructField(column_name, data_type, nullable)
        for column_name, data_type, nullable in country_columns
    ]
)

# Read the countries CSV file (first line is a header) into a DataFrame
# using the explicit schema defined above
countries_df = spark.read.schema(country_schema).csv(country_csv, header=True)

In [0]:
# Show only the first 5 rows of the countries DataFrame
first_five_countries = countries_df.limit(5)
display(first_five_countries)

COUNTRY_ID,NAME,NATIONALITY,COUNTRY_CODE,ISO_ALPHA2,CAPITAL,POPULATION,AREA_KM2,REGION_ID,SUB_REGION_ID,INTERMEDIATE_REGION_ID,ORGANIZATION_REGION_ID
1,Afghanistan,Afghan,AFG,AF,Kabul,38041754.0,652230,30,30,,30
2,Albania,Albanian,ALB,AL,Tirana,2880917.0,28748,20,70,,20
3,Algeria,Algerian,DZA,DZ,Algiers,43053054.0,2381741,50,40,,20
4,American Samoa,American Samoan,ASM,AS,Pago Pago,55312.0,199,40,20,,30
5,Andorra,Andorran,AND,AD,Andorra la Vella,77142.0,468,20,70,,20


In [0]:
# Pick columns by passing their names as plain strings to select()
name_capital_population = countries_df.select('name', 'capital', 'population')
name_capital_population.limit(5).display()

name,capital,population
Afghanistan,Kabul,38041754.0
Albania,Tirana,2880917.0
Algeria,Algiers,43053054.0
American Samoa,Pago Pago,55312.0
Andorra,Andorra la Vella,77142.0


In [0]:
# Columns can also be referenced through the DataFrame itself, with the column
# name inside square brackets. This form allows additional methods to be
# applied to the columns themselves.
countries_df.select(
    countries_df['name'],
    countries_df['capital'],
    countries_df['population'],
).limit(5).display()

name,capital,population
Afghanistan,Kabul,38041754.0
Albania,Tirana,2880917.0
Algeria,Algiers,43053054.0
American Samoa,Pago Pago,55312.0
Andorra,Andorra la Vella,77142.0


In [0]:
# Columns can also be referenced by qualifying the column name with the
# DataFrame (attribute access), which likewise allows additional methods to be
# applied to the columns. With this form the column names have to be written in
# UPPER CASE, because that is how they were originally named in the
# StructFields of the StructType / schema defined for this DataFrame.
countries_df.select(
    countries_df.NAME,
    countries_df.CAPITAL,
    countries_df.POPULATION,
).limit(5).display()

NAME,CAPITAL,POPULATION
Afghanistan,Kabul,38041754.0
Albania,Tirana,2880917.0
Algeria,Algiers,43053054.0
American Samoa,Pago Pago,55312.0
Andorra,Andorra la Vella,77142.0


In [0]:
# Import the col function; the object col() returns accepts further methods too
from pyspark.sql.functions import col

# Build the column references with col() first, then pass them to select()
name_column = col('name')
capital_column = col('capital')
population_column = col('population')
countries_df.select(name_column, capital_column, population_column).limit(5).display()

name,capital,population
Afghanistan,Kabul,38041754.0
Albania,Tirana,2880917.0
Algeria,Algiers,43053054.0
American Samoa,Pago Pago,55312.0
Andorra,Andorra la Vella,77142.0


In [0]:
# alias() renames a column for the current output only; the original
# schema / StructField name of countries_df is not changed
country_name = countries_df['name'].alias('country_name')
capital_city = countries_df['capital'].alias('capital_city')
country_population = countries_df['population'].alias('country_population')

countries_df.select(country_name, capital_city, country_population).limit(5).display()

country_name,capital_city,country_population
Afghanistan,Kabul,38041754.0
Albania,Tirana,2880917.0
Algeria,Algiers,43053054.0
American Samoa,Pago Pago,55312.0
Andorra,Andorra la Vella,77142.0


In [0]:
# withColumnRenamed(existing, new) renames one column per call, so the three
# renames are applied one after another in the same order as listed here
country_renames = [
    ('name', 'countryName'),
    ('capital', 'capitalName'),
    ('population', 'populationCount'),
]

renamed_countries = countries_df.select('name', 'capital', 'population')
for existing_name, new_name in country_renames:
    renamed_countries = renamed_countries.withColumnRenamed(existing_name, new_name)

renamed_countries.limit(5).display()



countryName,capitalName,populationCount
Afghanistan,Kabul,38041754.0
Albania,Tirana,2880917.0
Algeria,Algiers,43053054.0
American Samoa,Pago Pago,55312.0
Andorra,Andorra la Vella,77142.0


In [0]:
# Path of the country_regions.csv file, stored in a variable
regions_path = 'dbfs:/FileStore/read_write_data/country_regions.csv'

# Two-column schema; the third argument of add() is the `nullable` flag
regions_schema = (
    StructType()
    .add("COUNTRY_ID", IntegerType(), False)
    .add("NAME", StringType(), False)
)

# Read the file (first line is a header) into a DataFrame with that schema
regions = spark.read.schema(regions_schema).csv(regions_path, header=True)

In [0]:
# Show the full regions DataFrame
display(regions)

COUNTRY_ID,NAME
10,America
20,Europe
30,Asia
40,Oceania
50,Africa


In [0]:
# Rename both columns to 'continent_*' names for this output using alias()
continent_id = regions['country_id'].alias('continent_id')
continent_name = regions['name'].alias('continent_name')

regions.select(continent_id, continent_name).display()

continent_id,continent_name
10,America
20,Europe
30,Asia
40,Oceania
50,Africa


In [0]:
# Rename both columns to 'continent_*' names using withColumnRenamed(),
# one column per call
continents = regions.select('country_id', 'name')
continents = continents.withColumnRenamed('country_id', 'continent_id')
continents = continents.withColumnRenamed('name', 'continent_name')
continents.display()

continent_id,continent_name
10,America
20,Europe
30,Asia
40,Oceania
50,Africa
