# Loading from Excel after creating a volume folder

Remember that to create a volume folder to store files and get their file path, we should follow the following instructions:
1. Go to Catalog>Add data (+) or in the sidebar go to Data Engineering>Data Ingestion> Files> Upload files to a volume> Create volume inside your catalog:
![](path)

In [0]:
%pip install openpyxl
import pandas as pd
import openpyxl
df = pd.read_excel("/Volumes/workspace/test_schema/test_volume_folder/ufo_sighting_data.xlsx", engine='openpyxl')
sdf = spark.createDataFrame(df)
display(sdf)

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


Date_time,city,state/province,country,UFO_shape,length_of_encounter_seconds,described_duration_of_encounter,description,date_documented,latitude,longitude
10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 1949-50. It occurred after a Boy Scout meeting in the Baptist Church. The Baptist Church sit,4/27/2004,29.8830556,-97.9411111
10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,"1949 Lackland AFB, TX. Lights racing across the sky & making 90 degree turns on a dime.",12/16/2005,29.38421,-98.581082
10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,"Green/Orange circular disc over Chester, England",1/21/2008,53.2,-2.916667
10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,"My older brother and twin sister were leaving the only Edna theater at about 9 PM,...we had our bikes and I took a different route home",1/17/2004,28.9783333,-96.6458333
10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,"AS a Marine 1st Lt. flying an FJ4B fighter/attack aircraft on a solo night exercise, I was at 50ꯠ' in a ""clean"" aircraft (no ordinan",1/22/2004,21.4180556,-157.8036111
10/10/1961 19:00,bristol,tn,us,sphere,300,5 minutes,My father is now 89 my brother 52 the girl with us now 51 myself 49 and the other fellow which worked with my father if he's still livi,4/27/2007,36.595,-82.1888889
10/10/1965 21:00,penarth (uk/wales),,gb,circle,180,about 3 mins,penarth uk circle 3mins stayed 30ft above me for 3 mins slowly moved of and then with the blink of the eye the speed was unreal,2/14/2006,51.434722,-3.18
10/10/1965 23:45,norwalk,ct,us,disk,1200,20 minutes,A bright orange color changing to reddish color disk/saucer was observed hovering above power transmission lines.,10/2/1999,41.1175,-73.4083333
10/10/1966 20:00,pell city,al,us,disk,180,3 minutes,"Strobe Lighted disk shape object observed close, at low speeds, and low altitude in Oct 1966 in Pell City Alabama",3/19/2009,33.5861111,-86.2861111
10/10/1966 21:00,live oak,fl,us,disk,120,several minutes,Saucer zaps energy from powerline as my pregnant mother receives mental signals not to pass info,5/11/2005,30.2947222,-82.9841667


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, DoubleType
import pandas as pd

schema = StructType([
    StructField("Date_time", StringType(), True),
    StructField("city", StringType(), True),
    StructField("state/province", StringType(), True),
    StructField("country", StringType(), True),
    StructField("UFO_shape", StringType(), True),
    StructField("length_of_encounter_seconds", FloatType(), True),
    StructField("described_duration_of_encounter", StringType(), True),
    StructField("description", StringType(), True),
    StructField("date_documented", StringType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True)
])

# Read CSV with proper handling of mixed types
df_csv = pd.read_csv("/Volumes/workspace/test_schema/test_volume_folder/ufo_sighting_data.csv", 
                     low_memory=False, 
                     dtype=str)  # Read everything as string first

# Convert problematic columns with error handling
df_csv['length_of_encounter_seconds'] = pd.to_numeric(df_csv['length_of_encounter_seconds'], errors='coerce')
df_csv['latitude'] = pd.to_numeric(df_csv['latitude'], errors='coerce')
df_csv['longitude'] = pd.to_numeric(df_csv['longitude'], errors='coerce')

# Handle NaN values that cause Arrow conversion issues
df_csv = df_csv.fillna({
    'length_of_encounter_seconds': 0.0,
    'latitude': 0.0,
    'longitude': 0.0
})

# Create Spark DataFrame
sdf_csv = spark.createDataFrame(df_csv, schema=schema)
display(sdf_csv)

Date_time,city,state/province,country,UFO_shape,length_of_encounter_seconds,described_duration_of_encounter,description,date_documented,latitude,longitude
10/10/1949 20:30,san marcos,tx,us,cylinder,2700.0,45 minutes,This event took place in early fall around 1949-50. It occurred after a Boy Scout meeting in the Baptist Church. The Baptist Church sit,4/27/2004,29.8830556,-97.9411111
10/10/1949 21:00,lackland afb,tx,,light,7200.0,1-2 hrs,"1949 Lackland AFB, TX. Lights racing across the sky & making 90 degree turns on a dime.",12/16/2005,29.38421,-98.581082
10/10/1955 17:00,chester (uk/england),,gb,circle,20.0,20 seconds,"Green/Orange circular disc over Chester, England",1/21/2008,53.2,-2.916667
10/10/1956 21:00,edna,tx,us,circle,20.0,1/2 hour,"My older brother and twin sister were leaving the only Edna theater at about 9 PM,...we had our bikes and I took a different route home",1/17/2004,28.9783333,-96.6458333
10/10/1960 20:00,kaneohe,hi,us,light,900.0,15 minutes,"AS a Marine 1st Lt. flying an FJ4B fighter/attack aircraft on a solo night exercise, I was at 50ꯠ' in a ""clean"" aircraft (no ordinan",1/22/2004,21.4180556,-157.8036111
10/10/1961 19:00,bristol,tn,us,sphere,300.0,5 minutes,My father is now 89 my brother 52 the girl with us now 51 myself 49 and the other fellow which worked with my father if he's still livi,4/27/2007,36.595,-82.1888889
10/10/1965 21:00,penarth (uk/wales),,gb,circle,180.0,about 3 mins,penarth uk circle 3mins stayed 30ft above me for 3 mins slowly moved of and then with the blink of the eye the speed was unreal,2/14/2006,51.434722,-3.18
10/10/1965 23:45,norwalk,ct,us,disk,1200.0,20 minutes,A bright orange color changing to reddish color disk/saucer was observed hovering above power transmission lines.,10/2/1999,41.1175,-73.4083333
10/10/1966 20:00,pell city,al,us,disk,180.0,3 minutes,"Strobe Lighted disk shape object observed close, at low speeds, and low altitude in Oct 1966 in Pell City Alabama",3/19/2009,33.5861111,-86.2861111
10/10/1966 21:00,live oak,fl,us,disk,120.0,several minutes,Saucer zaps energy from powerline as my pregnant mother receives mental signals not to pass info,5/11/2005,30.2947222,-82.9841667


In [0]:
sdf_us = sdf.filter(sdf['country'] == 'us')
display(sdf_us)

Date_time,city,state/province,country,UFO_shape,length_of_encounter_seconds,described_duration_of_encounter,description,date_documented,latitude,longitude
10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 1949-50. It occurred after a Boy Scout meeting in the Baptist Church. The Baptist Church sit,4/27/2004,29.8830556,-97.9411111
10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,"My older brother and twin sister were leaving the only Edna theater at about 9 PM,...we had our bikes and I took a different route home",1/17/2004,28.9783333,-96.6458333
10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,"AS a Marine 1st Lt. flying an FJ4B fighter/attack aircraft on a solo night exercise, I was at 50ꯠ' in a ""clean"" aircraft (no ordinan",1/22/2004,21.4180556,-157.8036111
10/10/1961 19:00,bristol,tn,us,sphere,300,5 minutes,My father is now 89 my brother 52 the girl with us now 51 myself 49 and the other fellow which worked with my father if he's still livi,4/27/2007,36.595,-82.1888889
10/10/1965 23:45,norwalk,ct,us,disk,1200,20 minutes,A bright orange color changing to reddish color disk/saucer was observed hovering above power transmission lines.,10/2/1999,41.1175,-73.4083333
10/10/1966 20:00,pell city,al,us,disk,180,3 minutes,"Strobe Lighted disk shape object observed close, at low speeds, and low altitude in Oct 1966 in Pell City Alabama",3/19/2009,33.5861111,-86.2861111
10/10/1966 21:00,live oak,fl,us,disk,120,several minutes,Saucer zaps energy from powerline as my pregnant mother receives mental signals not to pass info,5/11/2005,30.2947222,-82.9841667
10/10/1968 13:00,hawthorne,ca,us,circle,300,5 min.,"ROUND , ORANGE , WITH WHAT I WOULD SAY WAS POLISHED METAL OF SOME KIND AROUND THE EDGES .",10/31/2003,33.9163889,-118.3516667
10/10/1968 19:00,brevard,nc,us,fireball,180,3 minutes,silent red /orange mass of energy floated by three of us in western North Carolina in the 60s,6/12/2008,35.2333333,-82.7344444
10/10/1970 16:00,bellmore,ny,us,disk,1800,30 min.,silver disc seen by family and neighbors,5/11/2000,40.6686111,-73.5275


# Save the US dataframe into a delta table

In [0]:
sdf_us.write.format("delta").mode("overwrite").saveAsTable("test_schema.ufo_sighting_data")

# Filter original dataset for Great Britain dataset

In [0]:
# Filter from sdf only when country = 'gb'
sdf_gb = sdf.filter(sdf['country'] == 'gb')
display(sdf_gb)

Date_time,city,state/province,country,UFO_shape,length_of_encounter_seconds,described_duration_of_encounter,description,date_documented,latitude,longitude
10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,"Green/Orange circular disc over Chester, England",1/21/2008,53.2,-2.916667
10/10/1965 21:00,penarth (uk/wales),,gb,circle,180,about 3 mins,penarth uk circle 3mins stayed 30ft above me for 3 mins slowly moved of and then with the blink of the eye the speed was unreal,2/14/2006,51.434722,-3.18
10/10/1974 21:30,cardiff (uk/wales),,gb,disk,1200,20 minutes,"back in 1974 I was 19 at the time and lived in a suburb of Cardiff Wales UK called Ely, and in the distance there was a wood called Ca",2/1/2007,51.5,-3.2


In [0]:
# append sdf_gb to the delta table
sdf_gb.write.format("delta").mode("append").saveAsTable("test_schema.ufo_sighting_data")

# Save other countries

In [0]:
# from sdf add the all other different than us and gb into the delta table
sdf_others = sdf_csv.filter((sdf_csv['country'] != 'us') & (sdf_csv['country'] != 'gb'))
display(sdf_others)

Date_time,city,state/province,country,UFO_shape,length_of_encounter_seconds,described_duration_of_encounter,description,date_documented,latitude,longitude
10/10/1994 23:00,toronto (greater toronto area) (canada),on,ca,sphere,3600.0,~1 hour,Large rusty sphere,7/3/2013,43.666667,-79.416667
10/10/1998 22:30,st. john's (canada),nf,ca,egg,7200.0,2 hours,Started off as 3 points of intense yellow light in triangle formation - then grew larger - it becage a single egg shape - VERY bright.,12/2/2000,47.55,-52.666667
10/10/2000 07:30,victoria (canada),bc,ca,cylinder,30.0,30seconds,Smooth Shiny Cylinder,12/2/2000,46.216667,-63.483333
10/10/2001 04:33,"sydney (nsw, australia)",,au,formation,180.0,3 minutes,formation and impact,11/20/2001,-33.861481,151.205475
10/10/2001 20:10,vancouver (canada),bc,ca,other,300.0,+5 minutes,I observed an green object significantly above a house with the address ((deleted)) Dunbar Street.,5/12/2011,49.25,-123.133333
10/10/2002 04:00,adelaide (pt. wakefield) (south australia),,au,circle,600.0,10 mins,one light became 3,10/28/2002,-34.928661,138.598633
10/10/2002 19:45,victoria (canada),bc,ca,unknown,120.0,2 minutes approx,bright white light with black outline around it moving soundlessly and slowly from north banking east and disappearing upward.,10/15/2002,46.216667,-63.483333
10/10/2004 03:50,portage la prairie (canada),mb,ca,changing,1200.0,20minutes,"Series of Green Blue Red White lights spherical or triangular formation SE of Portage la Prairie, Manitoba Canada",10/27/2004,49.966667,-98.3
10/10/2004 09:45,nobel (canada),on,ca,unknown,300.0,5:00,Floating Red Object,10/27/2004,45.416667,-80.1
10/10/2008 02:00,london (canada),on,ca,other,120.0,2 min. approx,"C shape with a T front over London ont on Oct 10, 2008 on a clear day around 2PM",6/9/2009,42.983333,-81.25


In [0]:
# Get the schema of the existing table
existing_table_schema = spark.table("test_schema.ufo_sighting_data").schema

# Cast the DataFrame to match the schema of the existing table
sdf_others_casted = sdf_others.select(
    [sdf_others[col.name].cast(col.dataType) for col in existing_table_schema]
)

# Write the DataFrame to the Delta table
sdf_others_casted.write.format("delta").mode("append").saveAsTable("test_schema.ufo_sighting_data")