# Data cleaning

### Importing packages

In [11]:
import os
import shutil as sh
from pathlib import Path

### Preliminary operations

In [12]:
# Names of the working folders and of the data files used throughout the notebook.
DATA_FOLDER = "clean_data"
OUTPUT_FOLDER = "output"
STATIONS_FILE = "wa_alt_fuel_stations.csv"
CARS_FILE = "Electric_Vehicle_Population_Data.csv"
WAGE_FILE = "20zpallnoagi.csv"

# Absolute path of the parent of the current working directory.
path = str(Path(os.path.abspath(os.getcwd())).parent.absolute())


def reset_folder(folder):
    """Delete `folder` if it exists, then recreate it empty.

    Every folder is handled on its own: a missing folder must not prevent
    another, still existing, folder from being cleared (otherwise the
    following `os.mkdir` fails with FileExistsError).
    """
    try:
        # Remove the existing folder
        sh.rmtree(folder)
    except FileNotFoundError:
        print("--- No folder to remove ---")

    os.mkdir(folder)
    print(f"Folder '{folder}' created.")


# Start from empty folders for the clean data and for the serialized graphs.
for folder in (DATA_FOLDER, OUTPUT_FOLDER):
    reset_folder(folder)

# Later cells build file paths as FOLDER + filename, so append the separator
# of the running platform (a literal backslash only works on Windows).
DATA_FOLDER += os.sep
OUTPUT_FOLDER += os.sep

Folder 'clean_data' created.
Folder 'output' created.


#### Correcting and filtering fuel stations file

In [13]:
# Repair records that are broken across several physical lines and keep only
# the Washington ones. Both files are managed by `with`, so they are closed
# even if an error occurs while processing.
with open("alt_fuel_stations(Nov-10-2023).csv", "r", encoding="utf-8") as file, \
     open(DATA_FOLDER + STATIONS_FILE, "w", encoding="utf-8") as wa_e_stations:

    # Write CSV headers
    wa_e_stations.write(file.readline())

    row = file.readline()               # Record currently being assembled
    for row2 in file:
        # A line without "ELEC" is treated as the continuation of an
        # interrupted record (there can be several in a row).
        # NOTE(review): presumably every complete record carries the fuel
        # type code "ELEC" -- confirm against the raw export.
        if "ELEC" not in row2:
            index = row2.find('",')                                     # End of the interrupted string, if present
            row = row.strip() + row2[index if index != -1 else 0 : ]    # Join the record start with its second part
            continue

        # `row2` starts a new record, so `row` is complete: keep it if it is in WA.
        if ",WA," in row:
            wa_e_stations.write(row)
        row = row2

    # The loop only writes a record when the next one starts, so the final
    # record still has to be written once the input is exhausted.
    if ",WA," in row:
        wa_e_stations.write(row)

# RDF creation

### Importing packages

In [14]:
import pandas as pd
from urllib.parse import quote
from rdflib import Graph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import XSD

## ZIP codes, cities, counties

In [15]:
# Namespace from which every node and predicate of the ontology is minted.
ECO = Namespace("http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#")

# Table with the Zipcode / City / County columns used to build the location graph.
places = pd.read_csv("wa_zips_cities_counties.csv")

# Empty graph for the locations; "elec" is the prefix used in the serialization.
graph = Graph()
graph.bind("elec", ECO)

In [16]:
%%time

# Walk the three location columns in lockstep: one (ZIP, city, county) per record.
location_columns = zip(places['Zipcode'], places['City'], places['County'])

for zip_value, city_name, county_name in location_columns:
    # City and county names are percent-encoded so they can be part of an IRI.
    zip_node = ECO[str(zip_value)]
    city_node = ECO[quote(str(city_name))]
    county_node = ECO[quote(str(county_name))]

    # Class membership of the three nodes.
    for node, node_class in ((zip_node, ECO.ZIP), (city_node, ECO.City), (county_node, ECO.County)):
        graph.add((node, RDF.type, node_class))

    # Containment chain: ZIP -> city -> county.
    graph.add((zip_node, ECO.ofCity, city_node))
    graph.add((city_node, ECO.belongsTo, county_node))

CPU times: total: 109 ms
Wall time: 101 ms


In [17]:
%%time
# Save the location graph to disk in the Turtle format.
# The encoding is set explicitly: Turtle documents are defined as UTF-8, while
# open() would otherwise use the platform default encoding.
print("--- saving serialization ---")
with open(OUTPUT_FOLDER + 'locations.ttl', 'w', encoding='utf-8') as file:
    file.write(graph.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 31.2 ms
Wall time: 34.4 ms


## Washington electric charging stations

In [None]:
# Load the cleaned Washington stations file.
# ZIP is read as text: it is an identifier, and the cell that builds the graph
# performs string tests on it (which would fail on an inferred numeric dtype).
stations = pd.read_csv(DATA_FOLDER + STATIONS_FILE, sep=",", dtype={"ZIP": str})

# Fresh graph, so the stations are kept separate from the previous data set.
graph = Graph()
graph.bind("elec", ECO)

In [None]:
%%time

# Add every station to the graph and link it to its ZIP code node.
for index, row in stations.iterrows():
    # Create node (prefix + id); the id is the row index of the DataFrame.
    # NOTE(review): ZIP nodes and car nodes are also minted as bare numbers in
    # the same namespace -- confirm the identifiers cannot collide.
    Station = URIRef(ECO[str(index)])

    # Triples
    graph.add((Station, RDF.type, ECO.Station))
    graph.add((Station, ECO['hasName'], Literal(row['Station Name'], datatype=XSD.string)))

    # A missing ZIP cannot be linked; the station itself is kept.
    raw_zip = row['ZIP']
    if pd.isna(raw_zip):
        print(f"Missing ZIP for station {index}, skipped")
        continue

    # Normalize to text (as the other cells do) so the check below also works
    # when pandas inferred a non-string type for the column.
    zip_text = str(raw_zip)
    if " " in zip_text:
        print(f"Error in ZIP '{zip_text}', skipped")
        continue

    ZipCode = URIRef(ECO[zip_text])
    graph.add((Station, ECO['locatedIn'], ZipCode))


In [None]:
%%time
# Save the stations graph to disk in the Turtle format.
# The encoding is set explicitly: Turtle documents are defined as UTF-8, while
# open() would otherwise use the platform default encoding (station names may
# contain non-ASCII characters).
print("--- saving serialization ---")
with open(OUTPUT_FOLDER + 'stations.ttl', 'w', encoding='utf-8') as file:
    file.write(graph.serialize(format='turtle'))

## Cars

In [None]:
# Read the vehicle registrations and print a summary of the columns.
cars = pd.read_csv(CARS_FILE)
cars.info()

# Start again from an empty graph so that each data set is built
# (and later serialized) in its own working space.
graph = Graph()
graph.bind("elec", ECO)

In [None]:
%%time


# Model and make names are percent-encoded with urllib.parse.quote before
# being used as part of an IRI.

for row_number, vehicle in cars.iterrows():
    # Create nodes (prefix + id)
    car_node = URIRef(ECO[str(vehicle['DOL Vehicle ID'])])
    model_slug = quote(vehicle['Model'])
    # Encoded model name followed by the year, e.g. 'Panda' of 2012 -> 'Panda2012'
    model_year_node = URIRef(ECO[model_slug + str(vehicle['Model Year'])])

    # Triples specific to this car
    graph.add((car_node, RDF.type, ECO.Car))
    graph.add((car_node, ECO['hasRange'], Literal(vehicle['Electric Range'], datatype=XSD.integer)))
    graph.add((car_node, ECO['hasModelYear'], model_year_node))

    # A model-year that already has a type was described by an earlier row.
    if graph.value(model_year_node, RDF.type, None):
        continue

    model_node = URIRef(ECO[model_slug])
    graph.add((model_year_node, RDF.type, ECO.ModelYear))
    graph.add((model_year_node, ECO['year'], Literal(vehicle['Model Year'], datatype=XSD.gYear)))
    graph.add((model_year_node, ECO['hasMSRP'], Literal(vehicle['Base MSRP'], datatype=XSD.integer)))
    graph.add((model_year_node, ECO['ofModel'], model_node))

    # Same check for the model itself.
    if graph.value(model_node, RDF.type, None):
        continue

    maker_node = URIRef(ECO[quote(vehicle['Make'])])
    graph.add((model_node, RDF.type, ECO.Model))
    graph.add((model_node, ECO['madeBy'], maker_node))

    # Finally the maker, typed only the first time it is met.
    # (Alternative to check + add: Graph.set(), which replaces the object of
    # an existing subject/predicate pair.)
    if not graph.value(maker_node, RDF.type, None):
        graph.add((maker_node, RDF.type, ECO.Maker))
        
    

In [None]:
%%time
# Save the cars graph to disk in the Turtle format.
# The encoding is set explicitly: Turtle documents are defined as UTF-8, while
# open() would otherwise use the platform default encoding.
print("--- saving serialization ---")
with open(OUTPUT_FOLDER + 'cars.ttl', 'w', encoding='utf-8') as file:
    file.write(graph.serialize(format='turtle'))


## Wage per ZIP code

In [30]:
# Read the per-ZIP income table and print a summary of its columns.
salaries = pd.read_csv(WAGE_FILE)
salaries.info()

# Start again from an empty graph so that each data set is built
# (and later serialized) in its own working space.
graph = Graph()
graph.bind("elec", ECO)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27744 entries, 0 to 27743
Columns: 165 entries, STATEFIPS to A12000
dtypes: float64(161), int64(3), object(1)
memory usage: 34.9+ MB


In [31]:
%%time

# Washington ZIP codes start with 98 or 99. The test is done on the prefix:
# a substring test would also match codes that merely contain the digits,
# and testing only "98" would drop the Washington ZIP codes starting with 99.
WA_ZIP_PREFIXES = ("98", "99")
# NOTE(review): per the IRS SOI documentation, ZIPCODE 99999 is a catch-all
# row (and 0 the state total), not a real ZIP code -- confirm for this year.
CATCH_ALL_ZIP = "99999"

for index, row in salaries.iterrows():
    # Exclude non Washington data
    if row['STATE'] != "WA":
        continue

    zip_text = str(row['ZIPCODE'])
    if not zip_text.startswith(WA_ZIP_PREFIXES) or zip_text == CATCH_ALL_ZIP:
        continue

    # Rows with a zero or missing denominator cannot yield a ratio.
    people = float(row['N2'])
    if not people > 0:
        print(f"No valid N2 for ZIP '{zip_text}', skipped")
        continue

    Zipcode = URIRef(ECO[zip_text])

    # Ratio of columns A00100 and N2, stored under hasAgi.
    # NOTE(review): presumably adjusted gross income per individual, in the
    # unit used by the IRS file -- confirm against the documentation guide.
    agi = float(row['A00100']) / people

    graph.add((Zipcode, ECO['hasAgi'], Literal(agi, datatype=XSD.float)))

CPU times: total: 531 ms
Wall time: 526 ms


In [32]:
%%time
# Save the income graph to disk in the Turtle format.
# The encoding is set explicitly: Turtle documents are defined as UTF-8, while
# open() would otherwise use the platform default encoding.
print("--- saving serialization ---")
with open(OUTPUT_FOLDER + 'agi.ttl', 'w', encoding='utf-8') as file:
    file.write(graph.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 15.6 ms
Wall time: 9.02 ms
