# Data cleaning

### Importing packages

In [1]:
import os
import shutil as sh
from pathlib import Path

### Preliminary operations

In [2]:
# --- Configuration -----------------------------------------------------------
# Working folders (recreated from scratch on every run) and input file names.
DATA_FOLDER = "clean_data"
OUTPUT_FOLDER = "output"
STATIONS_FILE = "wa_alt_fuel_stations.csv"
CARS_FILE = "Electric_Vehicle_Population_Data.csv"
WAGE_FILE = "20zpallnoagi.csv"

# Absolute path of the parent of the current working directory
path = str(Path.cwd().parent)

# Remove each folder independently: with a single shared try-block a missing
# first folder would skip the removal of the second one, and the os.mkdir()
# below would then fail with FileExistsError.
for folder in (DATA_FOLDER, OUTPUT_FOLDER):
    try:
        sh.rmtree(folder)
    except FileNotFoundError:
        print("--- No folder to remove ---")

# Create the (now guaranteed absent) folders again
for folder in (DATA_FOLDER, OUTPUT_FOLDER):
    os.mkdir(folder)
    print(f"Folder '{folder}' created.")

# From here on the two constants are used as path prefixes
# (DATA_FOLDER + file_name), so append the platform's separator rather than a
# hard-coded backslash, which is only a separator on Windows.
DATA_FOLDER += os.sep
OUTPUT_FOLDER += os.sep

Folder 'clean_data' created.
Folder 'output' created.


#### Correcting and filtering fuel stations file

In [3]:
# Repair records that were split over several physical lines and keep only the
# Washington ones.
#   input : raw station export
#   output: DATA_FOLDER + STATIONS_FILE
# The context managers close both files even if an error occurs midway.
with open("alt_fuel_stations(Nov-10-2023).csv", "r", encoding="utf-8") as file, \
     open(DATA_FOLDER + STATIONS_FILE, "w", encoding="utf-8") as wa_e_stations:

    # Write CSV headers
    wa_e_stations.write(file.readline())

    row = file.readline()               # Record currently being assembled
    while row:
        row2 = file.readline()          # Look-ahead line ("" at end of file)

        # A look-ahead line that does not contain "ELEC" is treated as the
        # continuation of an interrupted record (there can be several in a row).
        # The `row2 and` guard stops at end of file; without it a file ending
        # on a continuation line would loop forever.
        while row2 and "ELEC" not in row2:
            index = row2.find('",')                                   # End of the interrupted quoted string, if any
            row = row.strip() + row2[index if index != -1 else 0:]    # Join the record start with the remaining part
            row2 = file.readline()

        # The check runs for every assembled record, including the last one of
        # the file (which has no look-ahead line after it).
        if ",WA," in row:
            wa_e_stations.write(row)
        row = row2                                                    # Continue with the look-ahead line

# RDF creation

### Importing packages

In [4]:
import pandas as pd
from urllib.parse import quote
from rdflib import Graph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import XSD

## ZIP codes, cities, counties

In [5]:
# Namespace under which every resource and property of the ontology is minted
ECO = Namespace("http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#")

# ZIP code / city / county lookup table (comma-separated, pandas' default)
places = pd.read_csv("wa_zips_cities_counties.csv")

# Empty graph for the location triples, with the "elec" prefix bound to ECO
graph = Graph()
graph.bind("elec", ECO)

In [6]:
%%time

# For every row of the lookup table: declare the ZIP, city and county nodes and
# chain them ZIP -> city -> county.
for row_label, place in places.iterrows():
    # City and county names are percent-encoded so they can be used in an IRI
    zip_node = ECO[str(place['Zipcode'])]
    city_node = ECO[quote(str(place['City']))]
    county_node = ECO[quote(str(place['County']))]

    # Class membership
    graph.add((zip_node, RDF.type, ECO.ZIP))
    graph.add((city_node, RDF.type, ECO.City))
    graph.add((county_node, RDF.type, ECO.County))

    # Containment links
    graph.add((zip_node, ECO.ofCity, city_node))
    graph.add((city_node, ECO.belongsTo, county_node))

CPU times: user 144 ms, sys: 587 µs, total: 144 ms
Wall time: 145 ms


In [7]:
%%time
# Save the location graph in Turtle format
print("--- saving serialization ---")
# Turtle documents are UTF-8; state the encoding explicitly so the result does
# not depend on the platform's default text encoding.
with open(OUTPUT_FOLDER + 'locations.ttl', 'w', encoding='utf-8') as file:
    file.write(graph.serialize(format='turtle'))

--- saving serialization ---
CPU times: user 133 ms, sys: 0 ns, total: 133 ms
Wall time: 139 ms


## Washington electric charging stations

In [8]:
# Load the cleaned Washington stations file.
# ZIP is read as text on purpose: the values are identifiers, not numbers, and
# the loop that consumes this frame tests `" " in row['ZIP']`, which only works
# on strings. Without an explicit dtype the column type depends on inference.
stations = pd.read_csv(DATA_FOLDER + STATIONS_FILE, sep=",", dtype={"ZIP": str})

# Empty graph for the station triples, with the "elec" prefix bound to ECO
graph = Graph()
graph.bind("elec", ECO)

In [9]:
%%time

# One node per station, identified by its row label in the `stations` frame.
for station_idx, record in stations.iterrows():
    station_node = ECO[str(station_idx)]     # prefix + row label

    # Type and name are recorded for every station, even when the ZIP below
    # turns out to be unusable.
    graph.add((station_node, RDF.type, ECO.Station))
    station_name = Literal(record['Station Name'], datatype=XSD.string)
    graph.add((station_node, ECO.hasName, station_name))

    # A ZIP containing a blank is reported and the station gets no location link
    zip_value = record['ZIP']
    if " " not in zip_value:
        graph.add((station_node, ECO.locatedIn, ECO[zip_value]))
    else:
        print(f"Error in ZIP '{zip_value}', skipped")


Error in ZIP 'G9N 0', skipped
CPU times: user 404 ms, sys: 4.18 ms, total: 408 ms
Wall time: 406 ms


In [10]:
%%time
# Save the station graph in Turtle format
print("--- saving serialization ---")
# Turtle documents are UTF-8; station names are free text, so state the
# encoding explicitly instead of relying on the platform's default.
with open(OUTPUT_FOLDER + 'stations.ttl', 'w', encoding='utf-8') as file:
    file.write(graph.serialize(format='turtle'))

--- saving serialization ---
CPU times: user 345 ms, sys: 183 µs, total: 345 ms
Wall time: 343 ms


## Cars

In [11]:
# Load the vehicle registrations (comma-separated, pandas' default) and print
# a column / dtype overview.
cars = pd.read_csv(CARS_FILE)
cars.info()

# Start again from an empty graph so each section keeps its own working space
graph = Graph()
graph.bind("elec", ECO)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150482 entries, 0 to 150481
Data columns (total 17 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   VIN (1-10)                                         150482 non-null  object 
 1   County                                             150479 non-null  object 
 2   City                                               150479 non-null  object 
 3   State                                              150482 non-null  object 
 4   Postal Code                                        150479 non-null  float64
 5   Model Year                                         150482 non-null  int64  
 6   Make                                               150482 non-null  object 
 7   Model                                              150482 non-null  object 
 8   Electric Vehicle Type                              150482 non-null  object

In [12]:
%%time


# Model and maker names are percent-encoded with urllib.parse.quote before
# being used in an IRI (urllib.parse.unquote gives the original text back).
#
# Node chain built for every vehicle:
#   Car --hasModelYear--> ModelYear --ofModel--> Model --madeBy--> Maker
for row_label, vehicle in cars.iterrows():
    car_node = ECO[str(vehicle['DOL Vehicle ID'])]     # prefix + vehicle id

    model_slug = quote(vehicle['Model'])
    # Encoded model name followed by the year, e.g. 'Panda' + 2012 => Panda2012
    model_year_node = ECO[model_slug + str(vehicle['Model Year'])]

    # Triples specific to this single vehicle
    graph.add((car_node, RDF.type, ECO.Car))
    electric_range = Literal(vehicle['Electric Range'], datatype=XSD.integer)
    graph.add((car_node, ECO.hasRange, electric_range))
    graph.add((car_node, ECO.hasModelYear, model_year_node))

    # Triples shared by all vehicles of the same model and year
    graph.add((model_year_node, RDF.type, ECO.ModelYear))
    year_literal = Literal(vehicle['Model Year'], datatype=XSD.gYear)
    graph.add((model_year_node, ECO.year, year_literal))
    msrp_literal = Literal(vehicle['Base MSRP'], datatype=XSD.integer)
    graph.add((model_year_node, ECO.hasMSRP, msrp_literal))

    # Model node (uses the encoded model name)
    model_node = ECO[model_slug]
    graph.add((model_year_node, ECO.ofModel, model_node))
    graph.add((model_node, RDF.type, ECO.Model))

    # Maker node (encoded as well)
    maker_node = ECO[quote(vehicle['Make'])]
    graph.add((model_node, ECO.madeBy, maker_node))
    graph.add((maker_node, RDF.type, ECO.Maker))
        
    

CPU times: user 49.7 s, sys: 411 ms, total: 50.1 s
Wall time: 50.1 s


In [13]:
%%time
# Save the car graph in Turtle format
print("--- saving serialization ---")
# Turtle documents are UTF-8; state the encoding explicitly so the result does
# not depend on the platform's default text encoding.
with open(OUTPUT_FOLDER + 'cars.ttl', 'w', encoding='utf-8') as file:
    file.write(graph.serialize(format='turtle'))


--- saving serialization ---
CPU times: user 23.6 s, sys: 80 ms, total: 23.7 s
Wall time: 23.8 s


## Adjusted gross income (AGI) per ZIP code

In [14]:
# Load the per-ZIP income table (comma-separated, pandas' default) and print
# a short overview of it.
salaries = pd.read_csv(WAGE_FILE)
salaries.info()

# Start again from an empty graph so each section keeps its own working space
graph = Graph()
graph.bind("elec", ECO)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27744 entries, 0 to 27743
Columns: 165 entries, STATEFIPS to A12000
dtypes: float64(161), int64(3), object(1)
memory usage: 34.9+ MB


In [15]:
%%time

# ZIPCODE values that are not real ZIP codes.
# NOTE(review): 0 and 99999 look like the per-state "total" and "other"
# buckets of the IRS ZIP code data - confirm against the dataset documentation.
# Both values were already excluded by the previous '98' substring filter.
NON_ZIP_VALUES = {0, 99999}

for index, row in salaries.iterrows():
    # Exclude non Washington data (exact match rather than substring test)
    if row['STATE'] != "WA":
        continue

    # Washington ZIP codes span 980xx-994xx, so filtering on the substring
    # '98' silently dropped most 99xxx codes; only the placeholder values are
    # excluded here.
    zipcode = int(row['ZIPCODE'])
    if zipcode in NON_ZIP_VALUES:
        continue

    # Guard the division: skip rows whose N2 is zero, negative or missing
    individuals = float(row['N2'])
    if not individuals > 0:
        print(f"No usable N2 value for ZIP '{zipcode}', skipped")
        continue

    Zipcode = URIRef(ECO[str(zipcode)])

    # A00100 divided by N2 (presumably AGI per person - verify column meanings)
    agi = float(row['A00100']) / individuals

    graph.add((Zipcode, ECO['hasAgi'], Literal(agi, datatype=XSD.float)))

CPU times: user 1.99 s, sys: 76 ms, total: 2.07 s
Wall time: 2.06 s


In [16]:
%%time
# Save the AGI graph in Turtle format
print("--- saving serialization ---")
# Turtle documents are UTF-8; state the encoding explicitly so the result does
# not depend on the platform's default text encoding.
with open(OUTPUT_FOLDER + 'agi.ttl', 'w', encoding='utf-8') as file:
    file.write(graph.serialize(format='turtle'))

--- saving serialization ---
CPU times: user 38.4 ms, sys: 4.03 ms, total: 42.4 ms
Wall time: 41.7 ms
