# Data cleaning

### Importing packages

In [1]:
import os
import shutil as sh
from pathlib import Path

### Preliminary operations

In [2]:
DATA_FOLDER = "clean_data"
OUTPUT_FOLDER = "output"
STATIONS_FILE = "wa_alt_fuel_stations.csv"

# Get the absolute path
path = str(Path(os.path.abspath(os.getcwd())).parent.absolute())
try:
    # Remove the existing folder
    sh.rmtree(DATA_FOLDER)
    sh.rmtree(OUTPUT_FOLDER)
except FileNotFoundError:
    print("--- No folder to remove ---")

# Create new folder for clean data
os.mkdir(DATA_FOLDER)
print(f"Folder '{DATA_FOLDER}' created.")
os.mkdir(OUTPUT_FOLDER)
print(f"Folder '{OUTPUT_FOLDER}' created.")

DATA_FOLDER += "\\"
OUTPUT_FOLDER += "\\"

Folder 'clean_data' created.
Folder 'output' created.


#### Correcting and filtering fuel stations file

In [3]:
file = open("alt_fuel_stations(Nov-10-2023).csv", "r", encoding="utf-8")                        # Input file
wa_fuel_stations = wa_e_stations = open(DATA_FOLDER + STATIONS_FILE, "w", encoding="utf-8")     # Output file

# Write CSV headers
wa_fuel_stations.write(file.readline())

row = file.readline()               # Read first line
while (row2 := file.readline()):
    row2_error = False              

    # If the row is interrupted, recover it (there can be multiple interruption)
    while("ELEC" not in row2):
        row2_error = True
        index = row2.find('",')                                     # Find the end of last interrupted string, if exists
        row = row.strip() + row2[index if index != -1 else 0 : ]    # Concatenate the row begin with the second part
        row2 = file.readline()

    if ",WA," in row: wa_e_stations.write(row)
    row = row2                                                      # Check on next cycle

file.close()
wa_e_stations.close()

# RDF creation

### Importing packages

In [4]:
import pandas as pd
from rdflib import Graph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import XSD

## Washington electrical stations

In [5]:
stations = pd.read_csv(DATA_FOLDER + STATIONS_FILE, sep=",")

ECO = Namespace("http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#")

graph = Graph()
graph.bind("elec", ECO)

In [10]:
%%time

for index, row in stations.iterrows():
    Station = URIRef(ECO[str(index)])     # Create node (prefix + id)

    # Triples
    graph.add((Station, RDF.type, ECO.Station))
    graph.add((Station, ECO['hasName'], Literal(row['Station Name'], datatype=XSD.string)))
    
    if " " in row['ZIP']: 
        print(f"Error in ZIP '{row['ZIP']}', skipped")
        continue

    ZipCode = URIRef(ECO[row['ZIP']])
    graph.add((Station, ECO['locatedIn'], ZipCode))


0 http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#0 http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#0
1 http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#1 http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#1
2 http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#2 http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#2
3 http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#3 http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#3
4 http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#4 http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#4
5 http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#5 http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#5
6 http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#6 http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricCars#6
7 http://www.dei.unipd.it/~poor6/db2/ontologies/2023/electricC

In [11]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(OUTPUT_FOLDER + 'stations.ttl', 'w') as file:
    file.write(graph.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 125 ms
Wall time: 136 ms


## Cars

In [8]:
cars = pd.read_csv(DATA_FOLDER + STATIONS_FILE, sep=",")
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2203 entries, 0 to 2202
Data columns (total 74 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Fuel Type Code                           2203 non-null   object 
 1   Station Name                             2203 non-null   object 
 2   Street Address                           2203 non-null   object 
 3   Intersection Directions                  94 non-null     object 
 4   City                                     2203 non-null   object 
 5   State                                    2203 non-null   object 
 6   ZIP                                      2203 non-null   object 
 7   Plus4                                    0 non-null      float64
 8   Station Phone                            2186 non-null   object 
 9   Status Code                              2203 non-null   object 
 10  Expected Date                            201 non

In [9]:
# Construct the country and the movie ontology namespaces not known by RDFlib

#create the graph
graph = Graph()

# Bind the namespaces to a prefix for more readable output
graph.bind("xsd", XSD)
graph.bind("countries", CNS)
graph.bind("eco", ECO)

NameError: name 'CNS' is not defined

In [None]:
%%time

