In [2]:
!python --version

Python 3.11.5


In [3]:
import pandas as pd

In [4]:
# This notebook reads a file about data about countries, removes the unnecessary information, and puts the rest into RDF triples

1. read file
2. put the data in a dictionary with column names as keys and list as values
3. clean the data
4. put the data in a pandas dataframe
5. export the dataframe to csv
6. validate the csv
7.  read the validated csv with pandas
8. create OWL axioms with f-strings in python
9. export the axioms to a txt file
10. import the txt file into Protege

https://www.geeksforgeeks.org/reading-writing-text-files-python/

In [5]:
# Location of the file
filepath:str = r"C:\Users\inser\OneDrive\Industry\OntologyExercises\CountryInfo.txt"
# Open file, read every line, and put the content in a list of rows
with open(filepath) as CountryInfo_handle:
   lines = CountryInfo_handle.readlines()

In [6]:
# Checks the first two lines of the generated list
lines[:2]

['ISO\tISO3\tISO-Numeric\tfips\tCountry\tCapital\tArea(in sq km)\tPopulation\tContinent\ttld\tCurrencyCode\tCurrencyName\tPhone\tPostal Code Format\tPostal Code Regex\tLanguages\tgeonameid\tneighbours\tEquivalentFipsCode\n',
 'AD\tAND\t020\tAN\tAndorra\tAndorra la Vella\t468\t77006\tEU\t.ad\tEUR\tEuro\t376\tAD###\t^(?:AD)*(\\d{3})$\tca\t3041565\tES,FR\t\n']

In [7]:
# Creates an empty dictionary that we can fill with keys
countryInfoDictionary:dict = {}

In [8]:
# Splits the first row of the "lines" list into multiple strings, using list comprehension
columns:list = [column.strip() for column in lines[0].split("\t")]

In [9]:
# Checks the entries in the columns list
columns

['ISO',
 'ISO3',
 'ISO-Numeric',
 'fips',
 'Country',
 'Capital',
 'Area(in sq km)',
 'Population',
 'Continent',
 'tld',
 'CurrencyCode',
 'CurrencyName',
 'Phone',
 'Postal Code Format',
 'Postal Code Regex',
 'Languages',
 'geonameid',
 'neighbours',
 'EquivalentFipsCode']

In [10]:
# Adds the entries in the columns list as keys for the dictionary we created
countryInfoDictionary = {column:[] for column in columns }
countryInfoDictionary

{'ISO': [],
 'ISO3': [],
 'ISO-Numeric': [],
 'fips': [],
 'Country': [],
 'Capital': [],
 'Area(in sq km)': [],
 'Population': [],
 'Continent': [],
 'tld': [],
 'CurrencyCode': [],
 'CurrencyName': [],
 'Phone': [],
 'Postal Code Format': [],
 'Postal Code Regex': [],
 'Languages': [],
 'geonameid': [],
 'neighbours': [],
 'EquivalentFipsCode': []}

In [11]:
# Prints the entries in the list generated earlier for the first two rows after the column headers, splitting at the tab
for line in lines[1:2]:
    values:list = [value.strip() for value in line.split("\t")]
    print(values)

['AD', 'AND', '020', 'AN', 'Andorra', 'Andorra la Vella', '468', '77006', 'EU', '.ad', 'EUR', 'Euro', '376', 'AD###', '^(?:AD)*(\\d{3})$', 'ca', '3041565', 'ES,FR', '']


In [12]:
# Adds the entries in the list as values to the dictionary, ignoring the tab
for line in lines[1:]:
    values:list = [value.strip() for value in line.split("\t")]
    for column, value in zip(countryInfoDictionary, values):        
        countryInfoDictionary[column].append(value)
print(countryInfoDictionary)

{'ISO': ['AD', 'AE', 'AF', 'AG', 'AI', 'AL', 'AM', 'AO', 'AQ', 'AR', 'AS', 'AT', 'AU', 'AW', 'AX', 'AZ', 'BA', 'BB', 'BD', 'BE', 'BF', 'BG', 'BH', 'BI', 'BJ', 'BL', 'BM', 'BN', 'BO', 'BQ', 'BR', 'BS', 'BT', 'BV', 'BW', 'BY', 'BZ', 'CA', 'CC', 'CD', 'CF', 'CG', 'CH', 'CI', 'CK', 'CL', 'CM', 'CN', 'CO', 'CR', 'CU', 'CV', 'CW', 'CX', 'CY', 'CZ', 'DE', 'DJ', 'DK', 'DM', 'DO', 'DZ', 'EC', 'EE', 'EG', 'EH', 'ER', 'ES', 'ET', 'FI', 'FJ', 'FK', 'FM', 'FO', 'FR', 'GA', 'GB', 'GD', 'GE', 'GF', 'GG', 'GH', 'GI', 'GL', 'GM', 'GN', 'GP', 'GQ', 'GR', 'GS', 'GT', 'GU', 'GW', 'GY', 'HK', 'HM', 'HN', 'HR', 'HT', 'HU', 'ID', 'IE', 'IL', 'IM', 'IN', 'IO', 'IQ', 'IR', 'IS', 'IT', 'JE', 'JM', 'JO', 'JP', 'KE', 'KG', 'KH', 'KI', 'KM', 'KN', 'KP', 'KR', 'XK', 'KW', 'KY', 'KZ', 'LA', 'LB', 'LC', 'LI', 'LK', 'LR', 'LS', 'LT', 'LU', 'LV', 'LY', 'MA', 'MC', 'MD', 'ME', 'MF', 'MG', 'MH', 'MK', 'ML', 'MM', 'MN', 'MO', 'MP', 'MQ', 'MR', 'MS', 'MT', 'MU', 'MV', 'MW', 'MX', 'MY', 'MZ', 'NA', 'NC', 'NE', 'NF', 'NG', '

In [13]:
# Adds the information from the dictionary to a pandas dataframe
info:pd.DataFrame = pd.DataFrame(countryInfoDictionary)

In [14]:
# Prints the first five rows of information in the pandas dataframe
info[:5]

Unnamed: 0,ISO,ISO3,ISO-Numeric,fips,Country,Capital,Area(in sq km),Population,Continent,tld,CurrencyCode,CurrencyName,Phone,Postal Code Format,Postal Code Regex,Languages,geonameid,neighbours,EquivalentFipsCode
0,AD,AND,20,AN,Andorra,Andorra la Vella,468,77006,EU,.ad,EUR,Euro,376,AD###,^(?:AD)*(\d{3})$,ca,3041565,"ES,FR",
1,AE,ARE,784,AE,United Arab Emirates,Abu Dhabi,82880,9630959,AS,.ae,AED,Dirham,971,,,"ar-AE,fa,en,hi,ur",290557,"SA,OM",
2,AF,AFG,4,AF,Afghanistan,Kabul,647500,37172386,AS,.af,AFN,Afghani,93,,,"fa-AF,ps,uz-AF,tk",1149361,"TM,CN,IR,TJ,PK,UZ",
3,AG,ATG,28,AC,Antigua and Barbuda,St. John's,443,96286,,.ag,XCD,Dollar,+1-268,,,en-AG,3576396,,
4,AI,AIA,660,AV,Anguilla,The Valley,102,13254,,.ai,XCD,Dollar,+1-264,,,en-AI,3573511,,


In [15]:
output_file:str = "output.csv"
info.to_csv(output_file, index=False)

In [18]:
namespace:str = "urn:absolute:CountryInfoOntology#"
triples:list = []
for i in info.index:
    iso2code:str = str(info.at[i, "ISO"])
    country:str = str(info.at[i, "Country"])
    triple_iso2code:str = f"<{namespace}{country}> <{namespace}hasISO2Code> <{namespace}{iso2code}> ; a <{namespace}{country}> ."
    triples.append(triple_iso2code)

In [19]:
triples

['<urn:absolute:CountryInfoOntology#Andorra> <urn:absolute:CountryInfoOntology#hasISO2Code> <urn:absolute:CountryInfoOntology#AD> ; a <urn:absolute:CountryInfoOntology#Andorra> .',
 '<urn:absolute:CountryInfoOntology#United Arab Emirates> <urn:absolute:CountryInfoOntology#hasISO2Code> <urn:absolute:CountryInfoOntology#AE> ; a <urn:absolute:CountryInfoOntology#United Arab Emirates> .',
 '<urn:absolute:CountryInfoOntology#Afghanistan> <urn:absolute:CountryInfoOntology#hasISO2Code> <urn:absolute:CountryInfoOntology#AF> ; a <urn:absolute:CountryInfoOntology#Afghanistan> .',
 '<urn:absolute:CountryInfoOntology#Antigua and Barbuda> <urn:absolute:CountryInfoOntology#hasISO2Code> <urn:absolute:CountryInfoOntology#AG> ; a <urn:absolute:CountryInfoOntology#Antigua and Barbuda> .',
 '<urn:absolute:CountryInfoOntology#Anguilla> <urn:absolute:CountryInfoOntology#hasISO2Code> <urn:absolute:CountryInfoOntology#AI> ; a <urn:absolute:CountryInfoOntology#Anguilla> .',
 '<urn:absolute:CountryInfoOntology

In [20]:
with open("triples.txt", "w") as triples_file:
    triples_file.writelines(triples)