In [1]:
# Prints the version of the `python` executable found by the shell
# NOTE(review): this is not necessarily the interpreter the kernel itself runs on - confirm if the version matters
!python --version

Python 3.11.5


In [2]:
import pandas as pd

In [3]:
# This notebook reads a file of country data, removes the unnecessary information, and converts the rest into RDF triples

1. read file
2. put the data in a dictionary with column names as keys and list as values
3. clean the data
4. put the data in a pandas dataframe
5. export the dataframe to csv
6. validate the csv
7. read the validated csv with pandas
8. create OWL axioms with f-strings in python
9. export the axioms to a txt file
10. import the txt file into Protege

In [4]:
# Location of the file
# NOTE(review): this is an absolute path on one user's machine; the notebook will not run elsewhere without editing it
filepath:str = r"C:\Users\inser\OneDrive\Industry\OntologyExercises\CountryInfo.txt"
# Open file, read every line, and put the content in a list of rows.
# The encoding is given explicitly so the result does not depend on the platform's locale encoding
# (assumes the file is UTF-8 - TODO confirm against the source of the file)
with open(filepath, encoding="utf-8") as CountryInfo_handle:
    lines = CountryInfo_handle.readlines()

In [5]:
# Shows the header row and the first data row to confirm the file was read as expected
lines[0:2]

['ISO\tISO3\tISO-Numeric\tfips\tCountry\tCapital\tArea(in sq km)\tPopulation\tContinent\ttld\tCurrencyCode\tCurrencyName\tPhone\tPostal Code Format\tPostal Code Regex\tLanguages\tgeonameid\tneighbours\tEquivalentFipsCode\n',
 'AD\tAND\t020\tAN\tAndorra\tAndorra la Vella\t468\t77006\tEU\t.ad\tEUR\tEuro\t376\tAD###\t^(?:AD)*(\\d{3})$\tca\t3041565\tES,FR\t\n']

In [6]:
# Starts with an empty dictionary; the column names are added as keys further down
countryInfoDictionary: dict = dict()

In [7]:
# Splits the header row at every tab and trims surrounding whitespace (including the trailing newline) from each name
columns: list = list(map(str.strip, lines[0].split("\t")))

In [8]:
# Displays the trimmed column names to confirm that the header row was split correctly
columns

['ISO',
 'ISO3',
 'ISO-Numeric',
 'fips',
 'Country',
 'Capital',
 'Area(in sq km)',
 'Population',
 'Continent',
 'tld',
 'CurrencyCode',
 'CurrencyName',
 'Phone',
 'Postal Code Format',
 'Postal Code Regex',
 'Languages',
 'geonameid',
 'neighbours',
 'EquivalentFipsCode']

In [9]:
# Uses every column name as a key, each with its own (separate) empty list to collect that column's values
countryInfoDictionary = dict((column, []) for column in columns)
countryInfoDictionary

{'ISO': [],
 'ISO3': [],
 'ISO-Numeric': [],
 'fips': [],
 'Country': [],
 'Capital': [],
 'Area(in sq km)': [],
 'Population': [],
 'Continent': [],
 'tld': [],
 'CurrencyCode': [],
 'CurrencyName': [],
 'Phone': [],
 'Postal Code Format': [],
 'Postal Code Regex': [],
 'Languages': [],
 'geonameid': [],
 'neighbours': [],
 'EquivalentFipsCode': []}

In [10]:
# Prints the values of the first row after the column headers (lines[1:2] holds exactly one row),
# splitting at the tab and trimming whitespace from every value
for line in lines[1:2]:
    values:list = [value.strip() for value in line.split("\t")]
    print(values)

['AD', 'AND', '020', 'AN', 'Andorra', 'Andorra la Vella', '468', '77006', 'EU', '.ad', 'EUR', 'Euro', '376', 'AD###', '^(?:AD)*(\\d{3})$', 'ca', '3041565', 'ES,FR', '']


In [11]:
# Adds the entries of every data row (everything after the header) as values to the dictionary.
# The lists are rebuilt from scratch first, so re-running this cell does not append the same rows twice.
countryInfoDictionary = {column: [] for column in columns}
for line in lines[1:]:
    # A blank line (for example at the end of the file) carries no data
    if not line.strip():
        continue
    values:list = [value.strip() for value in line.split("\t")]
    # Pads short rows with empty strings so every column list keeps the same length;
    # zip() would otherwise stop early and leave ragged columns that pd.DataFrame() rejects
    values += [""] * (len(columns) - len(values))
    for column, value in zip(countryInfoDictionary, values):
        countryInfoDictionary[column].append(value)
print(countryInfoDictionary)

{'ISO': ['AD', 'AE', 'AF', 'AG', 'AI', 'AL', 'AM', 'AO', 'AQ', 'AR', 'AS', 'AT', 'AU', 'AW', 'AX', 'AZ', 'BA', 'BB', 'BD', 'BE', 'BF', 'BG', 'BH', 'BI', 'BJ', 'BL', 'BM', 'BN', 'BO', 'BQ', 'BR', 'BS', 'BT', 'BV', 'BW', 'BY', 'BZ', 'CA', 'CC', 'CD', 'CF', 'CG', 'CH', 'CI', 'CK', 'CL', 'CM', 'CN', 'CO', 'CR', 'CU', 'CV', 'CW', 'CX', 'CY', 'CZ', 'DE', 'DJ', 'DK', 'DM', 'DO', 'DZ', 'EC', 'EE', 'EG', 'EH', 'ER', 'ES', 'ET', 'FI', 'FJ', 'FK', 'FM', 'FO', 'FR', 'GA', 'GB', 'GD', 'GE', 'GF', 'GG', 'GH', 'GI', 'GL', 'GM', 'GN', 'GP', 'GQ', 'GR', 'GS', 'GT', 'GU', 'GW', 'GY', 'HK', 'HM', 'HN', 'HR', 'HT', 'HU', 'ID', 'IE', 'IL', 'IM', 'IN', 'IO', 'IQ', 'IR', 'IS', 'IT', 'JE', 'JM', 'JO', 'JP', 'KE', 'KG', 'KH', 'KI', 'KM', 'KN', 'KP', 'KR', 'XK', 'KW', 'KY', 'KZ', 'LA', 'LB', 'LC', 'LI', 'LK', 'LR', 'LS', 'LT', 'LU', 'LV', 'LY', 'MA', 'MC', 'MD', 'ME', 'MF', 'MG', 'MH', 'MK', 'ML', 'MM', 'MN', 'MO', 'MP', 'MQ', 'MR', 'MS', 'MT', 'MU', 'MV', 'MW', 'MX', 'MY', 'MZ', 'NA', 'NC', 'NE', 'NF', 'NG', '

In [12]:
# Builds the pandas dataframe: every dictionary key becomes a column, every list its values
info: pd.DataFrame = pd.DataFrame(data=countryInfoDictionary)

In [13]:
# Shows the first five rows of the dataframe
info.head(5)

Unnamed: 0,ISO,ISO3,ISO-Numeric,fips,Country,Capital,Area(in sq km),Population,Continent,tld,CurrencyCode,CurrencyName,Phone,Postal Code Format,Postal Code Regex,Languages,geonameid,neighbours,EquivalentFipsCode
0,AD,AND,20,AN,Andorra,Andorra la Vella,468,77006,EU,.ad,EUR,Euro,376,AD###,^(?:AD)*(\d{3})$,ca,3041565,"ES,FR",
1,AE,ARE,784,AE,United Arab Emirates,Abu Dhabi,82880,9630959,AS,.ae,AED,Dirham,971,,,"ar-AE,fa,en,hi,ur",290557,"SA,OM",
2,AF,AFG,4,AF,Afghanistan,Kabul,647500,37172386,AS,.af,AFN,Afghani,93,,,"fa-AF,ps,uz-AF,tk",1149361,"TM,CN,IR,TJ,PK,UZ",
3,AG,ATG,28,AC,Antigua and Barbuda,St. John's,443,96286,,.ag,XCD,Dollar,+1-268,,,en-AG,3576396,,
4,AI,AIA,660,AV,Anguilla,The Valley,102,13254,,.ai,XCD,Dollar,+1-264,,,en-AI,3573511,,


In [14]:
# Writes the dataframe to a CSV file, leaving out the row index
output_file: str = "output.csv"
info.to_csv(path_or_buf=output_file, index=False)

In [15]:
# Builds, for every row, a statement that links the country to its ISO2 code and types it as a Country.
# Spaces are removed from the values so they can be used as the local part of an IRI.
namespace: str = "urn:absolute:CountryInfoOntology#"
countryISO2triples: list = []
for raw_iso2, raw_country in zip(info["ISO"], info["Country"]):
    iso2code: str = str(raw_iso2).strip().replace(" ", "")
    country: str = str(raw_country).strip().replace(" ", "")
    # The second line of the statement is indented inside the literal, exactly as it is written to the file
    countryISO2triples.append(f"""<{namespace}{country}> <{namespace}hasISO2Code> <{namespace}{iso2code}> ;
                                a <{namespace}Country> .\n""")

In [16]:
# Prints the first five statements, separated by a single space
print(" ".join(countryISO2triples[:5]))

<urn:absolute:CountryInfoOntology#Andorra> <urn:absolute:CountryInfoOntology#hasISO2Code> <urn:absolute:CountryInfoOntology#AD> ;
                                a <urn:absolute:CountryInfoOntology#Country> .
 <urn:absolute:CountryInfoOntology#UnitedArabEmirates> <urn:absolute:CountryInfoOntology#hasISO2Code> <urn:absolute:CountryInfoOntology#AE> ;
                                a <urn:absolute:CountryInfoOntology#Country> .
 <urn:absolute:CountryInfoOntology#Afghanistan> <urn:absolute:CountryInfoOntology#hasISO2Code> <urn:absolute:CountryInfoOntology#AF> ;
                                a <urn:absolute:CountryInfoOntology#Country> .
 <urn:absolute:CountryInfoOntology#AntiguaandBarbuda> <urn:absolute:CountryInfoOntology#hasISO2Code> <urn:absolute:CountryInfoOntology#AG> ;
                                a <urn:absolute:CountryInfoOntology#Country> .
 <urn:absolute:CountryInfoOntology#Anguilla> <urn:absolute:CountryInfoOntology#hasISO2Code> <urn:absolute:CountryInfoOntology#AI> ;
    

In [17]:
# Creates a text file with the triples included.
# The encoding is set explicitly: the statements use Turtle syntax, which is defined as UTF-8,
# whereas open() would otherwise use the platform's locale encoding.
with open("countryISO2triples.txt", "w", encoding="utf-8") as countryISO2triples_file:
    countryISO2triples_file.writelines(countryISO2triples)

In [18]:
# Builds one hasISO3Code triple per row, linking the country to its ISO3 code.
# Spaces are removed from both values so they can be used as the local part of an IRI.
ISO3triples: list = []
for raw_country, raw_iso3 in zip(info["Country"], info["ISO3"]):
    iso3code: str = str(raw_iso3).strip().replace(" ", "")
    country: str = str(raw_country).strip().replace(" ", "")
    ISO3triples.append(f"<{namespace}{country}> <{namespace}hasISO3Code> <{namespace}{iso3code}> .\n")
# Shows the first five triples as a quick check
print(" ".join(ISO3triples[:5]))

<urn:absolute:CountryInfoOntology#Andorra> <urn:absolute:CountryInfoOntology#hasISO3Code> <urn:absolute:CountryInfoOntology#AND> .
 <urn:absolute:CountryInfoOntology#UnitedArabEmirates> <urn:absolute:CountryInfoOntology#hasISO3Code> <urn:absolute:CountryInfoOntology#ARE> .
 <urn:absolute:CountryInfoOntology#Afghanistan> <urn:absolute:CountryInfoOntology#hasISO3Code> <urn:absolute:CountryInfoOntology#AFG> .
 <urn:absolute:CountryInfoOntology#AntiguaandBarbuda> <urn:absolute:CountryInfoOntology#hasISO3Code> <urn:absolute:CountryInfoOntology#ATG> .
 <urn:absolute:CountryInfoOntology#Anguilla> <urn:absolute:CountryInfoOntology#hasISO3Code> <urn:absolute:CountryInfoOntology#AIA> .



In [19]:
# Creates a text file with the new triples included.
# The encoding is set explicitly: the statements use Turtle syntax, which is defined as UTF-8,
# whereas open() would otherwise use the platform's locale encoding.
with open("ISO3triples.txt", "w", encoding="utf-8") as ISO3triples_file:
    ISO3triples_file.writelines(ISO3triples)

In [20]:
# Creates another list of OWL triples linking each country to its capital.
# Rows without a capital are skipped: an empty value would otherwise produce a triple whose
# object is the bare namespace IRI (<urn:absolute:CountryInfoOntology#>) instead of a capital.
countrycapitaltriples:list = []
for i in info.index:
    capital:str = str(info.at[i, "Capital"]).strip().replace(" ", "")
    if not capital:
        continue
    country:str = str(info.at[i, "Country"]).strip().replace(" ", "")
    triple_capital:str = f"""<{namespace}{country}> <{namespace}hasCapital> <{namespace}{capital}> .\n"""
    countrycapitaltriples.append(triple_capital)
print(*countrycapitaltriples[:5])

<urn:absolute:CountryInfoOntology#Andorra> <urn:absolute:CountryInfoOntology#hasCapital> <urn:absolute:CountryInfoOntology#AndorralaVella> .
 <urn:absolute:CountryInfoOntology#UnitedArabEmirates> <urn:absolute:CountryInfoOntology#hasCapital> <urn:absolute:CountryInfoOntology#AbuDhabi> .
 <urn:absolute:CountryInfoOntology#Afghanistan> <urn:absolute:CountryInfoOntology#hasCapital> <urn:absolute:CountryInfoOntology#Kabul> .
 <urn:absolute:CountryInfoOntology#AntiguaandBarbuda> <urn:absolute:CountryInfoOntology#hasCapital> <urn:absolute:CountryInfoOntology#St.John's> .
 <urn:absolute:CountryInfoOntology#Anguilla> <urn:absolute:CountryInfoOntology#hasCapital> <urn:absolute:CountryInfoOntology#TheValley> .



In [21]:
# Creates a text file with these triples included.
# The encoding is set explicitly: the statements use Turtle syntax, which is defined as UTF-8,
# whereas open() would otherwise use the platform's locale encoding.
with open("countrycapitaltriples.txt", "w", encoding="utf-8") as countrycapitaltriples_file:
    countrycapitaltriples_file.writelines(countrycapitaltriples)

In [22]:
# Builds one hasContinent triple per row, linking the country to its continent code.
# Spaces are removed from both values so they can be used as the local part of an IRI.
countrycontinenttriples: list = []
for raw_country, raw_continent in zip(info["Country"], info["Continent"]):
    country: str = str(raw_country).strip().replace(" ", "")
    continent: str = str(raw_continent).strip().replace(" ", "")
    countrycontinenttriples.append(f"<{namespace}{country}> <{namespace}hasContinent> <{namespace}{continent}> .\n")
# Shows the first five triples as a quick check
print(" ".join(countrycontinenttriples[:5]))

<urn:absolute:CountryInfoOntology#Andorra> <urn:absolute:CountryInfoOntology#hasContinent> <urn:absolute:CountryInfoOntology#EU> .
 <urn:absolute:CountryInfoOntology#UnitedArabEmirates> <urn:absolute:CountryInfoOntology#hasContinent> <urn:absolute:CountryInfoOntology#AS> .
 <urn:absolute:CountryInfoOntology#Afghanistan> <urn:absolute:CountryInfoOntology#hasContinent> <urn:absolute:CountryInfoOntology#AS> .
 <urn:absolute:CountryInfoOntology#AntiguaandBarbuda> <urn:absolute:CountryInfoOntology#hasContinent> <urn:absolute:CountryInfoOntology#NA> .
 <urn:absolute:CountryInfoOntology#Anguilla> <urn:absolute:CountryInfoOntology#hasContinent> <urn:absolute:CountryInfoOntology#NA> .



In [23]:
# Creates a text file with these triples included.
# The encoding is set explicitly: the statements use Turtle syntax, which is defined as UTF-8,
# whereas open() would otherwise use the platform's locale encoding.
with open("countrycontinenttriples.txt", "w", encoding="utf-8") as countrycontinenttriples_file:
    countrycontinenttriples_file.writelines(countrycontinenttriples)

In [24]:
# Creates another list of OWL triples linking each country to its currency and classifying each currency as a Currency.
# A currency is identified by "<name>-<code>" with the spaces removed.
triplescurrency:list = []
# Remembers every currency once, in order of first appearance (a dict keeps insertion order)
currencies_seen:dict = {}
for i in info.index:
    currency_name:str = str(info.at[i, "CurrencyName"]).strip().replace(" ", "")
    currency_code:str = str(info.at[i, "CurrencyCode"]).strip().replace(" ", "")
    # Rows without any currency information are skipped; they would otherwise produce the IRI "...#-"
    if not currency_name and not currency_code:
        continue
    country:str = str(info.at[i, "Country"]).strip().replace(" ", "")
    currency = currency_name + "-" + currency_code
    triple_currency:str = f"""<{namespace}{country}> <{namespace}hasCurrency> <{namespace}{currency}> .\n"""
    triplescurrency.append(triple_currency)
    currencies_seen[currency] = None
# One class triple per distinct currency, instead of repeating it for every country that shares the currency
for currency in currencies_seen:
    triple2_currency:str = f"""<{namespace}{currency}> a <{namespace}Currency> .\n"""
    triplescurrency.append(triple2_currency)
print(*triplescurrency[:5])

<urn:absolute:CountryInfoOntology#Andorra> <urn:absolute:CountryInfoOntology#hasCurrency> <urn:absolute:CountryInfoOntology#Euro-EUR> .
 <urn:absolute:CountryInfoOntology#UnitedArabEmirates> <urn:absolute:CountryInfoOntology#hasCurrency> <urn:absolute:CountryInfoOntology#Dirham-AED> .
 <urn:absolute:CountryInfoOntology#Afghanistan> <urn:absolute:CountryInfoOntology#hasCurrency> <urn:absolute:CountryInfoOntology#Afghani-AFN> .
 <urn:absolute:CountryInfoOntology#AntiguaandBarbuda> <urn:absolute:CountryInfoOntology#hasCurrency> <urn:absolute:CountryInfoOntology#Dollar-XCD> .
 <urn:absolute:CountryInfoOntology#Anguilla> <urn:absolute:CountryInfoOntology#hasCurrency> <urn:absolute:CountryInfoOntology#Dollar-XCD> .



In [25]:
# Creates a text file with these triples included.
# The encoding is set explicitly: the statements use Turtle syntax, which is defined as UTF-8,
# whereas open() would otherwise use the platform's locale encoding.
with open("triplescurrency.txt", "w", encoding="utf-8") as triplescurrency_file:
    triplescurrency_file.writelines(triplescurrency)

In [26]:
# Creates another list of OWL triples giving each country its area in sq km.
# The area is written as a bare literal, not as an IRI.
# NOTE(review): the predicate is spelled "HasAreaSqKm" with a capital H, unlike hasCapital/hasCurrency
# used for the other properties in this notebook - left unchanged, confirm which spelling the ontology expects.
triplesarea:list = []
for i in info.index:
    area:str = str(info.at[i, "Area(in sq km)"]).strip().replace(" ", "")
    # A row without an area is skipped: a triple with no object is not valid Turtle
    if not area:
        continue
    country:str = str(info.at[i, "Country"]).strip().replace(" ", "")
    triple_area:str = f"""<{namespace}{country}> <{namespace}HasAreaSqKm> {area} .\n"""
    triplesarea.append(triple_area)
print(*triplesarea[:5])

<urn:absolute:CountryInfoOntology#Andorra> <urn:absolute:CountryInfoOntology#HasAreaSqKm> 468 .
 <urn:absolute:CountryInfoOntology#UnitedArabEmirates> <urn:absolute:CountryInfoOntology#HasAreaSqKm> 82880 .
 <urn:absolute:CountryInfoOntology#Afghanistan> <urn:absolute:CountryInfoOntology#HasAreaSqKm> 647500 .
 <urn:absolute:CountryInfoOntology#AntiguaandBarbuda> <urn:absolute:CountryInfoOntology#HasAreaSqKm> 443 .
 <urn:absolute:CountryInfoOntology#Anguilla> <urn:absolute:CountryInfoOntology#HasAreaSqKm> 102 .



In [27]:
# Creates a text file with these triples included.
# The encoding is set explicitly: the statements use Turtle syntax, which is defined as UTF-8,
# whereas open() would otherwise use the platform's locale encoding.
with open("triplesarea.txt", "w", encoding="utf-8") as triplesarea_file:
    triplesarea_file.writelines(triplesarea)

In [28]:
# Creates another list of OWL triples giving each country its population.
# The population is written as a bare literal, not as an IRI.
# NOTE(review): the predicate is spelled "HasPopulation" with a capital H, unlike hasCapital/hasCurrency
# used for the other properties in this notebook - left unchanged, confirm which spelling the ontology expects.
triplespopulation:list = []
for i in info.index:
    population:str = str(info.at[i, "Population"]).strip().replace(" ", "")
    # A row without a population is skipped: a triple with no object is not valid Turtle
    if not population:
        continue
    country:str = str(info.at[i, "Country"]).strip().replace(" ", "")
    triple_population:str = f"""<{namespace}{country}> <{namespace}HasPopulation> {population} .\n"""
    triplespopulation.append(triple_population)
print(*triplespopulation[:5])

<urn:absolute:CountryInfoOntology#Andorra> <urn:absolute:CountryInfoOntology#HasPopulation> 77006 .
 <urn:absolute:CountryInfoOntology#UnitedArabEmirates> <urn:absolute:CountryInfoOntology#HasPopulation> 9630959 .
 <urn:absolute:CountryInfoOntology#Afghanistan> <urn:absolute:CountryInfoOntology#HasPopulation> 37172386 .
 <urn:absolute:CountryInfoOntology#AntiguaandBarbuda> <urn:absolute:CountryInfoOntology#HasPopulation> 96286 .
 <urn:absolute:CountryInfoOntology#Anguilla> <urn:absolute:CountryInfoOntology#HasPopulation> 13254 .



In [29]:
# Creates a text file with these triples included.
# The encoding is set explicitly: the statements use Turtle syntax, which is defined as UTF-8,
# whereas open() would otherwise use the platform's locale encoding.
with open("triplespopulation.txt", "w", encoding="utf-8") as triplespopulation_file:
    triplespopulation_file.writelines(triplespopulation)

In [30]:
# Creates another list of OWL triples categorizing capitals, continents, iso2 codes, and iso3 codes by their respective classes.
# Every distinct value is classified once (a continent shared by many countries no longer yields one
# identical triple per country), and empty values are skipped so the bare namespace IRI is never typed.
# NOTE(review): some continent codes also occur as ISO2 codes (e.g. "NA", "AS"), so the same IRI ends up
# typed as both Continent and ISO2LetterCode - confirm whether that is intended.
triplesclassification:list = []
# (column of the dataframe, class that the values of that column belong to), in output order
classified_columns:list = [
    ("Capital", "Capital"),
    ("Continent", "Continent"),
    ("ISO", "ISO2LetterCode"),
    ("ISO3", "ISO3LetterCode"),
]
for column_name, class_name in classified_columns:
    # A dict keeps insertion order, so the values stay in order of first appearance
    individuals:dict = {}
    for i in info.index:
        individual:str = str(info.at[i, column_name]).strip().replace(" ", "")
        if individual:
            individuals[individual] = None
    for individual in individuals:
        triple_class:str = f"""<{namespace}{individual}> a <{namespace}{class_name}> .\n"""
        triplesclassification.append(triple_class)
print(*triplesclassification[:5])

<urn:absolute:CountryInfoOntology#AndorralaVella> a <urn:absolute:CountryInfoOntology#Capital> .
 <urn:absolute:CountryInfoOntology#AbuDhabi> a <urn:absolute:CountryInfoOntology#Capital> .
 <urn:absolute:CountryInfoOntology#Kabul> a <urn:absolute:CountryInfoOntology#Capital> .
 <urn:absolute:CountryInfoOntology#St.John's> a <urn:absolute:CountryInfoOntology#Capital> .
 <urn:absolute:CountryInfoOntology#TheValley> a <urn:absolute:CountryInfoOntology#Capital> .



In [31]:
# Creates a text file with these triples included.
# The encoding is set explicitly: the statements use Turtle syntax, which is defined as UTF-8,
# whereas open() would otherwise use the platform's locale encoding.
with open("triplesclassification.txt", "w", encoding="utf-8") as triplesclassification_file:
    triplesclassification_file.writelines(triplesclassification)

In [32]:
# Builds a lookup table from ISO2 code to country name, pairing the two columns row by row
iso_to_country = {iso2: name for iso2, name in zip(info["ISO"], info["Country"])}
print(iso_to_country)

{'AD': 'Andorra', 'AE': 'United Arab Emirates', 'AF': 'Afghanistan', 'AG': 'Antigua and Barbuda', 'AI': 'Anguilla', 'AL': 'Albania', 'AM': 'Armenia', 'AO': 'Angola', 'AQ': 'Antarctica', 'AR': 'Argentina', 'AS': 'American Samoa', 'AT': 'Austria', 'AU': 'Australia', 'AW': 'Aruba', 'AX': 'Aland Islands', 'AZ': 'Azerbaijan', 'BA': 'Bosnia and Herzegovina', 'BB': 'Barbados', 'BD': 'Bangladesh', 'BE': 'Belgium', 'BF': 'Burkina Faso', 'BG': 'Bulgaria', 'BH': 'Bahrain', 'BI': 'Burundi', 'BJ': 'Benin', 'BL': 'Saint Barthelemy', 'BM': 'Bermuda', 'BN': 'Brunei', 'BO': 'Bolivia', 'BQ': 'Bonaire, Saint Eustatius and Saba', 'BR': 'Brazil', 'BS': 'Bahamas', 'BT': 'Bhutan', 'BV': 'Bouvet Island', 'BW': 'Botswana', 'BY': 'Belarus', 'BZ': 'Belize', 'CA': 'Canada', 'CC': 'Cocos Islands', 'CD': 'Democratic Republic of the Congo', 'CF': 'Central African Republic', 'CG': 'Republic of the Congo', 'CH': 'Switzerland', 'CI': 'Ivory Coast', 'CK': 'Cook Islands', 'CL': 'Chile', 'CM': 'Cameroon', 'CN': 'China', '

In [33]:
# Splits the ISO2 codes mentioned in the neighbours column into a list, in a for loop. Then creates OWL triples for each country,
# linking it to its neighbors. The dictionary created above is used to call the country value for each ISO2 code.
# Codes that are empty (a country without neighbours) or not present in the dictionary are skipped;
# they would otherwise produce a hasNeighbor triple pointing at the bare namespace IRI.
triplesneighbors:list = []
for i in info.index:
    country:str = str(info.at[i, "Country"]).strip().replace(" ", "")
    codes = str(info.at[i, "neighbours"]).split(',')
    for j in codes:
        # Whitespace around a code is ignored for the lookup
        code1 = iso_to_country.get(j.strip())
        if code1 is None:
            continue
        code2:str = str(code1).strip().replace(" ", "")
        if not code2:
            continue
        triple_neighbor:str = f"""<{namespace}{country}> <{namespace}hasNeighbor> <{namespace}{code2}> .\n"""
        triplesneighbors.append(triple_neighbor)
print(*triplesneighbors[:5])

<urn:absolute:CountryInfoOntology#Andorra> <urn:absolute:CountryInfoOntology#hasNeighbor> <urn:absolute:CountryInfoOntology#Spain> .
 <urn:absolute:CountryInfoOntology#Andorra> <urn:absolute:CountryInfoOntology#hasNeighbor> <urn:absolute:CountryInfoOntology#France> .
 <urn:absolute:CountryInfoOntology#UnitedArabEmirates> <urn:absolute:CountryInfoOntology#hasNeighbor> <urn:absolute:CountryInfoOntology#SaudiArabia> .
 <urn:absolute:CountryInfoOntology#UnitedArabEmirates> <urn:absolute:CountryInfoOntology#hasNeighbor> <urn:absolute:CountryInfoOntology#Oman> .
 <urn:absolute:CountryInfoOntology#Afghanistan> <urn:absolute:CountryInfoOntology#hasNeighbor> <urn:absolute:CountryInfoOntology#Turkmenistan> .



In [34]:
# Creates a text file with these triples included.
# The encoding is set explicitly: the statements use Turtle syntax, which is defined as UTF-8,
# whereas open() would otherwise use the platform's locale encoding.
with open("triplesneighbors.txt", "w", encoding="utf-8") as triplesneighbors_file:
    triplesneighbors_file.writelines(triplesneighbors)