In [None]:
!pip install pandas numpy matplotlib sparqlwrapper

In [None]:
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from SPARQLWrapper import SPARQLWrapper, CSV

## Write a query!

In [None]:
# SPARQL query text sent to the endpoint; each solution becomes one CSV row.
# Selected variables: ?patientIdentifier, ?age, ?sex, ?diseaseDate, ?location,
# ?tumourType.
# - No OPTIONAL clause is used, so a patient is only returned when every
#   graph pattern below matches.
# - ?sex is "female" when ?sexValue = 2 and "male" for every other value.
# - The COALESCE(IF(cond, label, 1/0), ..., "F") blocks work as lookup tables:
#   a non-matching IF evaluates 1/0, which is presumably an evaluation error on
#   the endpoint so COALESCE moves on to the next alternative; "F" is the
#   fallback when no code matches.
# - ?diseaseEpisode is bound but is not part of the SELECT list.
# NOTE(review): the code -> label mappings are only documented by the query
# itself; verify them against the source data dictionary.
query = """
prefix snomedct: <http://purl.bioontology.org/ontology/SNOMEDCT/>
prefix roo: <http://www.cancerdata.org/roo/>
prefix ncit: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>




select ?patientIdentifier ?age ?sex ?diseaseDate ?location ?tumourType



where {
?patient a snomedct:116154003;
ncit:P395 ?patientIdentifier. #targetCode
?patient roo:P100000 ?ageClass. #has age
?ageClass roo:P100042 ?age.
?patient roo:P100018 ?sexClass. #has biological sex
?sexClass roo:P100042 ?sexValue.
BIND( xsd:string(IF(?sexValue = 2, "female", "male")) AS ?sex).
?patient roo:P100008 ?tumour.#has neoplasm
?tumour ncit:P395 ?diseaseEpisode;
roo:P100042 ?tumourTypeClass;
roo:P100257 [ #has disease date
roo:P100042 ?diseaseDate];
roo:P100202 [ #has location
rdf:type snomedct:272451001;
roo:P100042 ?locationClass
].
BIND (COALESCE(
IF(?locationClass = "C500", "Areola", 1/0),
IF(?locationClass = "C501", "Central Part", 1/0),
IF(?locationClass = "C502", "medial upper quadrant", 1/0),
IF(?locationClass = "C503", "medial lower quadrant", 1/0),
IF(?locationClass = "C504", "lateral upper quadrant", 1/0),
IF(?locationClass = "C505", "lateral lower quadrant", 1/0),
IF(?locationClass = "C506", "axillary tail", 1/0),
IF(?locationClass = "C508", "overlap", 1/0),
IF(?locationClass = "C509", "unspecified", 1/0),
"F") AS ?location)
BIND (COALESCE(
IF(?tumourTypeClass = 501300, xsd:string("Invasive mammary carcinoma"), 1/0),
IF(?tumourTypeClass = 502200, xsd:string("Ductal carcinoma in situ"), 1/0),
IF(?tumourTypeClass = 503200, xsd:string("Lobular carcinoma in situ"), 1/0),
"F" ) AS ?tumourType)

}
"""

In [None]:
# Point the client at the endpoint and request the result set as CSV.
sparql = SPARQLWrapper('https://rdf.endpoint', returnFormat=CSV)
sparql.setQuery(query)

# Run the query and decode the converted payload into text.
result = sparql.query().convert().decode()

# Parse the CSV text into a DataFrame via an in-memory text buffer.
with io.StringIO(result) as csv_buffer:
    data = pd.read_csv(csv_buffer)

In [None]:
# How many datapoints do we have and how complete are they?
print(f'Number of rows in the data: {len(data)}')
print(f'Number of rows with no missing values: {len(data.dropna(axis=0, how="any"))}')
print('Number of missing values in each column:')
# Vectorised per-column null count; replaces a row-by-row
# `apply(pd.isna, axis=1)` that called Python once per row.
print(data.isna().sum())


In [None]:
# Split the column names by dtype: numeric versus everything else.
numeric_frame = data.select_dtypes(include=[np.number])
non_numeric_frame = data.select_dtypes(exclude=[np.number])

# `non_numerics` is reused by later cells, so it keeps its name.
non_numerics = list(non_numeric_frame.columns)

print('List of numeric columns:')
print(list(numeric_frame.columns))

print('List of non-numeric columns:')
print(non_numerics)

In [None]:
# What's in the numeric columns?
def make_hists(df, fig_kwargs=None, hist_kwargs=None,
               style_cycle=None):
    '''
    Draw one histogram per column of `df`, stacked vertically in one figure.

    Adapted from
    https://stackoverflow.com/questions/39262630/pandas-plot-hist-sharex-false-does-not-behave-as-expected

    Parameters
    ----------
    df : pd.DataFrame
        Datasource; every column gets its own axes.

    fig_kwargs : dict, optional
        kwargs to pass to `plt.subplots`

        defaults to {'figsize': (4, 1.5*len(df.columns)),
                     'tight_layout': True}

        `squeeze` is always forced to False so the axes come back as an
        array regardless of the number of columns. The dict passed in is
        not modified.

    hist_kwargs : dict, optional
        Extra kwargs to pass to `ax.hist`, defaults
        to `{'log': True, 'bins': 'auto'}`. Keys given here take
        precedence over the same keys coming from `style_cycle`.
        The dict passed in is not modified.

    style_cycle : cycler
        Style cycle to use, defaults to
        mpl.rcParams['axes.prop_cycle']

    Returns
    -------
    fig : mpl.figure.Figure
        The figure created

    ax_list : list
        The mpl.axes.Axes objects created

    arts : dict
        maps column names to the return value of `ax.hist`

    Raises
    ------
    ValueError
        If `df` has no columns (there is nothing to plot).
    '''
    if style_cycle is None:
        style_cycle = mpl.rcParams['axes.prop_cycle']

    # Work on copies so the defaults below never leak into the caller's dicts.
    fig_kwargs = dict(fig_kwargs) if fig_kwargs is not None else {}
    hist_kwargs = dict(hist_kwargs) if hist_kwargs is not None else {}

    hist_kwargs.setdefault('log', True)
    # 'auto' binning requires numpy >= 1.11
    hist_kwargs.setdefault('bins', 'auto')
    cols = list(df.columns)

    if not cols:
        # plt.subplots cannot build a 0-row grid; fail with a clear message.
        raise ValueError('make_hists needs at least one column to plot')

    fig_kwargs.setdefault('figsize', (4, 1.5*len(cols)))
    fig_kwargs.setdefault('tight_layout', True)
    # Without squeeze=False a single-column frame yields a bare Axes object
    # (not an array), which cannot be iterated in the loop below.
    fig_kwargs['squeeze'] = False
    fig, ax_grid = plt.subplots(len(cols), 1, **fig_kwargs)
    ax_lst = list(ax_grid.ravel())

    arts = {}
    for ax, col, sty in zip(ax_lst, cols, style_cycle()):
        # Merge into one dict: explicit hist_kwargs override the cycle's
        # style, and a shared key (e.g. 'color') no longer raises a
        # duplicate-keyword TypeError.
        draw_kwargs = {**sty, **hist_kwargs}
        h = ax.hist(col, data=df, **draw_kwargs)
        ax.legend()

        arts[col] = h

    return fig, ax_lst, arts

# Trailing semicolon: show only the rendered figure, not the repr of the
# returned (fig, axes, arts) tuple with all histogram arrays.
make_hists(data.select_dtypes(include=[np.number]));

In [None]:
# Any further general information about the columns:
# summary statistics as produced by pandas' `describe()` with default options
# (presumably restricted to the numeric columns when any exist — confirm
# against the pandas version in use).
data.describe()

In [None]:
# What kind of data is in the non-numeric columns?
print('Number of unique values in non-numeric columns:')
# dropna=False counts NaN as a value, the same way len(x.unique()) did,
# and yields a labelled per-column Series instead of a '<lambda>' row.
print(data[non_numerics].nunique(dropna=False))
print()
print('what these unique values look like (truncated for readability):')
for col in non_numerics:
    print(f'{col}:')
    # Show the first 10 *distinct* values; the first 10 rows could simply
    # repeat the same value and would not match the heading above.
    print(list(data[col].unique()[:10]))

In [None]:
# How often each individual value occurs, per non-numeric column (top 10).
for col in non_numerics:
    frequencies = data[col].value_counts().sort_values(ascending=False)
    print(f'{col}:')
    print(frequencies.head(10))
    print()