In [7]:
from itertools import chain
import os
from pathlib import Path
from random import choice

import numpy as np
import pandas as pd
from rich import print as rprint

from django.conf import settings
from visor.dj_utils import modeldict
from uwinn_ingest.ingest_uwinn import (
    read_uwinn_split,
    logger,
    format_headers,
    translate_headers,
    check_split_goodness
)
from uwinn_ingest.cases import KNOWN_BAD_SPLITS
from visor.models import Database, Sample

# Directory that holds the individual UWinn split files.
split_path = Path("uwinn_ingest/All_UWinn_splits_070621")
# Materialise the directory listing once so later cells can iterate it repeatedly.
splits = [entry for entry in split_path.iterdir()]

# Django switch that permits synchronous ORM calls from an async context
# (the notebook kernel runs inside an event loop).
os.environ.update(DJANGO_ALLOW_ASYNC_UNSAFE="true")

In [None]:
# Register the University of Winnipeg source database.
# Look the row up first so that re-running this cell does not insert a
# duplicate; a duplicate would make a later `Database.objects.get(...)`
# lookup by name ambiguous (MultipleObjectsReturned).
uwinn = Database.objects.filter(name="University of Winnipeg").first()
if uwinn is None:
    uwinn = Database()
    uwinn.name = "University of Winnipeg"
    uwinn.short_name = "UWinn"
    # Run model-level validation before persisting.
    uwinn.clean()
    uwinn.save()

In [None]:
# Inspect the fields of one stored Sample.
# Query the model directly instead of relying on `a`, which no earlier cell
# defines (the original raised NameError on a fresh kernel).
modeldict(Sample.objects.all()[0])

In [None]:
# Lazy queryset over every stored Sample.
a = Sample.objects.all()
# Count in the database (SELECT COUNT) rather than loading every row just to
# measure the length of the result list.
a.count()

In [None]:
# values_list yields one 1-tuple per row, each holding that row's filename.
a = Sample.objects.values_list('filename')
# Number of distinct filenames across all samples.
len({row for row in a})

In [2]:
# Unpack the 1-tuples returned by values_list into a flat Series of sample ids.
all_sids = pd.Series(
    [sample_id for (sample_id,) in Sample.objects.values_list('sample_id')]
)

In [3]:
# Tally how often each sample_id occurs (most frequent first) to spot
# duplicated or placeholder ids.
all_sids.value_counts()

Nan             134
Spectralon       26
Spectralon_1     16
Spectralon_0     16
LCA101           15
               ... 
ILL105_5          1
ILL105_4          1
ILL105_3          1
ILL105_2          1
Zeo201_1          1
Length: 8569, dtype: int64

In [13]:
# Samples whose sample_id is the literal string "nan" (case-insensitive).
nans = list(Sample.objects.filter(sample_id__iexact="nan"))
# Source filenames of those samples, one entry per sample.
nan_filenames = pd.Series([n.filename for n in nans])
# .unique() returns a numpy array, so wrap it in a Series again before writing the CSV.
pd.Series(nan_filenames.unique()).to_csv('some_ids_missing.csv')

AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'

In [6]:
# All samples that were ingested from one particular split file.
target_filename = '22_4_MASTER_NOMAD_runs_2_NOMAD_Run_2_Tray_2_DandP_03.csv'
s = Sample.objects.filter(filename__iexact=target_filename)

In [7]:
# Display the queryset of samples matched by filename.
s

<QuerySet [<Sample: ART002_ART002_0_University>, <Sample: ART002_ART002_1_University>, <Sample: ART002_ART002_10_University>, <Sample: ART002_ART002_2_University>, <Sample: ART002_ART002_3_University>, <Sample: ART002_ART002_4_University>, <Sample: ART002_ART002_5_University>, <Sample: ART002_ART002_6_University>, <Sample: ART002_ART002_7_University>, <Sample: ART002_ART002_8_University>, <Sample: ART002_ART002_9_University>]>

In [None]:
# Narrow the filename matches to samples whose sample_id is the literal
# string 'nan' (case-insensitive).
sn = s.filter(sample_id__iexact='nan')

In [None]:
# Preview the first 30 entries of each matched sample's reflectance data.
[matched.reflectance[:30] for matched in s]

In [None]:
# Show the distinct values currently held in `a`.
# NOTE(review): `a` is rebound by several cells; confirm which one ran last.
set(a)

In [None]:

# Collect the entries of queryset `a` whose sample_id contains "030"
# (case-insensitive substring match).
samps = list(a.filter(sample_id__icontains="030"))

In [None]:
# Load the headerless ingest log and discard its second column (label 1).
ing = pd.read_csv('uwinn_split_ingest.csv', header=None).drop(columns=1)
ing.columns = ['time', 'level', 'file', 'msg_type', 'field_1', 'field_2']
# Keep only the ERROR-level records.
is_error = ing['level'] == 'ERROR'
errs = ing.loc[is_error]


In [None]:
# Number of rows currently in `errs`.
len(errs)

In [None]:
# Sorted list of files whose error message mentions 'x and y'.
# `na=False` treats rows with a missing message as non-matching; without it
# the mask contains NaN and boolean indexing raises. `regex=False` matches
# the phrase literally.
has_xy_error = errs['field_1'].str.contains('x and y', na=False, regex=False)
sorted(errs.loc[has_xy_error, 'file'])

In [None]:
# Discard error rows that carry no message, then remove the overflow column.
errs = (
    errs
    .dropna(subset=['field_1'], axis=0)
    .drop(columns='field_2')
)

In [None]:
# Export the failing file names with their messages, without the row index.
# `index` is documented as a boolean, so pass False explicitly rather than
# relying on None being falsy.
errs[['file', 'field_1']].to_csv('current_ingest_errors.csv', index=False)

In [None]:
# Show the sample_desc field of one stored Sample.
# Query the model directly: `a` is rebound by other cells (a values_list of
# tuples, later a numpy array), so `a[0]` is not reliably a Sample here.
modeldict(Sample.objects.all()[0])['sample_desc']

In [None]:
# Fetch the single Database row whose name contains "winnipeg"
# (raises if there are zero or several matches).
uwinn = Database.objects.filter(name__icontains="winnipeg").get()
# Directory of split files and a materialised listing of its entries.
split_path = Path("uwinn_ingest/All_UWinn_splits_070621")
splits = [*split_path.iterdir()]

In [None]:
# Ingest every usable split file: parse it, normalise its headers and store
# one Sample row per spectrum. A failure inside one split is logged and the
# loop moves on to the next file.
for split in splits:
    # Skip files that check_split_goodness explicitly rejects (returns False).
    if check_split_goodness(split) is False:
        continue
    try:
        rprint(f"[black]{split.name}")
        (
            fields,
            metadata,
            wavelengths,
            reflectance,
            split_warnings,
        ) = read_uwinn_split(split)
        # Surface parser warnings both on screen and in the ingest log.
        for warning in split_warnings:
            rprint(f"[red]{warning}")
            logger.warning(f"{split.name},{warning}")
        headers = format_headers(metadata, fields)
        headers = translate_headers(headers, wavelengths, split.name)
        # Drop columns containing NaN, then transpose so that each row of the
        # block is one spectrum (the shape is checked by the assert below).
        reflectance_block = reflectance.dropna(axis=1).values.T
        wavelengths = wavelengths.dropna().values
        if len(headers) > reflectance_block.shape[0]:
            # More header rows than spectra: keep only the leading headers.
            headers = headers.iloc[:reflectance_block.shape[0]]
            logger.info(f"{split.name},trailing columns truncated")
        # NOTE(review): the message lists the expected dimensions in the
        # opposite order to the tuple; its text is left unchanged because the
        # log appears to be parsed as comma-separated text elsewhere — confirm
        # before editing.
        assert reflectance_block.shape == (
            len(headers),
            len(wavelengths),
        ), f"{reflectance_block.shape} != {len(wavelengths)} {len(headers)}"
        if (reflectance_block.max() > 5) and ("%" in str(metadata)):
            # Values look like percentages: rescale to fractions. Divide out of
            # place because an in-place `/=` raises on integer-typed arrays.
            reflectance_block = reflectance_block / 100
            logger.info(f"{split.name},unit change,percent reflectance")
        for row_ix, spectrum in enumerate(reflectance_block):
            # Two-column array: wavelength, reflectance.
            data = {"reflectance": np.vstack([wavelengths, spectrum]).T}
            # Pair each spectrum with its OWN header row. The original always
            # used row 0, so every sample in a split received the first
            # sample's metadata. A separate name also avoids clobbering the
            # parsed `metadata` above.
            sample_fields = headers.iloc[row_ix].to_dict() | {"origin": uwinn}
            sample = Sample(**(data | sample_fields))
            sample.clean()
            sample.save()
        rprint("[green bold]successful")
    # KeyboardInterrupt derives from BaseException, not Exception, so it still
    # propagates and stops the run without an explicit re-raise clause.
    except Exception as ex:
        logger.error(f"{split.name},{type(ex)},{ex}")
        rprint(f"[red]{type(ex)},{ex}")


In [11]:
# Scratch integer vector 1..4 used to sanity-check the stacking below.
a = np.arange(1, 5)

In [12]:
# Scratch float vector paired element-wise with `a`.
b = np.asarray((0.1, 0.2, 0.3, 0.4))

In [14]:
# Place the two 1-D vectors side by side as columns of an (N, 2) array.
np.column_stack((a, b))

array([[1. , 0.1],
       [2. , 0.2],
       [3. , 0.3],
       [4. , 0.4]])

In [None]:
# Total number of stored samples. Counting in the database avoids loading
# every row into memory just to measure the result length.
Sample.objects.count()

In [None]:
# DESTRUCTIVE: removes every stored Sample, one instance at a time.
every_sample = Sample.objects.all()
for sample in every_sample:
    sample.delete()
