In [2]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
def analyze_tracks(truth, submission):
    """Compute the majority particle, hit counts, and weight for each track.

    Truth and submission are joined on ``hit_id`` (left join from ``truth``),
    sorted by ``track_id`` then ``particle_id``, and scanned once. Within a
    track, the particle contributing the most hits is the majority particle.
    The comparison is strict, so on a tie the particle that comes first in
    the sort order (the smaller ``particle_id``) is kept.

    Parameters
    ----------
    truth : pandas.DataFrame
        Truth information. Must have hit_id, particle_id, and weight columns.
    submission : pandas.DataFrame
        Proposed hit/track association. Must have hit_id and track_id columns.

    Returns
    -------
    pandas.DataFrame
        One row per reconstructed track with the columns track_id, nhits,
        major_particle_id, major_particle_nhits (true hit count of the
        majority particle), major_nhits (hits of that particle inside the
        track) and major_weight (summed weight of those hits divided by the
        total weight in ``truth``). Empty, with the same columns, when there
        are no hits.

    Raises
    ------
    pandas.errors.MergeError
        If ``hit_id`` is duplicated in either input (``validate='one_to_one'``).

    Notes
    -----
    * ``-1`` is used internally as the "no track yet" marker, so track ids
      are assumed to be non-negative; a track whose id is ``-1`` collides
      with the marker and can be silently skipped.
    * Truth hits missing from ``submission`` get a NaN ``track_id``. Because
      NaN never compares equal to itself, each such hit is reported as its
      own one-hit track.
    """
    cols = ['track_id', 'nhits',
            'major_particle_id', 'major_particle_nhits',
            'major_nhits', 'major_weight']

    # true number of hits for each particle_id
    particles_nhits = truth['particle_id'].value_counts(sort=False)
    total_weight = truth['weight'].sum()
    # combined event with minimal reconstructed and truth information;
    # chained (non-inplace) so the intermediate frame is never half-mutated
    event = (
        pd.merge(truth[['hit_id', 'particle_id', 'weight']],
                 submission[['hit_id', 'track_id']],
                 on=['hit_id'], how='left', validate='one_to_one')
        .drop(columns='hit_id')
        .sort_values(by=['track_id', 'particle_id'])
    )

    # Nothing to analyze: without this guard the final flush below would look
    # up the placeholder particle id -1 in particles_nhits and raise.
    if event.empty:
        return pd.DataFrame(columns=cols)

    # ASSUMPTIONs: 0 <= track_id, 0 <= particle_id

    tracks = []
    # running sum for the reconstructed track we are currently in
    rec_track_id = -1
    rec_nhits = 0
    # running sum for the particle we are currently in (in this track_id)
    cur_particle_id = -1
    cur_nhits = 0
    cur_weight = 0
    # majority particle with most hits up to now (in this track_id)
    maj_particle_id = -1
    maj_nhits = 0
    maj_weight = 0

    # itertuples() is a generator, so pass the length explicitly to give the
    # progress bar a known total.
    for hit in tqdm(event.itertuples(index=False), total=len(event)):
        # we reached the next track so we need to finish the current one
        if (rec_track_id != -1) and (rec_track_id != hit.track_id):
            # could be that the current particle is the majority one
            if maj_nhits < cur_nhits:
                maj_particle_id = cur_particle_id
                maj_nhits = cur_nhits
                maj_weight = cur_weight
            # store values for this track
            tracks.append((rec_track_id, rec_nhits, maj_particle_id,
                           particles_nhits[maj_particle_id], maj_nhits,
                           maj_weight / total_weight))

        # setup running values for next track (or first)
        if rec_track_id != hit.track_id:
            rec_track_id = hit.track_id
            rec_nhits = 1
            cur_particle_id = hit.particle_id
            cur_nhits = 1
            cur_weight = hit.weight
            maj_particle_id = -1
            maj_nhits = 0
            # reset the majority weight together with the other majority
            # fields (the original assigned to a misspelled name here)
            maj_weight = 0
            continue

        # hit is part of the current reconstructed track
        rec_nhits += 1

        # reached new particle within the same reconstructed track
        if cur_particle_id != hit.particle_id:
            # check if last particle has more hits than the majority one
            # if yes, set the last particle as the new majority particle
            if maj_nhits < cur_nhits:
                maj_particle_id = cur_particle_id
                maj_nhits = cur_nhits
                maj_weight = cur_weight
            # reset running values for current particle
            cur_particle_id = hit.particle_id
            cur_nhits = 1
            cur_weight = hit.weight
        # hit belongs to the same particle within the same reconstructed track
        else:
            cur_nhits += 1
            cur_weight += hit.weight

    # last track is not handled inside the loop
    if maj_nhits < cur_nhits:
        maj_particle_id = cur_particle_id
        maj_nhits = cur_nhits
        maj_weight = cur_weight
    # store values for the last track
    tracks.append((rec_track_id, rec_nhits, maj_particle_id,
                   particles_nhits[maj_particle_id], maj_nhits,
                   maj_weight / total_weight))

    return pd.DataFrame.from_records(tracks, columns=cols)

In [4]:
# Load the reconstructed-track table. NOTE(review): this is an absolute,
# machine-specific path, so the cell only runs where that file exists.
dataframe = pd.read_csv('/Users/glucia/Projects/DeepLearning/TrackingML/data/save/test_reco_tracks.csv')

# The single table carries both the truth columns and the proposed track
# assignment; split it into the two frames analyze_tracks() expects.
truth = dataframe.loc[:, ['hit_id', 'particle_id', 'weight']].copy()
submission = dataframe.loc[:, ['hit_id', 'track_id']].copy()

analized_tracks = analyze_tracks(truth, submission)

# Fraction of each track's hits that come from its majority particle.
purity_rec = analized_tracks['major_nhits'].div(analized_tracks['nhits'])
# Fraction of the majority particle's true hits that the track captured.
purity_maj = analized_tracks['major_nhits'].div(analized_tracks['major_particle_nhits'])
# A track counts as good when both fractions exceed one half.
good_track_mask = purity_rec.gt(0.5) & purity_maj.gt(0.5)

# The score sums major_weight over ALL tracks. The variant restricted to good
# tracks is analized_tracks[good_track_mask]['major_weight'].sum(), which is
# currently not used.
score = analized_tracks['major_weight'].sum()

0it [00:00, ?it/s]

In [5]:
# Summary of the run: the score over all tracks, and the two purities
# averaged over the tracks selected by good_track_mask only.
print(f'\t* Score: {score:.4f}')
print(f'\t* Purity rec: {purity_rec[good_track_mask].mean():.4f}')
print(f'\t* Purity maj: {purity_maj[good_track_mask].mean():.4f}')

	* Score: 0.1532
	* Purity rec: 0.7052
	* Purity maj: 0.6627
