In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Load the particle dataset into a DataFrame.
# low_memory=False makes pandas parse the file in one pass rather than in chunks,
# which avoids columns ending up with mixed inferred dtypes.
df = pd.read_csv('../data/pid-500k.csv', low_memory=False)
# Display the first few rows as a quick sanity check of the columns and values
df.head()

In [None]:
# Create a lookup/dictionary for the particle names, keyed by the numeric particle code
particle_dict = {
    -11: 'positron',
    211: 'pion',
    2212: 'proton',
    321: 'kaon'
}

# Use .value_counts() on categorical variables to show the count of each value
print(df['id'].value_counts())

# Translate each numeric code to its particle name.
# Values that are not keys of the lookup fall through unchanged, so re-running this cell
# after the ids have already been converted is harmless (a plain particle_dict[...] lookup
# raises KeyError on the second run because the names themselves are not keys).
particle_names = df['id'].map(lambda particle_id: particle_dict.get(particle_id, particle_id))

# Anything that is still not a known particle name was never in the lookup: fail loudly,
# and do so before df is modified so the original column is left intact
unknown_ids = set(particle_names.unique()) - set(particle_dict.values())
if unknown_ids:
    raise KeyError(f'No particle name defined for id(s): {unknown_ids}')

# Overwrite the 'id' variable with the particle names
df['id'] = particle_names

# Now it will show the names
print(df['id'].value_counts())

In [None]:
# Add in scaled versions of the predictor variables at this point
# We'll see why this might be useful when it comes to modelling
# NOTE(review): the scaler is fitted on the full dataframe before the train/test split below,
# so the scaling statistics include the test rows. If strict train/test separation matters
# for the modelling, fit the scaler on the training split only - TODO confirm
predictor_variables = ['p', 'theta', 'beta', 'nphe', 'ein', 'eout']
# Name each scaled column after its source column with a '_scaled' suffix
scaled_variables = [f'{var_name}_scaled' for var_name in predictor_variables]
# StandardScaler standardises each column to zero mean and unit variance
df[scaled_variables] = StandardScaler().fit_transform(df[predictor_variables])

In [None]:
# Hold out 20% of the rows as a test set and keep the remaining 80% for training.
# Stratifying on the particle id keeps the mix of particle types the same in both sets,
# and the fixed random_state makes the split identical on every run.
particle_labels = df['id']
df_train, df_test = train_test_split(
    df,
    test_size=0.20,
    stratify=particle_labels,
    random_state=1234,
)

In [None]:
# Report the size of each split (training first, then test)
for split in (df_train, df_test):
    print(split.shape)

# Show how many rows of each particle type landed in each split
for split in (df_train, df_test):
    print(split['id'].value_counts())

# Scaling the test counts by 4 should roughly reproduce the training counts (data was split 4:1)
print(4 * df_test['id'].value_counts())

In [None]:
# np.random.choice is used to select a sample from a larger set
# Here it draws 3 letters from the list; replace=True means the same letter can be drawn more than once
# No seed is set in this cell, so the result changes from run to run
# Run this a few times to see the results then change some of the parameters
np.random.choice(['a', 'b', 'c', 'd', 'e'], size=3, replace=True)

In [None]:
# We can use the np.random.choice function to create balanced training data, where each type of particle has the same number of rows
# We'll do this by defining a function called undersample as we'll want to use this a few times

def undersample(dataframe, rows_per_type=100, random_state=None, particle_types=None):
    """Return a balanced copy of ``dataframe`` with the same number of rows per particle type.

    For every particle type, ``rows_per_type`` rows are drawn at random without
    replacement from the rows whose 'id' column equals that type. The result holds
    the sampled rows grouped by type, in the order the types were given.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to sample from. Must contain an 'id' column holding the particle type.
    rows_per_type : int, default 100
        Number of rows to keep for each particle type.
    random_state : int or None, default None
        Seed for the sampling. When None, rows are drawn from NumPy's global random
        state (so np.random.seed still controls the result); when given, a dedicated
        generator is seeded so the same rows are selected on every call.
    particle_types : iterable or None, default None
        The 'id' values to balance across. When None, the particle names in the
        notebook-level ``particle_dict`` lookup are used.

    Returns
    -------
    pandas.DataFrame
        ``len(particle_types) * rows_per_type`` rows of ``dataframe``.

    Raises
    ------
    ValueError
        If any particle type has fewer than ``rows_per_type`` rows available.
    """
    if particle_types is None:
        # Default to the particle names defined in the notebook's lookup
        particle_types = particle_dict.values()
    particle_types = list(particle_types)

    # Only build a dedicated generator when a seed is supplied; otherwise keep drawing
    # from the global NumPy random state
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Work with row positions rather than index labels so that a dataframe with
    # repeated index labels still yields exactly rows_per_type rows per type
    selected_positions = []
    for particle_type in particle_types:
        # positions of all rows belonging to the current particle_type
        is_current_type = (dataframe['id'] == particle_type).to_numpy()
        candidate_positions = np.flatnonzero(is_current_type)
        if len(candidate_positions) < rows_per_type:
            raise ValueError(
                f"Cannot select {rows_per_type} rows of type '{particle_type}': "
                f"only {len(candidate_positions)} available"
            )
        # select the rows to keep and add them on to the selected rows list
        selected_positions.extend(
            rng.choice(candidate_positions, size=rows_per_type, replace=False)
        )

    # Confirm that the number of selected rows = number of particle types x the rows_per_type
    assert len(selected_positions) == len(particle_types) * rows_per_type
    # Finally use the selected positions to return a balanced version of the dataframe
    dataframe_balanced = dataframe.iloc[selected_positions]
    return dataframe_balanced

In [None]:
# Seed NumPy's global random generator so the undersampled rows (and any file written from
# them) are the same every time the notebook is run from top to bottom
np.random.seed(1234)

# We can use the function immediately like this
df_train_balanced = undersample(df_train)
# And check the resulting dataset is balanced on type
print(df_train_balanced['id'].value_counts())

# Then we can change the default number of rows per type
df_train_balanced = undersample(df_train, rows_per_type=1000)
print(df_train_balanced['id'].value_counts())

# The largest possible balanced dataset is limited by the smallest group (positrons, with
# 1189 rows when this was written), so read that size from the data instead of hard-coding it
smallest_group_size = int(df_train['id'].value_counts().min())
df_train_balanced = undersample(df_train, rows_per_type=smallest_group_size)
print(df_train_balanced['id'].value_counts())

# See what happens when you try and set the rows_per_type larger than smallest_group_size

In [None]:
# Save the imbalanced training set, the balanced training set and the test set as csv files
# so the modelling step can read them back in
output_files = [
    (df_train, '../data/pid_train.csv'),
    (df_train_balanced, '../data/pid_train_balanced.csv'),
    (df_test, '../data/pid_test.csv'),
]
for frame, csv_path in output_files:
    frame.to_csv(csv_path)