In [None]:
# pandas is a library for storing and working with structured data
import pandas as pd
# numpy is a library for storing and working with arrays and matrices; it is similar to pandas but offers more low-level and maths functions
import numpy as np

In [None]:
# load the dimuon CSV from the data folder into a pandas dataframe
# low_memory=False tells pandas not to process the file in chunks while parsing
csv_path = '../data/DiMuon_SingleMu_noM.csv'
df = pd.read_csv(csv_path, low_memory=False)

In [None]:
# preview the top five rows of the dataframe
df.head(5)

In [None]:
# pull a single variable (column) out of the dataframe and print its values
eta1 = df['eta1']
print(eta1)

# basic summary statistics for that variable
eta1.describe()

# note: a cell automatically displays the result of its final command only,
# so anything earlier in the cell that you want to see needs an explicit print()

In [None]:
# plot a histogram of eta1, split into 20 bins
ax = df['eta1'].hist(bins=20)
# label the axes and add a title so the figure can be read on its own;
# the trailing ';' stops the notebook from echoing the return value of set()
ax.set(xlabel='eta1', ylabel='count', title='Distribution of eta1');

In [None]:
# new variables are made by assigning the result of an equation or function to a new column name
e1 = df['E1']
df['sum_E1_E2'] = e1 + df['E2']  # element-wise sum of E1 and E2
df['log_E1'] = np.log(e1)        # natural log (base e)
df['sqrt_E1'] = np.sqrt(e1)      # square root
df['pwr.1_E1'] = e1 ** .1        # raised to the power of 1/10

# look at the first rows again to confirm the new columns are there and look right
df.head()

In [None]:
# plot the histogram for E1 and see how it's very right-skewed:
# most values are bunched up at the low (left) end, with a long tail stretching out to the right
ax = df['E1'].hist(bins=30)
# label the axes and add a title so the figure can be read on its own;
# the trailing ';' stops the notebook from echoing the return value of set()
ax.set(xlabel='E1', ylabel='count', title='Distribution of E1');

In [None]:
# A common way to deal with severe right skew like this is to plot the log-transformed version of the variable
# This makes the shape of the distribution and any anomalies a lot easier to see
# The bins argument indicates how many columns we want - low values show the overall shape better while high values show anomalies better
# (log_E1 is the natural log of E1, added to the dataframe in an earlier cell)
ax = df['log_E1'].hist(bins=30)
# label the axes and add a title so the figure can be read on its own;
# the trailing ';' stops the notebook from echoing the return value of set()
ax.set(xlabel='log_E1', ylabel='count', title='Distribution of log(E1)');

In [None]:
# Plot a scatterplot of energy (E1) against transverse momentum (pt1) for the first particle
# pt1 goes on the x-axis and E1 on the y-axis, matching the axis order of the log-transformed scatterplots in the later cells
df.plot.scatter(x='pt1', y='E1')
# We'll have the same issue when we do scatterplots with highly skewed variables

In [None]:
# take the natural log of the transverse momentum and store it as a new column,
# then scatter it against log_E1 (which was added to the dataframe in an earlier cell)
log_pt1 = np.log(df['pt1'])
df['log_pt1'] = log_pt1
df.plot.scatter(x='log_pt1', y='log_E1')
# After the log transform there is clearly a positive, linear relationship between transverse momentum and energy for the first particle

In [None]:
# Repeat for the second particle: add natural-log columns for pt2 and E2,
# then scatter log energy against log transverse momentum
log_columns = {'log_pt2': 'pt2', 'log_E2': 'E2'}  # new column name -> source column
for new_col, src_col in log_columns.items():
    df[new_col] = np.log(df[src_col])
df.plot.scatter(x='log_pt2', y='log_E2')