In [None]:
import pandas as pd
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Read the training and test splits from their CSV files.
# low_memory=False asks pandas to parse each file in one go instead of in chunks,
# which avoids mixed-type column inference (see the pandas read_csv docs).
train_csv = '../data/pid_train.csv'
test_csv = '../data/pid_test.csv'
df_train = pd.read_csv(train_csv, low_memory=False)
df_test = pd.read_csv(test_csv, low_memory=False)

In [None]:
# Look at the first few rows of each dataset (training first, then test)
for frame in (df_train, df_test):
    print(frame.head())

In [None]:
# Fitting models requires us to split the datasets into the columns we want to use
# as predictors and the column we want to predict
predictor_variables = ['p', 'theta', 'beta', 'nphe', 'ein', 'eout']
outcome_variable = 'id'

# The variables we want to use in prediction are conventionally named X
X_train = df_train[predictor_variables]
X_test = df_test[predictor_variables]

# The outcome categories should be sequential integers (0, 1, 2, etc.)
# So we'll put the unique set of particle ids/types seen in training into sorted order...
# (read through outcome_variable so that changing it above changes every use below)
outcome_values = sorted(set(df_train[outcome_variable]))

# ...build a lookup from each particle id/type to its position in that order,
# so encoding a row is a single dict lookup rather than a linear search of the list...
outcome_index = {value: position for position, value in enumerate(outcome_values)}

# ...fail early, naming the offending labels, if the test set contains an outcome
# that never appears in training (it would have no integer code)...
unseen_outcomes = set(df_test[outcome_variable]).difference(outcome_index)
if unseen_outcomes:
    raise ValueError(
        'df_test contains outcome values not present in df_train: '
        + repr(sorted(unseen_outcomes))
    )

# ...and then use the lookup to create the outcome vectors (conventionally named y).
# Both stay plain Python lists of ints, index-aligned with the rows of X_train / X_test.
y_train = [outcome_index[_id] for _id in df_train[outcome_variable]]
y_test = [outcome_index[_id] for _id in df_test[outcome_variable]]

In [None]:

# X_train is still a dataframe, so .head() shows its first 5 rows
# and .describe() prints its summary table
for summarize in (X_train.head, X_train.describe):
    print(summarize())

# y_train is a vector, so [i:j] slicing shows its first 5 values...
first_outcomes = y_train[:5]
print(first_outcomes)

# ...and Counter tallies how many training rows fall into each outcome category
outcome_counts = Counter(y_train)
print(outcome_counts)

In [None]:
# Repeat the inspection for the test set: first rows and summary table of X_test...
for summarize in (X_test.head, X_test.describe):
    print(summarize())

# ...followed by the first 5 encoded outcomes
print(y_test[0:5])

In [None]:
# Fit the multinomial logistic regression model on the training set
# (.fit() returns the fitted estimator, so `model` is the trained classifier)...
model = LogisticRegression(verbose=0, max_iter=100, random_state=1234).fit(X_train, y_train)

# ...then predict the test set and print the classification report for those predictions
logreg_predictions = model.predict(X_test)
print(classification_report(y_test, logreg_predictions))

In [None]:
# Do the same with a random forest classifier: fit it on the training set
# (.fit() returns the fitted estimator; verbose=1 is passed through to the forest)...
classifier = RandomForestClassifier(verbose=1, random_state=1234, n_estimators=10).fit(X_train, y_train)

# ...then predict the test set and print the classification report for those predictions
forest_predictions = classifier.predict(X_test)
print(classification_report(y_test, forest_predictions))

In [None]:
# Now experiment by varying:
#   a) the scaled/unscaled variables
#   b) the balanced/unbalanced training set
#   c) the unbalanced training set but setting the class_weight argument of the models to 'balanced'
#   d) other model parameters