<a href="https://colab.research.google.com/github/Mono-Blaine/CMPS-261-Project/blob/master/261Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LogisticRegressionCV
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, r2_score, mean_squared_log_error, accuracy_score
from sklearn.model_selection import cross_val_score, KFold

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.neural_network import MLPClassifier
# from sklearn.neural_network import MLPRegressor

from keras import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from keras.metrics import Accuracy

from keras.callbacks import ModelCheckpoint

# Read the training CSV into a DataFrame.
# header=None keeps the first row as data instead of using it for column names;
# low_memory=False parses each column in one pass rather than in chunks.
data = pd.read_csv(
    'HIGGS_train.csv',
    low_memory=False,
    header=None,
)

In [9]:
# Abandoned cleaning attempts, kept for reference (both are disabled).
#
# Attempt 1: for each of the 29 columns, drop the rows whose value in that
# column is a string / is not a float. This removed a large number of rows,
# which suggests the check itself was misbehaving, so it is not used.
# for i in range(29):
#     data = data[[not isinstance(value, str) for value in data[i]]]
#     data = data[[isinstance(value, float) for value in data[i]]]

# Attempt 2: keep only the columns whose dtype is float / drop object columns.
# NOTE(review): `data2` is not defined anywhere in the visible notebook.
# data2=data2.select_dtypes(include=['float64'])
# data2=data2.select_dtypes(exclude=['object'])

# Shape of the raw frame, printed before the cell values are cleaned.
print(data.shape)

def filter(value):
    """Coerce one raw cell value to a number; intended for element-wise use.

    Non-string values are returned untouched. Strings have every single and
    double quote character removed and are then parsed with ``float``.

    Args:
        value: A single cell value of any type.

    Returns:
        ``value`` itself when it is not a string, the parsed ``float`` when
        it is a string that ``float`` accepts, otherwise ``0``.

    Note:
        The name shadows the built-in ``filter``; it is kept because the
        cleaning step in this notebook passes the function by this name.
    """
    if not isinstance(value, str):
        return value
    try:
        # Plain digit strings such as "1" are parsed as well, so a cell that
        # holds a number is never left behind as text.
        return float(value.replace('"', '').replace("'", ''))
    except ValueError:
        # Text that is not a number falls back to 0, as before; only the
        # parse failure is caught so unrelated errors are not hidden.
        return 0

# Run every cell through the element-wise `filter` helper and rebind `data`.
data = data.applymap(filter)

# Shape after cleaning.
print(data.shape)

# Column 0 is the output; every remaining column is a feature.
y = data.iloc[:, 0]
X = data.iloc[:, 1:]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

(600000, 29)
(600000, 29)


In [None]:
# MLPClassifier cannot handle NaN values, so replace them with the column mean.
# The imputer is fitted on the training split only and then reused on the test
# split: both splits are filled with the same (training) means, and no
# test-set statistics leak into preprocessing.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Five hidden layers (50, 55, 45, 40, 30 units), ReLU activations, Adam solver.
# random_state is fixed, like the train/test split, so reruns are comparable.
mlp = MLPClassifier(
    hidden_layer_sizes=(50, 55, 45, 40, 30),
    max_iter=500,
    solver='adam',
    alpha=0.02,
    activation='relu',
    random_state=42,
)
mlp.fit(X_train, y_train)
pred = mlp.predict(X_test)

train_pred = mlp.predict(X_train)
train_acc = accuracy_score(y_train, train_pred)
print("Training accuracy:", train_acc)

# Print the confusion matrix explicitly: a bare expression that is not the
# last line of a cell is evaluated and then discarded.
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

In [52]:
# Scale features: result = (value - mean) / standardDeviation.
# The scaler is fitted on the training split and applied as-is to the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fill missing values, if any, with each column's most frequent value.
# The imputer is fitted on the training split only; the test split is filled
# with transform() so it uses the training fill values instead of its own.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)

# L2-penalised logistic regression; the regularisation strength is picked
# from the Cs grid with 5-fold cross-validation.
model = LogisticRegressionCV(max_iter=10000, penalty='l2', solver='lbfgs', Cs=[0.001, 0.01, 0.1, 1, 10, 100], cv=5)

# training
model.fit(X_train, y_train)

# predict the result of testing data
y_pred = model.predict(X_test)

# check model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# print(cross_val_score(model, X, y, cv=5, error_score='raise'))
# Accuracy recorded from an earlier run (imputer then re-fitted on the test split): 0.6420222222222223

Accuracy: 0.6420222222222223


In [None]:
# Create the XGBoost binary classifier.
model = XGBClassifier(objective="binary:logistic",
                      random_state=42,
                      max_depth=6,  # 0.7380666666666666 noted here; presumably the accuracy seen with this depth
                      learning_rate=0.25)

# Train the model; eval_set reports the metric on the test split while fitting.
model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

# Predict the result of testing data.
y_pred = model.predict(X_test)

# Check model performance.
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# 5-fold cross-validation on the training split.
# The shuffle is seeded, like the split and the model, so the folds (and
# therefore the reported average) are the same on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
kf_cv_scores = cross_val_score(model, X_train, y_train, cv=kfold)
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())

In [None]:
# Fully connected binary classifier: five ReLU hidden layers that narrow from
# 240 to 15 units, followed by a single sigmoid output unit.
model = Sequential()
model.add(Dense(240, activation="relu"))
model.add(Dense(120, activation="relu"))
model.add(Dense(50, activation="relu"))
model.add(Dense(30, activation="relu"))
model.add(Dense(15, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

# Adam optimizer, binary cross-entropy loss, accuracy as the tracked metric.
optimizer = Adam(learning_rate=0.0005)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Checked once per epoch: write 'best_model.h5' only when the monitored
# 'accuracy' metric improves on the best value seen so far.
checkpoint = ModelCheckpoint(
    'best_model.h5',
    save_freq='epoch',
    save_best_only=True,
    monitor='accuracy',
    verbose=1,
)

# Train for 50 epochs on the training split with the checkpoint callback.
model.fit(X_train, y_train, callbacks=[checkpoint], epochs=50)

# Show the input shape of the model after fitting.
model.input_shape

In [None]:
from keras import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.metrics import Accuracy

# Rebuild the same layer stack as the training cell so the weights stored in
# 'best_model.h5' can be loaded into it.
model_from_file = Sequential([Dense(240, activation="relu"),
                              Dense(120, activation="relu"),
                              Dense(50, activation="relu"),
                              Dense(30, activation="relu"),
                              Dense(15, activation="relu"),
                              Dense(1, activation="sigmoid")
                             ])

optimizer = Adam(learning_rate=0.0005)

model_from_file.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Create the weight tensors before loading. The batch dimension is left
# unspecified (None) rather than pinned to 32, and the feature count is read
# from the data being evaluated instead of being hard-coded.
model_from_file.build((None, X_train.shape[1]))

model_from_file.load_weights('best_model.h5')

# evaluate() reports the loss and the compiled 'accuracy' metric.
print('Training Accuracy:')
model_from_file.evaluate(X_train, y_train, verbose=2)

print('Testing Accuracy:')
model_from_file.evaluate(X_test, y_test, verbose=2)