In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.layers import Dense
import tensorflow as tf
from tensorflow.keras import regularizers
from keras import Sequential

In [None]:
# Load the training data. low_memory=False disables chunked parsing, so each
# column is type-inferred once over the whole file.
# NOTE(review): the resulting column labels look like data values (e.g. '1.00E+00'),
# which suggests the CSV has no header row and its first record is being consumed
# as the header -- confirm before relying on the column names.
df = pd.read_csv('HIGGS_train.csv', low_memory=False)

# Turn the non-numeric placeholder strings "error" and "s" into NaN
df = df.replace(["error", "s"], float('nan'))

# The columns at positions 8 and 21 contain stray double quotes: strip them, then
# convert to float, coercing anything that is still not a number to NaN.
for quoted_col in df.columns[[8, 21]]:
    unquoted = df[quoted_col].str.replace('"', '')
    df[quoted_col] = pd.to_numeric(unquoted, errors='coerce')


In [None]:
# Summarise the loaded frame: number of rows and columns, column labels,
# non-null counts and the dtype of each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599999 entries, 0 to 599998
Data columns (total 29 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   1.00E+00    599999 non-null  float64
 1   8.69E-01    599999 non-null  float64
 2   -6.35E-01   599999 non-null  float64
 3   2.26E-01    599999 non-null  float64
 4   3.27E-01    599999 non-null  float64
 5   -6.90E-01   599999 non-null  float64
 6   7.54E-01    599999 non-null  float64
 7   -2.49E-01   599999 non-null  float64
 8   -1.09E+00   599999 non-null  float64
 9   0.00E+00    599999 non-null  float64
 10  1.37E+00    599999 non-null  float64
 11  -6.54E-01   599999 non-null  float64
 12  9.30E-01    599999 non-null  float64
 13  1.11E+00    599999 non-null  float64
 14  1.14E+00    599999 non-null  float64
 15  -1.58E+00   599999 non-null  float64
 16  -1.05E+00   599999 non-null  float64
 17  0.00E+00.1  599998 non-null  float64
 18  6.58E-01    599999 non-null  float64
 19  -1

In [None]:
# Class-balance check: count the rows whose label (column '1.00E+00') equals 1.
# Summing the boolean mask counts the True entries.
count = int((df['1.00E+00'] == 1.0).sum())

# Show the number of positive examples
print(count)

317571


317,571 of the 599,999 rows (about 53%) are labelled 1, so the remaining rows (about 47%) are labelled 0. The two classes are roughly evenly represented, so the dataset is not imbalanced.

In [None]:
# Drop every row that contains at least one NaN value.
dataset = df.dropna()
# Re-check the row count, non-null counts and dtypes after dropping the NaN rows.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 599996 entries, 0 to 599998
Data columns (total 29 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   1.00E+00    599996 non-null  float64
 1   8.69E-01    599996 non-null  float64
 2   -6.35E-01   599996 non-null  float64
 3   2.26E-01    599996 non-null  float64
 4   3.27E-01    599996 non-null  float64
 5   -6.90E-01   599996 non-null  float64
 6   7.54E-01    599996 non-null  float64
 7   -2.49E-01   599996 non-null  float64
 8   -1.09E+00   599996 non-null  float64
 9   0.00E+00    599996 non-null  float64
 10  1.37E+00    599996 non-null  float64
 11  -6.54E-01   599996 non-null  float64
 12  9.30E-01    599996 non-null  float64
 13  1.11E+00    599996 non-null  float64
 14  1.14E+00    599996 non-null  float64
 15  -1.58E+00   599996 non-null  float64
 16  -1.05E+00   599996 non-null  float64
 17  0.00E+00.1  599996 non-null  float64
 18  6.58E-01    599996 non-null  float64
 19  -1.05E-

In [None]:
# Drop rows that are exact duplicates of an earlier row.
clean_data = dataset.drop_duplicates()
# Re-check the row count, non-null counts and dtypes after de-duplication.
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 599194 entries, 0 to 599998
Data columns (total 29 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   1.00E+00    599194 non-null  float64
 1   8.69E-01    599194 non-null  float64
 2   -6.35E-01   599194 non-null  float64
 3   2.26E-01    599194 non-null  float64
 4   3.27E-01    599194 non-null  float64
 5   -6.90E-01   599194 non-null  float64
 6   7.54E-01    599194 non-null  float64
 7   -2.49E-01   599194 non-null  float64
 8   -1.09E+00   599194 non-null  float64
 9   0.00E+00    599194 non-null  float64
 10  1.37E+00    599194 non-null  float64
 11  -6.54E-01   599194 non-null  float64
 12  9.30E-01    599194 non-null  float64
 13  1.11E+00    599194 non-null  float64
 14  1.14E+00    599194 non-null  float64
 15  -1.58E+00   599194 non-null  float64
 16  -1.05E+00   599194 non-null  float64
 17  0.00E+00.1  599194 non-null  float64
 18  6.58E-01    599194 non-null  float64
 19  -1.05E-

In [None]:
# Preview the first 5 rows to get a feel for the structure of the cleaned data.
clean_data.head()

Unnamed: 0,1.00E+00,8.69E-01,-6.35E-01,2.26E-01,3.27E-01,-6.90E-01,7.54E-01,-2.49E-01,-1.09E+00,0.00E+00,...,-1.05E-02,-4.58E-02,3.10E+00,1.35E+00,9.80E-01,9.78E-01,9.20E-01,7.22E-01,9.89E-01,8.77E-01
0,1.0,0.908,0.329,0.359,1.5,-0.313,1.1,-0.558,-1.59,2.17,...,-1.14,-0.000819,0.0,0.302,0.833,0.986,0.978,0.78,0.992,0.798
1,1.0,0.799,1.47,-1.64,0.454,0.426,1.1,1.28,1.38,0.0,...,1.13,0.9,0.0,0.91,1.11,0.986,0.951,0.803,0.866,0.78
2,0.0,1.34,-0.877,0.936,1.99,0.882,1.79,-1.65,-0.942,0.0,...,-0.678,-1.36,0.0,0.947,1.03,0.999,0.728,0.869,1.03,0.958
3,1.0,1.11,0.321,1.52,0.883,-1.21,0.681,-1.07,-0.922,0.0,...,-0.374,0.113,0.0,0.756,1.36,0.987,0.838,1.13,0.872,0.808
4,0.0,1.6,-0.608,0.00707,1.82,-0.112,0.848,-0.566,1.58,2.17,...,-0.654,-1.27,3.1,0.824,0.938,0.972,0.789,0.431,0.961,0.958


In [None]:
# Define a function to replace outliers with NaN using the IQR method
def replace_outliers_iqr_with_nan(data, whisker=1.5):
    """Mask values lying outside the IQR fences with NaN.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles and ``IQR = Q3 - Q1``. For a DataFrame the
    fences are computed separately for every column.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Numeric data to screen. The input is not modified.
    whisker : float, default 1.5
        Fence multiplier. 1.5 is the conventional Tukey rule; larger values
        flag fewer points.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Same shape and labels as ``data``; values outside the fences (and values
        that were already NaN) are NaN.

    Raises
    ------
    ValueError
        If ``whisker`` is negative.

    Notes
    -----
    When a column's IQR is 0 (Q1 == Q3), both fences collapse onto that single
    value, so every entry different from it is masked. Columns dominated by one
    repeated value can therefore lose a large share of their rows.
    """
    if whisker < 0:
        raise ValueError("whisker must be non-negative")

    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr

    # Comparisons against NaN are False, so pre-existing NaN stay NaN.
    within_fences = (data >= lower_bound) & (data <= upper_bound)
    return data.where(within_fences, np.nan)

# Mask the outliers in every column of the de-duplicated data
data_with_outliers_replaced = replace_outliers_iqr_with_nan(clean_data)

# Show the masked frame (pandas abbreviates the printout of a large frame)
print(data_with_outliers_replaced)

# Per-column tally of the entries that are now NaN
null_count = data_with_outliers_replaced.isna().sum()
print("Number of null values in each column after outlier replacement:", null_count, sep="\n")

        1.00E+00  8.69E-01  -6.35E-01  2.26E-01  3.27E-01  -6.90E-01   
0            1.0     0.908      0.329   0.35900     1.500     -0.313  \
1            1.0     0.799      1.470  -1.64000     0.454      0.426   
2            0.0     1.340     -0.877   0.93600     1.990      0.882   
3            1.0     1.110      0.321   1.52000     0.883     -1.210   
4            0.0     1.600     -0.608   0.00707     1.820     -0.112   
...          ...       ...        ...       ...       ...        ...   
599994       0.0     0.680      0.223  -0.75700     0.418     -0.323   
599995       1.0     1.610     -1.620   0.21200     0.716     -0.906   
599996       1.0     1.070      0.364   0.34400     0.617     -1.430   
599997       1.0     1.180     -0.173  -1.46000     0.735     -0.753   
599998       0.0     0.771     -0.133  -1.02000     1.790     -1.650   

        7.54E-01  -2.49E-01  -1.09E+00  0.00E+00  ...  -1.05E-02  -4.58E-02   
0          1.100     -0.558    -1.5900      2.17  ...   

In [None]:
# Drop every row with at least one NaN, i.e. every row in which some value was
# flagged as an outlier and replaced by NaN.
new_dataset = data_with_outliers_replaced.dropna()

In [None]:
# Sanity check: after the drop, no column should contain any NaN.
new_dataset.isna().sum()

1.00E+00      0
8.69E-01      0
-6.35E-01     0
2.26E-01      0
3.27E-01      0
-6.90E-01     0
7.54E-01      0
-2.49E-01     0
-1.09E+00     0
0.00E+00      0
1.37E+00      0
-6.54E-01     0
9.30E-01      0
1.11E+00      0
1.14E+00      0
-1.58E+00     0
-1.05E+00     0
0.00E+00.1    0
6.58E-01      0
-1.05E-02     0
-4.58E-02     0
3.10E+00      0
1.35E+00      0
9.80E-01      0
9.78E-01      0
9.20E-01      0
7.22E-01      0
9.89E-01      0
8.77E-01      0
dtype: int64

In [None]:
# Inspect how many rows remain (and the column dtypes) after dropping the outlier rows.
new_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 316034 entries, 1 to 599998
Data columns (total 29 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   1.00E+00    316034 non-null  float64
 1   8.69E-01    316034 non-null  float64
 2   -6.35E-01   316034 non-null  float64
 3   2.26E-01    316034 non-null  float64
 4   3.27E-01    316034 non-null  float64
 5   -6.90E-01   316034 non-null  float64
 6   7.54E-01    316034 non-null  float64
 7   -2.49E-01   316034 non-null  float64
 8   -1.09E+00   316034 non-null  float64
 9   0.00E+00    316034 non-null  float64
 10  1.37E+00    316034 non-null  float64
 11  -6.54E-01   316034 non-null  float64
 12  9.30E-01    316034 non-null  float64
 13  1.11E+00    316034 non-null  float64
 14  1.14E+00    316034 non-null  float64
 15  -1.58E+00   316034 non-null  float64
 16  -1.05E+00   316034 non-null  float64
 17  0.00E+00.1  316034 non-null  float64
 18  6.58E-01    316034 non-null  float64
 19  -1.05E-

After removing all the outliers, we lost nearly half of the dataset (only 316,034 of the 599,194 rows remain). Dropping the outliers therefore does not seem reasonable, so we decided to keep them.

In [None]:
# The target column is the first one in the dataset
X = clean_data.iloc[:, 1:]  # Features (all columns except the first one)
y = clean_data.iloc[:, 0]  # Target (the first column)

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# min/max of the validation and test rows influence the training features
# (data leakage) and make the held-out scores optimistic.

# Hold out 10% of the rows as the testing set; the remainder is split again below.
# X_temp / X_test_raw are still unscaled at this point.
X_temp, X_test_raw, y_temp, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Split the remainder into the final training and validation sets.
# test_size is 1/9 because (1/9) * 0.9 = 0.1, so the validation set is 10% of the
# original dataset and the training set is 80%.
X_train_raw, X_cross_val_raw, y_train, y_cross_val = train_test_split(
    X_temp, y_temp, test_size=1/9, random_state=42
)

# Learn the per-feature min/max from the training rows only, then apply the same
# transformation to the held-out rows. Held-out values may therefore fall slightly
# outside [0, 1] when they exceed the range seen in training.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_cross_val = scaler.transform(X_cross_val_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-set statistics; kept so that any
# other cell reading X_normalized keeps working.
X_normalized = scaler.transform(X)

# Display the resulting training set
print("Training set:")
print(X_train)
print(y_train)

# Display the resulting cross-validation set
print("\nCross-validation (validation) set:")
print(X_cross_val)
print(y_cross_val)

#Display the resulting testing set
print("\nTesting set:")
print(X_test)
print(y_test)


Training set:
[[0.13455839 0.58065844 0.85057471 ... 0.18326387 0.14605636 0.16060247]
 [0.08713693 0.6882716  0.97701149 ... 0.02248039 0.05770887 0.07344728]
 [0.02418494 0.73662551 0.46896552 ... 0.04057311 0.07062877 0.09578609]
 ...
 [0.00343806 0.45576132 0.39683908 ... 0.00944191 0.05143349 0.04755458]
 [0.00794309 0.77983539 0.63390805 ... 0.11074649 0.13621262 0.19614148]
 [0.26141079 0.69506173 0.26925287 ... 0.03735011 0.07407407 0.08986292]]
404280    0.0
87279     1.0
354232    0.0
367450    1.0
161039    0.0
         ... 
463325    1.0
31381     0.0
484129    0.0
348187    0.0
284702    1.0
Name: 1.00E+00, Length: 479354, dtype: float64

Cross-validation (validation) set:
[[0.2033195  0.13580247 0.98850575 ... 0.0476051  0.06902916 0.06447792]
 [0.00806165 0.38909465 0.06034483 ... 0.01918414 0.1005291  0.12506346]
 [0.05844695 0.28806584 0.95114943 ... 0.07338905 0.05561708 0.05330851]
 ...
 [0.08713693 0.39547325 0.79022989 ... 0.102689   0.06582995 0.07073955]
 [0.0469

After splitting the dataset into training set (80%), cross validation set (10%) and testing set (10%), it is time to implement, train and test our models. The first model we are trying is an sklearn linear regression model.

In [None]:
# Baseline: ordinary least-squares regression fitted on the training split
# (fit() returns the fitted estimator itself)
linearmodel = LinearRegression().fit(X_train, y_train)

# Predictions for the two held-out splits
y_linear_cross_val_pred = linearmodel.predict(X_cross_val)
y_linear_test_pred = linearmodel.predict(X_test)

# Mean Squared Error (MSE) and R-squared (R²) on the validation set
mse_cross_val = mean_squared_error(y_cross_val, y_linear_cross_val_pred)
r2_cross_val = r2_score(y_cross_val, y_linear_cross_val_pred)

# Same two metrics on the testing set
mse_test = mean_squared_error(y_test, y_linear_test_pred)
r2_test = r2_score(y_test, y_linear_test_pred)

# Report
print("Validation set performance:")
print(f"Mean Squared Error (MSE): {mse_cross_val}")
print(f"R-squared (R²): {r2_cross_val}")

print("\nTesting set performance:")
print(f"Mean Squared Error (MSE): {mse_test}")
print(f"R-squared (R²): {r2_test}")


Validation set performance:
Mean Squared Error (MSE): 0.2251016915732526
R-squared (R²): 0.0967970033167469

Testing set performance:
Mean Squared Error (MSE): 0.22504989280098028
R-squared (R²): 0.09614997678512138


The MSE (about 0.225) is large for a target that only takes the values 0 and 1, so linear regression is far from optimal. Moreover, R² is close to 0 (about 0.10), meaning the model explains only around 10% of the variance in the target. Looking back at the dataset, the target is binary, so this is a binary classification problem rather than a regression problem. Our next step is therefore to try logistic regression.

In [None]:
# Logistic regression fitted on the training split; max_iter is set to 1000
# iterations (fit() returns the fitted estimator itself)
logisticmodel = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predicted class labels for the two held-out splits
y_logistic_cross_val_pred = logisticmodel.predict(X_cross_val)
y_logistic_test_pred = logisticmodel.predict(X_test)

# Fraction of correctly classified samples on each split
accuracy_logistic_cross_val = accuracy_score(y_cross_val, y_logistic_cross_val_pred)
accuracy_logistic_test = accuracy_score(y_test, y_logistic_test_pred)

# Report
print("Validation set performance:")
print(f"Accuracy: {accuracy_logistic_cross_val}")

print("\nTesting set performance:")
print(f"Accuracy: {accuracy_logistic_test}")


Validation set performance:
Accuracy: 0.6411381842456609

Testing set performance:
Accuracy: 0.6405040053404539


This accuracy (about 64%) is not very high, as we are aiming for an accuracy in the mid-70s. We therefore want to try more complex models. The next model we try is a decision tree.

In [None]:
# Create a Decision Tree model and fit it to the training data.
# random_state is fixed (42, as everywhere else in this notebook) because the tree
# builder permutes the features at each split; without a seed, repeated runs can
# produce different trees and therefore different accuracies.
treemodel = DecisionTreeClassifier(random_state=42)
treemodel.fit(X_train, y_train)

# Predict the target values for the validation set
y_tree_cross_val_pred = treemodel.predict(X_cross_val)

# Calculate the accuracy for the validation set
accuracy_tree_cross_val = accuracy_score(y_cross_val, y_tree_cross_val_pred)

print("Validation set performance:")
print(f"Accuracy: {accuracy_tree_cross_val}")

# Predict the target values for the testing set
y_tree_test_pred = treemodel.predict(X_test)

# Calculate the accuracy for the testing set
accuracy_tree_test = accuracy_score(y_test, y_tree_test_pred)

print("\nTesting set performance:")
print(f"Accuracy: {accuracy_tree_test}")


Validation set performance:
Accuracy: 0.6402703604806409

Testing set performance:
Accuracy: 0.6352803738317757


The accuracy has not improved; in fact, it has decreased slightly (about 63.5% on the testing set, versus about 64.1% for logistic regression). We therefore need a different approach. The next attempt uses XGBoost.

Since all the models tried so far fall well short of 70% accuracy, we decided to run a hyperparameter search to find good parameters for our XGBoost model. The cell below does this with scikit-learn's `RandomizedSearchCV` (50 sampled candidates, 5-fold cross-validation).

In [None]:
# Base estimator: XGBoost classifier with a logistic objective for the 0/1 target
xgb_clf = XGBClassifier(objective="binary:logistic")

# Candidate values for each hyperparameter; the search samples combinations of these
param_dist = dict(
    n_estimators=np.arange(100, 500, 50),
    learning_rate=np.linspace(0.01, 0.2, 20),
    max_depth=np.arange(3, 10),
    min_child_weight=np.arange(1, 10),
    subsample=np.linspace(0.5, 1, 6),
    colsample_bytree=np.linspace(0.5, 1, 6),
    gamma=np.linspace(0, 0.5, 11),
    reg_alpha=np.linspace(0, 0.5, 11),
    reg_lambda=np.linspace(0, 0.5, 11),
)

# Randomized search: 50 sampled candidates, each scored by 5-fold cross-validated
# accuracy (250 fits in total), run on all available cores
random_search = RandomizedSearchCV(
    xgb_clf,
    param_dist,
    n_iter=50,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    verbose=2,
    random_state=42,
)

# Run the search on the training split only
random_search.fit(X_train, y_train)

# Report the best candidate and its mean cross-validated accuracy
print("Best parameters found: ", random_search.best_params_)
print("Best accuracy score found: ", random_search.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters found:  {'subsample': 1.0, 'reg_lambda': 0.25, 'reg_alpha': 0.30000000000000004, 'n_estimators': 350, 'min_child_weight': 8, 'max_depth': 9, 'learning_rate': 0.060000000000000005, 'gamma': 0.2, 'colsample_bytree': 0.8}
Best accuracy score found:  0.7428476682365057


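The hyperparameter search for XGBoost took nearly 10 hours to complete. For the next model, a neural network, we therefore went straight to a randomized search, which is cheaper than an exhaustive grid search because it only evaluates a fixed number of sampled candidates. The search below uses scikit-learn's `MLPClassifier`; the TensorFlow network is built afterwards.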

In [None]:
# MLPClassifier is already imported in the notebook's import cell, so it is not
# re-imported here.
# NOTE: this cell rebinds `param_dist` and `random_search`; after it runs, those
# names no longer refer to the XGBoost search.

# Define the parameter space for the randomized search
param_dist = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
    'activation': ['relu'],
    'solver': ['adam', 'sgd'],
    'alpha': np.logspace(-5, -1, 5),
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': np.logspace(-5, -1, 5),
    'max_iter': [500, 1000, 2000]
}

# Create an MLPClassifier model (seeded so the fits are repeatable)
mlp = MLPClassifier(random_state=42)

# Perform the randomized search: 10 sampled candidates, 3-fold cross-validated accuracy
random_search = RandomizedSearchCV(mlp, param_distributions=param_dist, n_iter=10, cv=3, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# Get the best parameters and score
print("Best parameters found: ", random_search.best_params_)
print("Best accuracy found: ", random_search.best_score_)


Best parameters found:  {'solver': 'adam', 'max_iter': 2000, 'learning_rate_init': 0.001, 'learning_rate': 'invscaling', 'hidden_layer_sizes': (50,), 'alpha': 0.01, 'activation': 'relu'}
Best accuracy found:  0.7115951832300221


After the XGBoost hyperparameter search reported a cross-validated accuracy of about 74.3% with the hyperparameters shown above, we plugged those values in and tuned them manually to try to reach a better accuracy. The best model we reached by manual tuning is the following:

In [None]:
# Manually tuned XGBoost classifier, fitted on the training split
boostmodel = XGBClassifier(
    eval_metric='logloss',
    n_estimators=1000,
    max_depth=14,
    learning_rate=0.06,
    min_child_weight=7,
    gamma=8,
    subsample=1,
    reg_lambda=15,
    reg_alpha=15,
)
boostmodel.fit(X_train, y_train)

# Training accuracy, reported so it can be compared with the held-out scores
y_boost_train_pred = boostmodel.predict(X_train)
accuracy_boost_train = accuracy_score(y_train, y_boost_train_pred)

print("Training set performance:")
print(f"Accuracy: {accuracy_boost_train}")

# Validation accuracy
y_boost_cross_val_pred = boostmodel.predict(X_cross_val)
accuracy_boost_cross_val = accuracy_score(y_cross_val, y_boost_cross_val_pred)

print("Validation set performance:")
print(f"Accuracy: {accuracy_boost_cross_val}")

# Testing accuracy
y_boost_test_pred = boostmodel.predict(X_test)
accuracy_boost_test = accuracy_score(y_test, y_boost_test_pred)

print("\nTesting set performance:")
print(f"Accuracy: {accuracy_boost_test}")


Training set performance:
Accuracy: 0.7751327828702795
Validation set performance:
Accuracy: 0.7471628838451269

Testing set performance:
Accuracy: 0.7444425901201602


After the randomized search over `MLPClassifier` settings reported a cross-validated accuracy of about 71.2% with the hyperparameters shown above, we took those values as a starting point and tuned them manually to try to reach a better accuracy. The best model we reached by manual tuning is the following:

In [None]:
def custom_loss(y_true, y_pred):
    """Binary cross-entropy between the 0/1 labels and the sigmoid outputs.

    A loss function only receives the labels and the model's predictions (one
    sigmoid output per sample here); it never sees the 28 input features, so a
    per-feature weight vector cannot be applied at this point. Multiplying such a
    vector into the prediction error merely broadcasts it and rescales the loss
    by a constant.

    Args:
        y_true: ground-truth labels supplied by Keras.
        y_pred: predicted probabilities from the final sigmoid layer.

    Returns:
        The binary cross-entropy for each sample; Keras averages it over the batch.
    """
    return tf.keras.losses.binary_crossentropy(y_true, y_pred)

# Fully connected network: four ReLU hidden layers and a single sigmoid output
# unit that yields the predicted probability of class 1
model = tf.keras.Sequential([
  tf.keras.layers.Dense(900, activation='relu'),
  tf.keras.layers.Dense(700, activation='relu'),
  tf.keras.layers.Dense(500, activation='relu'),
  tf.keras.layers.Dense(100, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss=custom_loss, metrics=['accuracy'])

# Train the model, reporting the validation metrics after every epoch
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_cross_val, y_cross_val))

# Evaluate the model on the train, cross validation and test data
train_loss, train_accuracy = model.evaluate(X_train, y_train)
test_loss, test_accuracy = model.evaluate(X_test, y_test)
cv_loss, cv_accuracy = model.evaluate(X_cross_val, y_cross_val)

# Print the train, cross validation and test accuracies

print('Train accuracy:', train_accuracy)
print('Cross Val accuracy:', cv_accuracy)
print('Test accuracy:', test_accuracy)