# Imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import csv

from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier

2022-12-23 12:52:41.220480: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from pandas import MultiIndex, Int64Index


In [2]:
tf.__version__

'2.10.0'

# EDA

I first read both *.csv* and then explore the fields.

In [3]:
df_train = pd.read_csv('data/space_X_train.csv')
df_test = pd.read_csv('data/space_X_test.csv')
df_train.head()

Unnamed: 0.1,Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,target
0,0,0.474335,0.88164,-2.050543,-1.251935,-1.035151,-1.934367,4
1,1,-1.034675,1.741801,-1.660629,-1.555989,-0.337553,-2.473838,0
2,2,-0.563221,-0.688381,-0.318415,-1.845172,0.352366,-0.912928,1
3,3,-1.268179,2.770688,1.054193,2.830389,0.395093,0.677715,3
4,4,-1.21638,-0.391267,-2.898931,-0.913074,-2.171857,-2.36749,0


In [4]:
ProfileReport(df_train)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



I create three dataframes:
- *X_train_full*: Contains the sensor fields with all the rows of the train file.
- *y_train_full*: Contains the target field with all the rows of the train file.
- *X_test*: Contains the sensor fields with all rows of the test file.

Then I standardize the *X_train_full* and *X_test*.

In [5]:
X_train_full = df_train.iloc[:,1:-1]
y_train_full = df_train.iloc[:,-1]
X_test = df_test.iloc[:,1:]

scaler = StandardScaler()
X_train_full = scaler.fit_transform(X_train_full)
X_test = scaler.transform(X_test)

X_train_full

array([[ 0.43997585,  0.42288961, -1.15322059, -0.72510113, -0.56536308,
        -1.263044  ],
       [-0.53784019,  0.95594575, -0.88435133, -0.9359739 , -0.10067423,
        -1.7169496 ],
       [-0.23234483, -0.55007913,  0.04118381, -1.13653354,  0.35889932,
        -0.40361484],
       ...,
       [-0.41661369,  0.83000264,  0.92438802,  1.50698397,  0.3772632 ,
         0.98538328],
       [ 1.0867316 ,  0.61255705,  0.96118082, -0.62217069,  1.18113707,
         0.51812659],
       [ 0.07851461, -0.8639059 , -0.00762927, -1.49988438,  0.37891432,
        -0.39293648]])

I split *X_train_full* and *y_train_full* to create four dataframes:
- *X_train* and *y_train*: With these dataframe I train the model for the evalation.
- *X_evaluate* and *y_evaluate*: With these dataframes I calculate the metrics of the models.

In [6]:
X_train, X_evaluate, y_train, y_evaluate = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=123)
X_train.shape[0]

1575

# Model selection
I create different models to compare which one achives a better score.

### Random forest

In [7]:
rf = RandomForestClassifier(max_depth=2, random_state=123)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_evaluate)
print(classification_report(y_evaluate, rf_pred))

              precision    recall  f1-score   support

           0       0.51      0.73      0.60       105
           1       0.59      0.47      0.53       114
           2       0.58      0.20      0.30       110
           3       0.58      0.62      0.60       101
           4       0.60      0.85      0.70        95

    accuracy                           0.57       525
   macro avg       0.57      0.58      0.55       525
weighted avg       0.57      0.57      0.54       525



### Decision Tree

In [8]:
dt = DecisionTreeClassifier(random_state=123)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_evaluate)
print(classification_report(y_evaluate, dt_pred))

              precision    recall  f1-score   support

           0       0.73      0.79      0.76       105
           1       0.70      0.60      0.64       114
           2       0.65      0.62      0.64       110
           3       0.66      0.69      0.68       101
           4       0.67      0.74      0.70        95

    accuracy                           0.68       525
   macro avg       0.68      0.69      0.68       525
weighted avg       0.68      0.68      0.68       525



### Support Vector Machine (SVM)

In [9]:
svm = SVC(gamma='auto', random_state=123)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_evaluate)
print(classification_report(y_evaluate, svm_pred))

              precision    recall  f1-score   support

           0       0.81      0.71      0.76       105
           1       0.74      0.75      0.75       114
           2       0.72      0.65      0.69       110
           3       0.68      0.75      0.71       101
           4       0.73      0.80      0.76        95

    accuracy                           0.73       525
   macro avg       0.74      0.74      0.73       525
weighted avg       0.74      0.73      0.73       525



### Naives Bayes

In [10]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb_pred = gnb.predict(X_evaluate)
print(classification_report(y_evaluate, gnb_pred))

              precision    recall  f1-score   support

           0       0.53      0.70      0.61       105
           1       0.63      0.47      0.54       114
           2       0.54      0.27      0.36       110
           3       0.48      0.66      0.56       101
           4       0.64      0.71      0.67        95

    accuracy                           0.56       525
   macro avg       0.56      0.56      0.55       525
weighted avg       0.56      0.56      0.54       525



### XGBoost

In [11]:
bst = XGBClassifier(random_state=123)
bst.fit(X_train, y_train)
preds = bst.predict(X_evaluate)
print(classification_report(y_evaluate, gnb_pred))





              precision    recall  f1-score   support

           0       0.53      0.70      0.61       105
           1       0.63      0.47      0.54       114
           2       0.54      0.27      0.36       110
           3       0.48      0.66      0.56       101
           4       0.64      0.71      0.67        95

    accuracy                           0.56       525
   macro avg       0.56      0.56      0.55       525
weighted avg       0.56      0.56      0.54       525



### Artificial Neural Network (ANN)

In [12]:
tf.random.set_seed(123)

ann = tf.keras.models.Sequential()
ann.add(tf.keras.layers.Dense(units=50, input_shape=(6,), activation='relu'))
ann.add(tf.keras.layers.Dense(units=50, activation='relu'))
ann.add(tf.keras.layers.Dense(units=50, activation='relu'))
ann.add(tf.keras.layers.Dense(units=5, activation='softmax'))
ann.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
hist = ann.fit(X_train, tf.keras.utils.to_categorical(y_train), batch_size=32, epochs=100)
scores = ann.evaluate(X_evaluate, tf.keras.utils.to_categorical(y_evaluate), verbose=False)

Epoch 1/100


2022-12-23 12:52:55.499305: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-23 12:52:55.499591: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100


Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [13]:
print(ann.metrics_names[1], scores[1]*100)
print(ann.metrics_names[2], scores[2]*100)
print(ann.metrics_names[3], scores[3]*100)

accuracy 78.09523940086365
precision 79.7595202922821
recall 75.80952644348145


In [14]:
ann_predict = np.argmax(ann.predict(X_evaluate, verbose=False),axis=1)
print(classification_report(y_evaluate, ann_predict))

              precision    recall  f1-score   support

           0       0.90      0.75      0.82       105
           1       0.75      0.78      0.77       114
           2       0.72      0.72      0.72       110
           3       0.76      0.80      0.78       101
           4       0.79      0.86      0.82        95

    accuracy                           0.78       525
   macro avg       0.79      0.78      0.78       525
weighted avg       0.78      0.78      0.78       525



### Model tuning

The best model is the *ANN*. The next step is to search the best parameters to solve this problem.

#### Tune the number of neurons in the hidden layer

In [15]:
def create_model(neurons):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(neurons, input_shape=(6,), activation='relu'))
    model.add(tf.keras.layers.Dense(neurons, activation='relu'))
    model.add(tf.keras.layers.Dense(neurons, activation='relu'))
    model.add(tf.keras.layers.Dense(5, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = KerasClassifier(model=create_model, epochs=100, batch_size=32, verbose=False)
neurons = [15, 30, 50, 70, 85, 100, 115, 130, 150, 175, 200]
param_grid = dict(model__neurons=neurons)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5)
grid_result = grid.fit(X_train_full, tf.keras.utils.to_categorical(y_train_full))

print('Best: ', grid_result.best_score_, 'using ', grid_result.best_params_)
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

2022-12-23 12:53:29.567476: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-23 12:53:29.617537: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-23 12:53:29.634814: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable th

Best:  0.7985714285714286 using  {'model__neurons': 150}
0.757143 (0.018626) with: {'model__neurons': 15}
0.797143 (0.020942) with: {'model__neurons': 30}
0.793810 (0.020952) with: {'model__neurons': 50}
0.792381 (0.020169) with: {'model__neurons': 70}
0.785714 (0.021977) with: {'model__neurons': 85}
0.794286 (0.016673) with: {'model__neurons': 100}
0.782857 (0.016316) with: {'model__neurons': 115}
0.789524 (0.011025) with: {'model__neurons': 130}
0.798571 (0.020625) with: {'model__neurons': 150}
0.787143 (0.017856) with: {'model__neurons': 175}
0.783333 (0.013214) with: {'model__neurons': 200}


#### Tune batch size and training epochs

In [16]:
def create_model():
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(150, input_shape=(6,), activation='relu'))
    model.add(tf.keras.layers.Dense(150, activation='relu'))
    model.add(tf.keras.layers.Dense(150, activation='relu'))
    model.add(tf.keras.layers.Dense(5, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = KerasClassifier(model=create_model, verbose=False)
batch_size = [8, 16, 32, 64, 128]
epochs = [10, 25, 50, 75, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5)
grid_result = grid.fit(X_train_full, tf.keras.utils.to_categorical(y_train_full))

print('Best: ', grid_result.best_score_, 'using ', grid_result.best_params_)
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best:  0.800952380952381 using  {'batch_size': 16, 'epochs': 50}
0.758095 (0.020952) with: {'batch_size': 8, 'epochs': 10}
0.784286 (0.023212) with: {'batch_size': 8, 'epochs': 25}
0.792857 (0.020426) with: {'batch_size': 8, 'epochs': 50}
0.780000 (0.018418) with: {'batch_size': 8, 'epochs': 75}
0.770952 (0.022152) with: {'batch_size': 8, 'epochs': 100}
0.758571 (0.016673) with: {'batch_size': 16, 'epochs': 10}
0.788095 (0.014046) with: {'batch_size': 16, 'epochs': 25}
0.800952 (0.014877) with: {'batch_size': 16, 'epochs': 50}
0.781905 (0.012472) with: {'batch_size': 16, 'epochs': 75}
0.783810 (0.021157) with: {'batch_size': 16, 'epochs': 100}
0.761905 (0.013719) with: {'batch_size': 32, 'epochs': 10}
0.785714 (0.025951) with: {'batch_size': 32, 'epochs': 25}
0.789524 (0.013603) with: {'batch_size': 32, 'epochs': 50}
0.785714 (0.018070) with: {'batch_size': 32, 'epochs': 75}
0.786667 (0.007619) with: {'batch_size': 32, 'epochs': 100}
0.742857 (0.017431) with: {'batch_size': 64, 'epochs

#### Tune network weight initialization

In [17]:
def create_model(init_mode='uniform'):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(150, kernel_initializer=init_mode, input_shape=(6,), activation='relu'))
    model.add(tf.keras.layers.Dense(150, kernel_initializer=init_mode, activation='relu'))
    model.add(tf.keras.layers.Dense(150, kernel_initializer=init_mode, activation='relu'))
    model.add(tf.keras.layers.Dense(5, kernel_initializer=init_mode, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = KerasClassifier(model=create_model, epochs=50, batch_size=16, verbose=False)
init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
param_grid = dict(model__init_mode=init_mode)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5)
grid_result = grid.fit(X_train_full, tf.keras.utils.to_categorical(y_train_full))

print('Best: ', grid_result.best_score_, 'using ', grid_result.best_params_)
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best:  0.7966666666666666 using  {'model__init_mode': 'normal'}
0.789048 (0.023015) with: {'model__init_mode': 'uniform'}
0.780000 (0.026581) with: {'model__init_mode': 'lecun_uniform'}
0.796667 (0.019783) with: {'model__init_mode': 'normal'}
0.180000 (0.008053) with: {'model__init_mode': 'zero'}
0.790952 (0.008703) with: {'model__init_mode': 'glorot_normal'}
0.788571 (0.015894) with: {'model__init_mode': 'glorot_uniform'}
0.777143 (0.010712) with: {'model__init_mode': 'he_normal'}
0.780952 (0.016148) with: {'model__init_mode': 'he_uniform'}


# Evaluation

I train the *ANN* again with the parameters and calculate the metrics.

In [18]:
final_model = tf.keras.models.Sequential()
final_model.add(tf.keras.layers.Dense(units=150, kernel_initializer='normal', input_shape=(6,), activation='relu'))
final_model.add(tf.keras.layers.Dense(units=150, kernel_initializer='normal', activation='relu'))
final_model.add(tf.keras.layers.Dense(units=150, kernel_initializer='normal', activation='relu'))
final_model.add(tf.keras.layers.Dense(units=5, activation='softmax'))
final_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
hist = final_model.fit(X_train, tf.keras.utils.to_categorical(y_train), batch_size=16, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [19]:
scores = final_model.evaluate(X_evaluate, tf.keras.utils.to_categorical(y_evaluate), verbose=False)
print(final_model.metrics_names[1], scores[1]*100)
print(final_model.metrics_names[2], scores[2]*100)
print(final_model.metrics_names[3], scores[3]*100)

accuracy 78.85714173316956
precision_1 80.79208135604858
recall_1 77.7142882347107


In [20]:
evaluation_predict = np.argmax(final_model.predict(X_evaluate, verbose=False),axis=1)
print(classification_report(y_evaluate, evaluation_predict))

              precision    recall  f1-score   support

           0       0.84      0.85      0.84       105
           1       0.85      0.71      0.78       114
           2       0.75      0.70      0.72       110
           3       0.74      0.84      0.79       101
           4       0.77      0.86      0.82        95

    accuracy                           0.79       525
   macro avg       0.79      0.79      0.79       525
weighted avg       0.79      0.79      0.79       525



The *F1-score (macro)* is ***0.79***.

# Solution

Now I train the model with *X_train_full* and *y_train_full* which contains all the information from the train file and generate the solution with the dataframe *X_test* which contains all the rows from the test file.

In [21]:
final_hist = final_model.fit(X_train_full, tf.keras.utils.to_categorical(y_train_full), batch_size=16, epochs=50)
final_predict = np.argmax(final_model.predict(X_test, verbose=False),axis=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [22]:
f = open('predictions.csv', 'w')
writer = csv.writer(f)

writer.writerow(['final_status'])
for pred in final_predict:
    writer.writerow(str(pred))

f.close()