In [319]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from keras.utils import to_categorical

In [320]:
print(tf.__version__)

2.10.1


In [321]:
#loading the training data
data = pd.read_csv("train.csv")
x = data.iloc[:, 1:-1].values
y = data.iloc[:, -1].values

x = np.array(x, dtype='float')
y = np.array(y)

num_classes = len(set(y))

In [322]:
#converting strings to numeric labels
label = 0
labels = {}
inverse_labels = {}
for i in set(y):
    labels[i] = label
    inverse_labels[label] = i
    label += 1

for i in range(len(y)):
    y[i] = labels[y[i]]

y = np.array(y, dtype='float')

In [323]:
#loading the testing data
x_test = pd.read_csv("test.csv").iloc[:, 1:].values

x_test = np.array(x_test, dtype='float')

In [324]:
#standardizing the data
x = (x - np.mean(x, axis=0))/np.std(x)
x_test = (x_test - np.mean(x_test, axis=0))/np.std(x_test)

In [325]:
#performing dimensionality reduction
pca = PCA(n_components=200)
p = pca.fit(x)

x = p.transform(x)
x_test = p.transform(x_test)

In [326]:
#performing lda
lda = LinearDiscriminantAnalysis(n_components=19)
l = lda.fit(x, y)

x = l.transform(x)
x_test = l.transform(x_test)

In [327]:
#clearing previous tensorflow session
tf.keras.backend.clear_session()

In [328]:
#configuring the model
model = tf.keras.models.Sequential(
    [
    tf.keras.layers.Flatten(input_shape=(x.shape[1],)),
    tf.keras.layers.Dense(512, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(256, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes, activation=tf.nn.softmax),
    ]
)

In [329]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [330]:
num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True)

cv_scores = []


for train_idx, test_idx in kfold.split(x):
    x_train, y_train = x[train_idx], y[train_idx]
    x_val, y_val = x[test_idx], y[test_idx]

    # Fit the model to the training data for this fold
    model.fit(x_train, y_train, epochs=10, batch_size=32, verbose=0)

    _, accuracy = model.evaluate(x_val, y_val, verbose=0)
    cv_scores.append(accuracy)

print('Cross-validation accuracy scores:', cv_scores)
print('Mean cross-validation accuracy:', np.mean(cv_scores))
print('Standard deviation of cross-validation accuracy:', np.std(cv_scores))

Cross-validation accuracy scores: [0.9221311211585999, 0.9465020298957825, 0.9629629850387573, 0.9876543283462524, 0.9958847761154175]
Mean cross-validation accuracy: 0.963027048110962
Standard deviation of cross-validation accuracy: 0.02695066008017051


In [331]:
# model.fit(x, y, epochs=50, batch_size=4)

In [332]:
# predicted_probabilities = model.predict(x_test)

In [333]:
# y_pred = np.argmax(predicted_probabilities, axis=1)

# predicted_categories = []

# for i in range(len(y_pred)):
#     predicted_categories.append(inverse_labels[y_pred[i]])

# ids = range(415)

# previous_submission = pd.read_csv("submission.csv")
# previous_submission.to_csv('prev.csv', index=False)
# results = pd.DataFrame({'ID': ids, 'Category': predicted_categories})
# results.to_csv('submission.csv', index=False)