In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold



In [2]:
def null_vals(dataframe):
    '''Summarise missing values per column.

    Returns a DataFrame indexed by the column names of `dataframe` with two
    columns: 'null' (number of null entries in that column) and 'percent'
    (that count as a percentage of the row count, rounded to 3 decimals).
    Rows are ordered from the highest to the lowest percentage.
    '''
    n_rows = len(dataframe)
    summary = dataframe.isnull().sum().to_frame('null')
    summary['percent'] = ((summary['null'] / n_rows) * 100).round(3)
    return summary.sort_values('percent', ascending=False)


Read in Data and Check Formatting

In [3]:
# Load the training data; 'train.csv' is a relative path, so it is resolved
# against the notebook's current working directory.
df = pd.read_csv('train.csv')

FileNotFoundError: ignored

In [None]:
# Display the loaded training frame.
df

In [None]:
# Per-column null counts and percentages, most incomplete columns first.
null_vals(df)

No nulls

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# Number of columns whose dtype is int64.
(df.dtypes == 'int64').sum()

Only one non-numeric column, which is the target class.

In [None]:
# Column labels of the training frame.
df.columns

In [None]:
# Feature columns: every column except the first and the last one.
# NOTE(review): presumably the first column is an id and the last is 'target' —
# confirm against df.columns, since the slice is positional.
feature_cols = df.columns[1:-1]

In [None]:
# Standardise every feature column (zero mean, unit population std, ddof=0)
# ahead of the PCA check.
X = df[feature_cols].pipe(lambda raw: (raw - raw.mean()) / raw.std(ddof=0))

# Encode the class labels as integers, then expand them to one-hot rows.
# LE is kept at module level so the label <-> integer mapping stays available.
LE = LabelEncoder()
y = to_categorical(LE.fit_transform(df['target']))

In [None]:
# Correlate the numeric columns only, computed once and reused below. The
# non-numeric 'target' column cannot be correlated and makes DataFrame.corr
# raise on recent pandas versions.
corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(16, 6))
# Mask the upper triangle (diagonal included) because the matrix is symmetric.
# The builtin bool is used: the np.bool alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))
heatmap = sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, cmap='icefire')
heatmap.set_title('Correlations Heatmap', fontdict={'fontsize':18}, pad=16)

No visibly relevant correlations.

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA with all components kept and plot the running total of the
# explained-variance ratios against the component index.
pca = PCA()
pca.fit(X)
plt.plot(pca.explained_variance_ratio_.cumsum())
plt.xlabel('principle components')
plt.ylabel('explained varience')

It appears the data has already been through PCA, so it shouldn't be reduced further.

In [None]:
import plotly.offline as py
import plotly.express as px
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff

In [None]:
# Mean of every feature column within each target class
# (one row per class, one column per feature).
means = df.groupby('target')[feature_cols].mean()

# Transposed so the features run along the x-axis with one line per class.
# DataFrame.plot creates its own figure sized by `figsize`; a preceding
# plt.figure() call would only leave an extra, empty figure in the output.
means.T.plot(figsize=(15, 6), title='Mean Feature Values per Class', rot=90)


There appears to be some separation between the classes in their average feature values.

In [None]:
# Donut chart of how the rows are distributed across the target classes.
# The dark style is applied before the figure is created.
plt.style.use("dark_background")

# Count the classes once and reuse the result for both labels and wedge sizes.
class_counts = df.target.value_counts()
labels, sizes = class_counts.index, class_counts.values
colors = ['#003f5c','#2f4b7c', '#665191','#a05195', '#d45087','#f95d6a', '#ff7c43','#ffa600','#665191']

fig = plt.figure(figsize=(10,10))
plt.pie(sizes, colors=colors, startangle=90, frame=False, autopct='%.1f%%', pctdistance=0.75)

# A filled black disc drawn over the centre turns the pie into a donut.
centre_circle = plt.Circle((0,0), 0.5, color='black', fc='black', linewidth=0)
fig.gca().add_artist(centre_circle)

plt.axis('equal')
plt.tight_layout()
plt.title('Class Distribution', fontsize=20)
plt.legend(labels, bbox_to_anchor=(0.9,1), loc="upper left")
#plt.savefig('brakedown.png',bbox_inches='tight')
plt.show()

In [None]:
# Number of rows per target class, most frequent class first.
df.target.value_counts()

The classes are imbalanced. However, on initial inspection the model performs significantly worse with over- and undersampling, so the resampling code is left in (commented out) only for completeness.

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Reset X to the unscaled feature columns, replacing the standardised copy that
# was built earlier for the PCA check.
# NOTE(review): the MinMaxScaler below is instantiated but never fitted or
# applied in the visible cells — confirm whether min-max scaling was intended.
X = df[feature_cols]
scaler = MinMaxScaler()

Train/test split. SMOTE oversampling and NearMiss undersampling are included but left commented out.

In [None]:
# Stratified hold-out split: 25% of the rows are set aside, with a fixed seed
# so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=19930609,
)

# Optional oversampling of the training split with SMOTE (disabled)
#oversample = SMOTE(sampling_strategy='auto',random_state=101,k_neighbors=150, n_jobs=-1)
#X_over, y_over = oversample.fit_resample(X_train,y_train)

# Optional undersampling of the training split with NearMiss (disabled)
#undersample = NearMiss(version=2)
#X_under, y_under = undersample.fit_resample(X_train,y_train)

In [None]:
# Print, for each split, a two-column table of (class index, row count).
# y_over / y_under can be appended to both lists when resampling is enabled.
vals = [y_train, y_test]
names = ['y_train', 'y_test']
for j, one_hot in zip(names, vals):
    # Collapse the one-hot rows back to class indices before counting.
    class_ids, counts = np.unique(np.argmax(one_hot, axis=1), return_counts=True)
    print(f'target class occurences for: {j}')
    print(np.column_stack((class_ids, counts)))
    print('='*40)

In [None]:
!pip install tensorflow_addons
import tensorflow_addons as tfa
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, Embedding, Flatten, BatchNormalization, InputLayer
from tensorflow.keras.losses import CategoricalCrossentropy

In [None]:
# Column-wise maximum for the last 50 feature columns.
# NOTE(review): presumably a check that the feature values stay below the
# Embedding input_dim (380) used in the model cell — confirm.
print(X.max().tail(50))

In [None]:
# Input width and number of output units are read from the training arrays so
# the network always matches the data it is fitted on (y_train is one-hot, so
# its second dimension is the number of classes).
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

model = Sequential()

# https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw

model.add(InputLayer(input_shape=(n_features,)))

# Every feature value is used as an integer index into a table of 380
# 16-dimensional embeddings, so all inputs must lie in [0, 380).
# No input_length is passed: the sequence length is already fixed by the
# InputLayer, and a separately hard-coded length could contradict it.
model.add(Embedding(380, 16))
model.add(Flatten())

# Three shrinking fully connected layers, each followed by 20% dropout.
for hidden_units in (128, 64, 32):
    model.add(Dense(units=hidden_units, activation='relu'))
    model.add(Dropout(0.2))

# One softmax probability per class.
model.add(Dense(units=n_classes, activation='softmax'))

# The softmax layer already outputs probabilities, hence from_logits=False.
cat_cross = CategoricalCrossentropy(from_logits=False,
                                    label_smoothing=0,
                                    name='categorical_crossentropy')

adam = tfa.optimizers.AdamW(weight_decay=1e-7,
                            learning_rate=0.0001,
                            amsgrad=True,
                            name='AdamW')

model.compile(loss=cat_cross, optimizer=adam)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 3 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it the model keeps the weights of the final epoch, i.e.
# `patience` epochs after the best one.
early_stop = EarlyStopping(monitor='val_loss',
                           mode='min',
                           verbose=1,
                           patience=3,
                           restore_best_weights=True)

In [None]:
# Train for up to 600 epochs; the hold-out split is used as validation data
# and the early-stopping callback ends training when it triggers.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=600,
    callbacks=[early_stop],
    verbose=1,
)

In [None]:
# Tabulate the history stored on the model and plot every recorded series.
# NOTE(review): presumably the columns are 'loss' and 'val_loss', since the
# model is compiled without extra metrics and fitted with validation data.
model_loss = pd.DataFrame(model.history.history)
model_loss.plot()

In [None]:
# Evaluation on the hold-out split, left disabled.
# NOTE(review): y_test is one-hot encoded and model.predict presumably returns
# per-class probabilities, so both would need np.argmax(..., axis=1) before
# being passed to classification_report / confusion_matrix — confirm before
# re-enabling these lines.
#from sklearn.metrics import classification_report,confusion_matrix
#predictions = model.predict(X_test)
#predictions
#print(classification_report(y_test,predictions))
#print(confusion_matrix(y_test,predictions))

In [None]:
# Load the competition test set; 'test.csv' is resolved against the working
# directory.
final = pd.read_csv('test.csv')
# Select the test features by name with the same feature_cols used for
# training, so the model receives the same columns in the same order. A
# positional slice would silently pass on any extra or reordered column.
X2 = final[feature_cols]

In [None]:
# Predicted class probabilities for the test rows, one column per class,
# indexed by the test-set id and written out as the submission file.
class_probabilities = model.predict(X2)
predictions = pd.DataFrame(
    class_probabilities,
    columns=['Class_1','Class_2','Class_3','Class_4','Class_5','Class_6','Class_7','Class_8','Class_9'],
    index=pd.Index(final['id'], name='id'),
)
predictions.to_csv('submission_NN_final.csv', header=True, index=True)