In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 
import matplotlib.pyplot as plt 

In [None]:
# Mount Google Drive at /content/drive (the google.colab module is only present inside Colab).
# NOTE(review): the CSV in the next cell is read from a relative path, not from this mount — confirm the mount is needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the dataset from 'All_numbers.csv' (path relative to the working directory)
# and preview its first rows.
train_df = pd.read_csv('All_numbers.csv')
train_df.head()

In [None]:
# Drop the 'Unnamed: 0' column that came in with the CSV (no NaN handling happens here).
# errors="ignore" keeps this cell re-runnable: a second run no longer raises
# KeyError once the column is already gone.
train_df.drop(columns='Unnamed: 0', inplace=True, errors="ignore")

In [None]:
# Preview again now that the 'Unnamed: 0' column has been dropped.
train_df.head()

In [None]:
## Create our Text Vectorizer to index our vocabulary based on the train samples 
import tensorflow as tf
from keras.layers import TextVectorization

def vectorize_data(data):
    """Encode every string in ``data`` as a single integer vocabulary index.

    A fresh ``TextVectorization`` layer (at most 10000 tokens, output
    sequence length 1) is adapted on ``data`` itself and then applied to it,
    so each sample is truncated or padded to exactly one token id.

    Args:
        data: Iterable of strings (for example one DataFrame column). It is
            used both to learn the vocabulary and as the input to encode.

    Returns:
        NumPy array with one row per sample and a single column holding the
        token id.
    """
    vectorizer = TextVectorization(max_tokens=10000, output_sequence_length=1)
    # Learn the vocabulary by streaming the samples in batches of 128.
    text_ds = tf.data.Dataset.from_tensor_slices(data).batch(128)
    vectorizer.adapt(text_ds)

    # Wrap each sample in its own list so the layer receives one string per row.
    return vectorizer(np.array([[s] for s in data])).numpy()
    



In [None]:
# Column names grouped as "string" columns; the disabled loop below would have
# passed each one through vectorize_data and stored the resulting ids.
list_strings = [
    "City",
    "County",
    "State",
    "Timezone",
    "Wind_Direction",
    "Weather_Condition",
]

# for column in list_strings:
#   print("Hello")
#   counties = vectorize_data(train_df[column])
#   train_df[column] = counties[:, 0]

In [None]:
# Disabled helper: would have rewritten `column` in train_df so that
# class1 becomes 0 and class2 becomes 1.
# def process_binary(column, class1, class2):
#   train_df.loc[train_df[column] == class1, column] = 0
#   train_df.loc[train_df[column] == class2, column] = 1

# Column names grouped as "binary" columns.
list_binary = [
    'Amenity',
    'Bump',
    'Crossing',
    'Give_Way',
    'Junction',
    'No_Exit',
    'Railway',
    'Roundabout',
    'Station',
    'Stop',
    'Traffic_Calming',
    'Traffic_Signal',
    'Turning_Loop',
]

#list_strings = ["City", "County", "State", "Timezone", "Wind_Direction", "Weather_Condition"]

# Disabled: same vectorize_data encoding loop, applied to the binary columns.
# for column in list_binary:
#   print("Hello")
#   counties = vectorize_data(train_df[column])
#   train_df[column] = counties[:, 0]

In [None]:
# Preview the frame; the encoding loops in the cells above are commented out,
# so they have not modified any column.
train_df.head()

In [None]:
# Column names grouped as day/night indicators.
day_night_columns = [
    'Sunrise_Sunset',
    'Civil_Twilight',
    'Nautical_Twilight',
    'Astronomical_Twilight',
]
# Disabled: vectorize_data encoding loop for the columns above.
# for column in day_night_columns:
#   print("Hello")
#   counties = vectorize_data(train_df[column])
#   train_df[column] = counties[:, 0]

In [None]:
# Show the sorted distinct values found in the 'Station' column.
station_values = np.asarray(train_df['Station'])
print(np.unique(station_values))

In [None]:
# `x in series` tests the Series *index* (row labels), not the stored values,
# so the original expression was True for any frame with a row labelled 1000.
# Test membership against the column's values instead.
1000 in train_df['Station'].to_numpy()

In [None]:
# column = 'Side'
# counties = vectorize_data(train_df[column])
#train_df[column] = counties[:, 0]

In [None]:
# train_df.to_csv("All_numbers.csv")

In [None]:
# Remove the 'ID' and 'Country' columns from train_df in place, one at a time.
for dropped_column in ('ID', 'Country'):
    train_df.drop(dropped_column, axis=1, inplace=True)

In [None]:
# Presumably the columns with the most missing values (inferred from the name only).
# NOTE(review): 'Number' is listed here but is not among the columns dropped in
# the following cell — confirm whether that is intentional.
most_nan_columns = [
    'Number',
    'Wind_Chill(F)',
    'Humidity(%)',
    'Wind_Speed(mph)',
    'Precipitation(in)',
]

In [None]:
# Remove four weather-measurement columns from train_df in place, one at a time.
for dropped_column in (
    'Wind_Chill(F)',
    'Humidity(%)',
    'Wind_Speed(mph)',
    'Precipitation(in)',
):
    train_df.drop(dropped_column, axis=1, inplace=True)

In [None]:
# Target vector: 'Severity' shifted down by one. It is used later as the label
# for sparse_categorical_crossentropy training.
y = train_df["Severity"] - 1
np.unique(y)

# Normalize features (min-max scaling)

In [None]:
# Show the sorted distinct values found in the 'Bump' column.
bump_values = np.unique(train_df["Bump"])
print(bump_values)

In [None]:
import copy

# Independent deep copy of train_df, taken before the scaling cell overwrites its columns.
frames = copy.deepcopy(train_df)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of train_df independently with a fresh MinMaxScaler
# (default feature range) and write the result back into the frame.
# NOTE(review): each scaler is fitted on all rows before the train/test split,
# and 'Severity' is still in the frame here so it is scaled as well (the target
# `y` was extracted in an earlier cell) — confirm this is acceptable.
for column in train_df:
  print(column)
  # scikit-learn expects a 2-D (n_samples, 1) array for a single feature.
  column_values = np.array(train_df[column]).reshape(-1, 1)
  scaler = MinMaxScaler()
  scaled_values = scaler.fit_transform(column_values)
  print(np.max(scaled_values))
  train_df[column] = scaled_values


# Train and validation data processing

In [None]:
# Preview the frame after the min-max scaling loop has overwritten every column.
train_df.head()

In [None]:
# Remove the target column from the feature frame; the labels were already
# copied into `y` in an earlier cell.
train_df.drop(columns='Severity', inplace=True)

In [None]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

# Hold out 20% of the rows for evaluation; random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# (number of training rows, number of feature columns) of the training split.
X_train.shape

In [None]:
import tensorflow as tf
from keras.layers import Dense, BatchNormalization, Dropout
from keras.models import Sequential

# Fully connected classifier: five hidden stages of Dense(relu) -> Dropout(0.2)
# -> BatchNormalization with growing widths, followed by a 4-way softmax output.
hidden_units = (20, 32, 64, 128, 256)

# Take the input width from the training frame instead of hard-coding it, so
# the network keeps matching the data when the set of dropped columns changes.
n_features = X_train.shape[1]

model = Sequential()
for layer_index, units in enumerate(hidden_units):
    if layer_index == 0:
        # Only the first layer declares the input shape.
        model.add(Dense(units, input_shape=(n_features,), activation="relu"))
    else:
        model.add(Dense(units, activation="relu"))
    model.add(Dropout(0.2))
    model.add(BatchNormalization())
# One output unit per class; softmax yields class probabilities.
model.add(Dense(4, activation="softmax"))

In [None]:


# Callback that keeps only the weights of the best epoch seen so far.
# NOTE(review): this callback is not passed to model.fit in the training cell,
# so it has no effect until it is added there via callbacks=[checkpoint_callbk].
# NOTE(review): recent Keras versions may require the path to end in
# ".weights.h5" when save_weights_only=True — confirm against the installed version.
checkpoint_callbk = tf.keras.callbacks.ModelCheckpoint(
    "best_tiny_model", # file the best weights are saved to
    # The model is compiled with metrics=['accuracy'] only, so "val_accuracy" is
    # the validation metric that actually appears in the training logs; the
    # previous "val_macroF1" was never logged, so no epoch could count as best.
    monitor="val_accuracy",
    verbose=1, # print a message whenever an epoch improves on the best value
    mode="max", # the monitored metric should be maximized
    save_freq="epoch", # evaluate the save condition at the end of every epoch
    save_best_only=True, # only write when the monitored metric improves
    save_weights_only=True # store weights without the architecture; they only load into an identical architecture
)

In [None]:
#!pip install tensorflow_addons

In [None]:

# Adam optimizer with a learning rate of 1e-5; integer class labels are used
# directly via the sparse categorical cross-entropy loss.
opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(
    optimizer=opt,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Train for 5 epochs in batches of 128. The held-out split doubles as the
# validation set here and as the evaluation set in the prediction cells.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=128,
    verbose=1,
)

In [None]:
# test_value = np.array(X_test.iloc[1])
# test_value = np.reshape(test_value, (1, 37))
# model.predict(test_value)
# # test_value

# Predict

In [None]:
# Softmax outputs of the network for every held-out row (one column per output unit).
predictions = model.predict(X_test)

In [None]:
# Number of rows that were predicted.
print(len(predictions))

In [None]:
# Output vector of the network for the first held-out row.
print(predictions[0])

In [None]:
# np.unique flattens its input, so this prints every distinct output value
# across all rows and classes (can be very long), followed by the array shape.
print(np.unique(predictions))
print(predictions.shape)

In [None]:
# Index of the highest-scoring output unit for each row (0-based class id).
max_index = np.argmax(predictions, axis=1)
print(max_index.shape)

In [None]:
# Predicted 0-based class id of the first held-out row.
print(max_index[0])

In [None]:
# Shift the 0-based class ids up by one; adding a float keeps the result a
# float array, as before.
preds = max_index + 1.0

In [None]:
# Distinct predicted values after the +1 shift (stored as floats).
print(np.unique(preds))

In [None]:
# preds = []

# print(len(X_test["State"]))
# for i in range(len(X_test["State"])):
#   if(i%10000 == 0):
#     print(i)
#   test_value = np.array(X_test.iloc[i])
#   test_value = np.reshape(test_value, (1, 33))
#   pred = np.argmax(model.predict(test_value)) + 1
#   preds.append(pred)

In [None]:
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

# `y` (and therefore `y_test`) was shifted to 0-based class ids (Severity - 1)
# for training, while `preds` was shifted back up by one after the argmax.
# Put the ground truth on the same 1-based scale before scoring; otherwise
# every prediction is compared with a label that is off by one.
severity_true = y_test + 1

print("Accuracy:",metrics.accuracy_score(severity_true, preds))
print("F1 score:",metrics.f1_score(severity_true, preds, average='weighted'))

In [None]:
# Draw the recorded training curves on a single figure.
plt.figure()
for history_key, curve_label in (
    ("accuracy", "Train accuracy"),
    ("loss", "Train loss"),
    ("val_loss", "Validation Loss"),
):
    plt.plot(history.history[history_key], label = curve_label)
plt.legend()
plt.show()