In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import re
import random
import warnings

from tensorflow import keras as ks
from tensorflow.keras import layers
from keras.models import load_model

import os
# Print the full path of every file under the Kaggle input directory so the
# available dataset files show up in the cell output.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)
        
# Matches every character that is NOT on the keep-list. Note that `\+-\/`
# inside the class is a range from '+' to '/', so ',' and '.' are kept
# alongside '+', '-' and '/'.
_UNWANTED_CHARS = re.compile(r'[^(a-zA-Z0-9)\s\*\+-\/\(\)=&|]')


def clean_data(text):
    """Return *text* lower-cased with all non-whitelisted characters removed.

    Characters that survive: ASCII letters and digits, whitespace, and the
    symbols ``( ) * + , - . / = & |``. Everything else (quotes, angle
    brackets, underscores, non-ASCII letters, ...) is deleted, not replaced.
    """
    return _UNWANTED_CHARS.sub('', text.lower())

# ---------------------------------------------------------------------------
# End-to-end script: load the Stack Overflow question-quality CSVs, clean and
# tokenize the question bodies, train a stacked-LSTM 3-class classifier, and
# report loss/accuracy on the validation split.
# ---------------------------------------------------------------------------

# Single source of truth for values that were previously repeated inline.
data_dir = "/kaggle/input/60k-stack-overflow-questions-with-quality-rate"
vocab_size = 10000  # shared by the Tokenizer (num_words) and Embedding (input_dim)
label_to_index = {'LQ_CLOSE': 0, 'LQ_EDIT': 1, 'HQ': 2}

train = pd.read_csv(os.path.join(data_dir, "train.csv"))
valid = pd.read_csv(os.path.join(data_dir, "valid.csv"))

# Every body is padded/truncated to this many token ids before entering the model.
maxlength = 75

# Normalize the raw question bodies (lower-case + character whitelist).
train['Body'] = train['Body'].apply(clean_data)
valid['Body'] = valid['Body'].apply(clean_data)

# The vocabulary is fitted on the training split only and then reused for the
# validation split, so validation text does not influence the word index.
tokenizer = ks.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
)
tokenizer.fit_on_texts(train['Body'])
train_x = ks.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(train['Body']), maxlen=maxlength
)
valid_x = ks.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(valid['Body']), maxlen=maxlength
)

# Map the string labels in column 'Y' to class indices, then one-hot encode
# them to match the categorical cross-entropy loss used below.
num_classes = len(label_to_index)
train_y = ks.utils.to_categorical(train['Y'].map(label_to_index), num_classes=num_classes)
valid_y = ks.utils.to_categorical(valid['Y'].map(label_to_index), num_classes=num_classes)

# Embedding -> LSTM (full sequence) -> LSTM (last state) -> softmax over classes.
model = ks.Sequential()
model.add(layers.Embedding(input_length=maxlength, input_dim=vocab_size, output_dim=128))
model.add(layers.LSTM(64, return_sequences=True))  # feeds the whole sequence to the next LSTM
model.add(layers.LSTM(64))
model.add(layers.Dense(num_classes, activation="softmax"))
model.summary()

# NOTE(review): learning_rate=0.55 is kept from the original; it is a large
# step size for plain SGD -- confirm it is intentional.
model.compile(
    optimizer=ks.optimizers.SGD(learning_rate=0.55),
    loss=ks.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)
model.fit(train_x, train_y, batch_size=128, epochs=25)

# Final metrics on the held-out validation split.
loss, acc = model.evaluate(valid_x, valid_y, verbose=1)
# Fix: the original print call was missing its closing parenthesis.
print('Loss:\t\t', loss, '\nAccuracy:\t', acc)

/kaggle/input/60k-stack-overflow-questions-with-quality-rate/valid.csv
/kaggle/input/60k-stack-overflow-questions-with-quality-rate/train.csv
/kaggle/input/stackoverflow/cleaned_train.csv
/kaggle/input/stackoverflow/cleaned_valid.csv
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 75, 128)           1280000   
_________________________________________________________________
lstm_6 (LSTM)                (None, 75, 64)            49408     
_________________________________________________________________
lstm_7 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 195       
Total params: 1,362,627
Trainable params: 1,362,627
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss:		 0.5160810351371765 
Accuracy:	 0.6659333109855652
Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 75, 128)           1280000   
_________________________________________________________________
lstm_18 (LSTM)               (None, 75, 64)            49408     
_________________________________________________________________
lstm_19 (LSTM)               (None, 64)                33024     
_________________________________________________________________
dense_9 (Dense)              (None, 3)                 195       
Total params: 1,362,627
Trainable params: 1,362,627
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss:		 0.4548921287059784 
Accuracy:	 0.6972666382789612
Model: "sequential_10"
_____________