In [None]:
import numpy as np 
import pandas as pd 
pd.plotting.register_matplotlib_converters()
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout,Bidirectional,GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam,SGD,RMSprop
from sklearn.model_selection import train_test_split
import optuna
import time
import string
import os

In [None]:
# Collect every file under /kaggle/input whose name contains "Articles".
# Paths are sorted so the row order of `data` is the same on every run
# (os.walk gives no ordering guarantee).
article_paths = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
    if "Articles" in filename
)
if not article_paths:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError("No file containing 'Articles' found under /kaggle/input")

full_list = [pd.read_csv(path) for path in article_paths]
# ignore_index=True gives one continuous RangeIndex instead of repeating
# each file's own 0..n-1 labels.
data = pd.concat(full_list, ignore_index=True)

In [None]:
# Preview the first rows of the concatenated articles frame.
data.head()

In [None]:
# (rows, columns) of the concatenated articles frame.
data.shape

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Number of articles published by each news desk.
# countplot tallies rows per category; the previous barplot(y=data.index)
# drew the *mean row index* per desk, which is not an article count.
fig, ax = plt.subplots(figsize=(16, 8))
desk_order = data["newDesk"].value_counts().index  # most frequent desk first
sns.countplot(x=data["newDesk"], order=desk_order, ax=ax)
# Labels are applied after plotting because seaborn sets its own axis labels
# when it draws, overwriting anything set beforehand.
ax.set(title="Headline categories", xlabel="News desk", ylabel="Number of articles")
ax.tick_params(axis="x", rotation=90)  # desk names are long; keep them legible
plt.show()

In [None]:
# Missing values per column again; `data` has not been modified since the
# earlier null check, so this repeats that result.
data.isnull().sum()

In [None]:
# Keep only the "headline" column; from here on `data` is a Series, not a DataFrame.
# NOTE: not idempotent - re-running this cell looks up the label "headline" in the
# Series itself and fails. The "newDesk" plot above must run before this cell.
data=data["headline"]

In [None]:
# First few raw headlines.
data.head()

In [None]:
def text_preprocessing(txt):
    """Normalise one headline for tokenisation.

    Removes ASCII punctuation, lower-cases the text, drops every non-ASCII
    character (curly quotes, accented letters, ...) and trims surrounding
    whitespace.

    Parameters
    ----------
    txt : str
        Raw headline. Any non-string value (e.g. NaN from a missing cell)
        is treated as an empty headline.

    Returns
    -------
    str
        The cleaned headline, possibly empty.
    """
    if not isinstance(txt, str):
        return ""
    txt = "".join(c for c in txt if c not in string.punctuation).lower()
    # str.encode returns a new object; the result has to be assigned back,
    # otherwise the non-ASCII filtering silently does nothing.
    txt = txt.encode("utf8").decode("ascii", "ignore")
    # Strip last: dropping characters above can expose leading/trailing spaces.
    return txt.strip()

In [None]:
# Clean every headline; the function is passed directly, no wrapper lambda needed.
data = data.apply(text_preprocessing)

In [None]:
# First few headlines after text_preprocessing.
data.head()

In [None]:
def remove(txt):
    """Return `txt` unchanged, or None when it is the placeholder headline.

    Headlines are already lower-cased at this point, so the placeholder is
    matched as "unknown". The comparison uses `!=` (value equality); `is not`
    compares object identity and is effectively always true for strings read
    from a file.
    """
    if txt != "unknown":
        return txt
    return None


# Map placeholders to None, then drop them so they never reach the tokenizer.
data = data.apply(remove).dropna()

In [None]:
# Number of headlines left after the cleaning steps above.
data.shape

In [None]:
# Build the word -> integer index vocabulary from the cleaned headlines.
# `tokenizer` is a notebook-level global that n_grams() and generate_text() read.
tokenizer=Tokenizer()
tokenizer.fit_on_texts(data)

In [None]:
# First ten words in the fitted vocabulary.
list(tokenizer.word_index.keys())[:10]

In [None]:
# Vocabulary size used to dimension the Embedding input and the softmax output.
# +1 because Keras Tokenizer indices start at 1; index 0 is left for padding.
v_size=len(tokenizer.word_index)+1

In [None]:
def n_grams(data):
    """Expand each text into all of its token-id prefixes of length >= 2.

    A text encoded as [t1, t2, t3] contributes [t1, t2] and [t1, t2, t3];
    texts that encode to fewer than two tokens contribute nothing. Encoding
    uses the notebook-level `tokenizer`.

    Returns a flat list of prefixes, in input order.
    """
    prefixes = []
    for text in data:
        encoded = tokenizer.texts_to_sequences([text])[0]
        prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return prefixes

In [None]:
# Build the prefix sequences for every headline and show the first ten.
seq=n_grams(data)
seq[:10]

In [None]:
# Length of the longest prefix sequence; every sequence is padded to this length.
max_seq_len = max(map(len, seq))
max_seq_len

In [None]:
# Left-pad ("pre") every sequence to max_seq_len so that the final column
# always holds the last token of the prefix, i.e. the word to predict.
seq = np.array(pad_sequences(seq,maxlen=max_seq_len,padding="pre"))

In [None]:
# Inputs are all columns but the last; the target is the last column (next word id).
X, y = seq[:, :-1], seq[:, -1]

In [None]:
# Inspect the padded input matrix.
X

In [None]:
# One-hot encode the target word ids into v_size columns (a dense matrix with
# one row per sample - this can be large for a big vocabulary).
# NOTE: overwrites `y`; re-running this cell alone would re-encode the
# already one-hot matrix, so re-run the X/y cell first.
y=to_categorical(y,num_classes=v_size)

In [None]:
# Inspect the one-hot target matrix.
y

In [None]:
# Hold out 33% of the samples for evaluation; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Look at the first held-out target row.
list(y_test)[:1]

In [None]:
def objective(trial):
    """Optuna objective: train one stacked-LSTM next-word model and score it.

    Samples the embedding size, epoch budget, learning rate, batch size,
    optimizer, dropout rate and the three LSTM layer widths from `trial`,
    trains on (X_train, y_train) and returns the categorical cross-entropy
    on (X_test, y_test), which the study minimises.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Loss on the held-out test split (lower is better).
    """
    # Release graph/layer state left over from the previous trial.
    tf.keras.backend.clear_session()

    embedding_size = trial.suggest_int("embedding_size", 10, 100, step=5)
    epochs = trial.suggest_int("epochs", 10, 100, step=10)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    batch_size = trial.suggest_int("batch_size", 16, 64, step=8)
    # Categorical choices must be plain values (str/int/float/bool/None), so the
    # optimizer is sampled by name and resolved to its class below.
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "SGD", "RMSprop"])
    # step has to fit inside [low, high]; the previous step=0.5 exceeded the
    # range, which collapses the search to the single value 0.1.
    dropout = trial.suggest_float("dropout", 0.1, 0.3, step=0.05)
    size1 = trial.suggest_int("size1", 400, 500, step=10)
    size2 = trial.suggest_int("size2", 300, 400, step=10)
    size3 = trial.suggest_int("size3", 200, 300, step=10)

    optimizer_classes = {"Adam": Adam, "SGD": SGD, "RMSprop": RMSprop}

    model = Sequential([
        tf.keras.Input(shape=(max_seq_len - 1,)),
        Embedding(v_size, embedding_size),
        LSTM(size1, return_sequences=True),
        Dropout(dropout),
        LSTM(size2, return_sequences=True),
        Dropout(dropout),
        LSTM(size3),
        Dropout(dropout),
        Dense(v_size, activation='softmax'),
    ])
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # newer Keras releases reject.
    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer_classes[optimizer_name](learning_rate=learning_rate),
        metrics=['accuracy'],
    )

    # val_accuracy only exists when validation data is supplied. A slice of the
    # training set is used so the test split stays untouched until scoring.
    callback = EarlyStopping(monitor="val_accuracy", patience=2, restore_best_weights=True)
    model.fit(
        X_train,
        y_train,
        epochs=epochs,
        verbose=1,
        batch_size=batch_size,
        validation_split=0.1,
        callbacks=[callback],
    )

    test_loss, _test_acc = model.evaluate(X_test, y_test, verbose=0)
    return test_loss

In [None]:
# Run the hyper-parameter search. create_study() is called without a direction,
# so Optuna minimises - consistent with `objective` returning a loss.
study = optuna.create_study()
# Wall-clock timing of the whole search (all 5 trials together).
start = time.time()
study.optimize(objective, n_trials=5)
end = time.time()

In [None]:
# Hyper-parameters of the trial with the lowest objective value.
best_params = study.best_params
print(best_params)
# The elapsed time covers the entire study (every trial), not one model fit.
print("model took %0.2f seconds to train" % (end - start))

In [None]:
# Final model: the same architecture that `objective` searches over, sized with
# the best hyper-parameters found by the study. An empty Sequential([]) has no
# layers, so it can be neither summarised nor trained.
model = Sequential([
    tf.keras.Input(shape=(max_seq_len - 1,)),
    Embedding(v_size, best_params["embedding_size"]),
    LSTM(best_params["size1"], return_sequences=True),
    Dropout(best_params["dropout"]),
    LSTM(best_params["size2"], return_sequences=True),
    Dropout(best_params["dropout"]),
    LSTM(best_params["size3"]),
    Dropout(best_params["dropout"]),
    Dense(v_size, activation='softmax'),
])

In [None]:
# Print the layer-by-layer summary of the final model.
model.summary()

In [None]:
# Compile and train the final model.
# `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
# releases reject.
model.compile(
    loss='categorical_crossentropy',
    optimizer=SGD(learning_rate=6.448221393305715e-05),
    metrics=['accuracy'],
)
# EarlyStopping can only monitor val_accuracy when validation data is passed to
# fit(); without it the callback warns every epoch and never stops training.
callback = EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True)
model.fit(
    X_train,
    y_train,
    epochs=150,
    verbose=1,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[callback],
)

In [None]:
def generate_text(txt, next_seq, model, max_seq_len):
    """Greedily extend `txt` by `next_seq` words predicted by `model`.

    On each step the current text is encoded with the notebook-level
    `tokenizer`, left-padded to `max_seq_len - 1` tokens, and the word with
    the highest predicted probability is appended.

    Parameters
    ----------
    txt : str
        Seed text.
    next_seq : int
        Number of words to generate.
    model : object
        Model whose `predict` returns one probability row per input sequence.
    max_seq_len : int
        Sequence length used at training time; the model input is one shorter.

    Returns
    -------
    str
        The seed text followed by the generated words, space separated. If the
        predicted index has no vocabulary entry (e.g. the padding index 0),
        an empty word is appended for that step.
    """
    # Build the reverse lookup once instead of scanning the vocabulary per word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    for _ in range(next_seq):
        # texts_to_sequences returns one list per input text; take the single
        # sequence so pad_sequences receives a list of sequences, not a
        # doubly nested list.
        tokens = tokenizer.texts_to_sequences([txt])[0]
        tokens = pad_sequences([tokens], maxlen=max_seq_len - 1, padding='pre')
        # predict_classes was removed from Keras; argmax over the softmax
        # output is its equivalent.
        probabilities = model.predict(tokens, verbose=0)
        pred = int(np.argmax(probabilities, axis=-1)[0])
        output_word = index_to_word.get(pred, "")
        txt += " " + output_word
    return txt

In [None]:
# Generate 10 words after the seed text "NewYork".
# NOTE(review): the seed is a single token; if "newyork" is not in the fitted
# vocabulary the model effectively starts from an all-padding input - confirm
# whether "New York" was intended.
pred=generate_text("NewYork", 10, model, max_seq_len)

In [None]:
# Show the generated text.
pred