## LIBRARIES

In [None]:
!pip install pysrt
!pip install keras_tuner

In [None]:
import random , pandas as pd , numpy as np , re , pysrt , tensorflow as tf , gensim , keras_tuner as kt
from pathlib import Path
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.models import Sequential ,load_model
from tensorflow.keras.layers import  LSTM, Dense , Activation , Embedding, MaxPool1D ,GlobalMaxPool1D , Conv1D , BatchNormalization , InputLayer
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer

## LOADING DATA

In [None]:

# Collect every SubRip (.srt) subtitle file in the Colab working directory and
# merge the spoken lines of all of them into one corpus string, `text`.
folder_path = Path("/content/")
# Sorted so the films are concatenated in the same order on every run;
# the order returned by glob() depends on the filesystem.
files = sorted(folder_path.glob('*'))

subtitle_texts = []
for file_ in files:
    # Compare the real file extension rather than the last three characters of
    # the path: "movie.SRT" is accepted, and a name that merely ends in "srt"
    # (without the dot) is no longer mistaken for a subtitle file.
    if file_.suffix.lower() != '.srt':
        continue
    print(file_)
    subs = pysrt.open(file_)
    # One subtitle entry per line inside a film.
    subtitle_texts.append('\n'.join(sub.text for sub in subs))

# Films are separated by a single space. Joining once at the end avoids
# rebuilding the ever-growing string for every file.
text = ' '.join(subtitle_texts)

print(text)

print(type(text))

## PREPROCESSING

In [None]:
# Run the loop twice because cleaning the subtitles of one film can leave issues in another film's subtitles, so a second pass helps.

# Clean the raw subtitle corpus held in `text`; the cleaned string ends up in
# `result`, which is what gets tokenized afterwards.
#
# Changes from the first version of this cell:
#   * each pass now starts from the previous pass's output, so the second pass
#     really re-cleans the text (before, the regex results of pass one were
#     thrown away and only the literal replacements ran twice);
#   * a dropped final "g" is restored ("goin'" -> "going") instead of every
#     word that is followed by an apostrophe being replaced by a bare "g"
#     ("don't" used to become "gt");
#   * brace tags of any length are removed, not just one-character ones;
#   * the no-op replacement of "'" by "'" is gone, and a correctly decoded
#     music note is stripped as well as its mis-decoded form.

# Compiled once, outside the loop.
bracketed_or_qmark = re.compile(r'\[.*?\]|\?')  # [sound cues] and question marks
brace_tag = re.compile(r'\{.*?\}')              # presumably styling tags such as {\an8}
dropped_g = re.compile(r"(\w+in)'(?!\w)")       # goin' / nothin' at the end of a word
angle_tag = re.compile(r'<.*?>')                # any remaining <tag>
number_run = re.compile(r'(\d+)[ ,]+(\d+)')     # digit groups separated by spaces/commas

result = text
for _ in range(2):
    # Literal replacements. Order matters: "..." has to go before ".", and the
    # URL is matched in its dot-less form because the dots are already removed.
    text = (
        result.lower()
        .replace('</i>', '')
        .replace('â™ª', '')
        .replace('♪', '')
        .replace('<i>', '')
        .replace('...', '')
        .replace(',', ' ')
        .replace('\n', ' ')
        .replace('  ', ' ')
        .replace(' - ', ' ')
        .replace("--", "")
        .replace('"', '')
        .replace(".", "")
        .replace('wwwfacebookcom/englishlolchannel/', '')
    )
    result = bracketed_or_qmark.sub('', text)
    result = brace_tag.sub('', result)
    result = dropped_g.sub(r'\1g', result)
    result = angle_tag.sub('', result)
    result = number_run.sub('', result)
print(result)


In [None]:
#saving the processed text to have a good look

# with open("C:/Users/Lenovo/Desktop/next work/lafs.txt", "w") as file:
#     file.write(result)

## tokenizing the texts

In [None]:
# tokenization
# Split the cleaned corpus into tokens: every maximal run of non-whitespace
# characters is one token.
tokenizer = RegexpTokenizer(r"\S+")
lowered_corpus = result.lower()
tokens = tokenizer.tokenize(lowered_corpus)

# Peek at the first few tokens.
print(tokens[:15])

# Corpus size in tokens, then the number of distinct tokens.
token_count = len(tokens)
distinct_count = len(np.unique(tokens))
print(token_count)
print(distinct_count)

In [None]:
# Vocabulary: the distinct tokens as returned by np.unique, plus a lookup from
# each token to its position in that array.
unique_tokens = np.unique(tokens)
unique_tokens_index = dict(zip(unique_tokens, range(len(unique_tokens))))
print(unique_tokens_index)

## creating the input and output words for the model similar to x and y

In [None]:
# preparing the data
# Build the training pairs with a sliding window over the token list:
# each sample is `nwords` consecutive tokens, its label is the token that
# immediately follows them.
nwords = 20

window_starts = range(len(tokens) - nwords)
input_words = [tokens[start:start + nwords] for start in window_starts]
next_words = [tokens[start + nwords] for start in window_starts]

# Spot-check one sample and its label.
print(input_words[69])
print(next_words[69])

# Both lists must have the same length.
print(len(input_words))
print(len(next_words))

In [None]:
# Show the very first training window (the first `nwords` tokens of the corpus).
print(input_words[0])

## coming up with X and Y in form of arrays
## One-hot encoding every word of X at its vocabulary index
## creating a one-hot (categorical) target vector for Y

In [None]:
# One-hot encode the samples.
#   x[sample, position, vocab_index] = 1 for every word of every window
#   y[sample, vocab_index]           = 1 for the word that follows the window
vocab_size = len(unique_tokens)

x = np.zeros((len(input_words), nwords, vocab_size), dtype='int32')
y = np.zeros((len(next_words), vocab_size), dtype='int32')

for row, window in enumerate(input_words):
    # Set the whole window in one indexed assignment: position k of the
    # window gets a 1 in the column of the k-th word.
    positions = np.arange(len(window))
    columns = np.array([unique_tokens_index[word] for word in window], dtype=np.intp)
    x[row, positions, columns] = 1
    y[row, unique_tokens_index[next_words[row]]] = 1

In [None]:
# Notebook display of the one-hot encoded input array.
x

In [None]:
# x was allocated as (samples, nwords, vocabulary size), y as (samples, vocabulary size).
print(x.shape)
print(y.shape)

## model

In [None]:
def tuner_model(hp):
    """Build and compile a stacked-LSTM next-word classifier for Keras Tuner.

    The network reads one-hot encoded windows of shape
    ``(nwords, len(unique_tokens))`` and outputs a softmax over the
    vocabulary. Searched hyperparameters:

    * ``num_layers`` (1-9): number of sequence-returning LSTM layers, each
      with its own ``units_N``, ``activation_N``, optional batch
      normalisation (``batch_norm_lstm_N``) and optional temporal max
      pooling (``max_pooling_lstm_N``).
    * ``units_-1`` / ``activation_-1`` / ``batch_norm_lstm_-1`` /
      ``max_pooling_lstm_-1``: the last LSTM layer, which reduces the
      sequence to a single vector.
    * ``num_dense_layers`` (1-3) with ``dense_N`` units and
      ``dense_activation_N``.
    * ``optimizer``.

    Args:
        hp: The ``keras_tuner.HyperParameters`` object supplied by the tuner.

    Returns:
        A compiled ``Sequential`` model (categorical cross-entropy loss,
        accuracy metric).
    """
    vocab_size = len(unique_tokens)

    model = Sequential()
    model.add(InputLayer(input_shape=(nwords, vocab_size)))

    # Intermediate recurrent stack. Every layer here must keep the time axis
    # because another LSTM follows it.
    for i in range(hp.Int('num_layers', min_value=1, max_value=9)):
        layer_id = str(i + 1)

        units = hp.Int('units_' + layer_id, min_value=16, max_value=256, step=16)
        activation = hp.Choice('activation_' + layer_id, values=['relu', 'tanh'])
        model.add(LSTM(units, activation=activation, return_sequences=True))

        if hp.Boolean('batch_norm_lstm_' + layer_id):
            model.add(BatchNormalization())

        if hp.Boolean('max_pooling_lstm_' + layer_id):
            # A global pooling layer here would drop the time axis and the
            # next LSTM could not consume its 2-D output. Windowed pooling
            # halves the sequence instead; padding='same' keeps at least one
            # time step however many layers pool.
            model.add(MaxPool1D(pool_size=2, padding='same'))

    # Final recurrent layer. Its hyperparameters all use the fixed "-1"
    # suffix; deriving the suffix from the loop index made them share a name
    # (and therefore a value) with one of the layers above.
    units = hp.Int('units_-1', min_value=16, max_value=256, step=16)
    activation = hp.Choice('activation_-1', values=['relu', 'tanh'])
    pool_over_time = hp.Boolean('max_pooling_lstm_-1')

    # Global pooling needs a sequence to pool over, so the LSTM only collapses
    # the sequence itself when pooling is switched off. Either way the output
    # of this stage is one vector per sample.
    model.add(LSTM(units, activation=activation, return_sequences=pool_over_time))

    if hp.Boolean('batch_norm_lstm_-1'):
        model.add(BatchNormalization())

    if pool_over_time:
        model.add(GlobalMaxPool1D())

    # Dense head. It has its own hyperparameter names so the depth and the
    # activations are searched independently of the LSTM stack, and the units
    # start at 32 because a Dense layer cannot have 0 units.
    for i in range(hp.Int('num_dense_layers', min_value=1, max_value=3)):
        layer_id = str(i + 1)
        model.add(Dense(
            units=hp.Int('dense_' + layer_id, min_value=32, max_value=256, step=32),
            activation=hp.Choice('dense_activation_' + layer_id, values=['relu', 'tanh']),
        ))

    # One probability per vocabulary entry.
    model.add(Dense(vocab_size, activation='softmax'))

    optimizer = hp.Choice('optimizer', values=['adam', 'sgd', 'rmsprop', 'adagrad', 'adadelta', 'nadam'])

    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model


## now tuning the model using keras tuner

In [None]:

# Random search over the space defined by tuner_model, ranking trials by
# validation accuracy; trial state is written under keras_tuner/tuner_model.
tuner = kt.RandomSearch(
    tuner_model,
    objective='val_accuracy',
    max_trials=5,
    directory='keras_tuner',
    project_name='tuner_model',
)

In [None]:

# Callbacks shared by the tuner search and the final training run.
# Stop training once val_loss has not improved for 5 epochs.
# NOTE(review): with patience=5 this looks unable to trigger inside the
# 5-epoch tuner trials — confirm that is intended.
earlystop = EarlyStopping(monitor = 'val_loss',patience=5, verbose=1)
# Reduce the learning rate by a factor of 0.1 after 3 epochs without improvement
# (the monitored quantity is left at the Keras default).
reduce_lr = ReduceLROnPlateau(factor=0.1, patience = 3)

In [None]:
# Let's say the model tuning hits an error at the 2nd epoch; then we need to remove the tuning results saved so far before starting again

# !rm -r /content/keras_tuner/tuner_model

In [None]:
# Run the hyperparameter search. One call covers the whole search: the tuner
# was created with max_trials=5, so it trains up to five hyperparameter
# combinations, each for at most `epochs` epochs. That is why more than five
# epochs show up in the log in total: the limits are per trial.
# The earlier version wrapped this call in a `for` loop of five iterations;
# once the trial budget is used up, further search() calls return without
# training anything, so the loop added nothing.
tuner.search(x, y, epochs=5, validation_split=0.2, callbacks=[earlystop, reduce_lr], batch_size=64)


In [None]:
# Take the best hyperparameter set found by the search and build a model from
# it; this model is the one trained in the fitting cell.
best_hps = tuner.get_best_hyperparameters(num_trials=1)
model1 = tuner.hypermodel.build(best_hps[0])
model1.summary()

In [None]:
# saving the best tuned model in case there is any issue like internet disconnection then we can resume from here.

# best_model = tuner.get_best_models(num_models=1)[0]
# best_model.save('best_model_search.h5')

In [None]:
# downloading the saved best tuned model

# from google.colab import files

# files.download('best_model_search.h5')


## model fitting

In [None]:
# Train the selected architecture on the full data set, holding out 20% for
# validation; early stopping and learning-rate reduction are applied via the
# shared callbacks.
model1.fit(
    x,
    y,
    epochs=100,
    validation_split=0.2,
    callbacks=[earlystop, reduce_lr],
    batch_size=128,
    shuffle=True,
)

In [None]:
# model1.save('my_model.h5')

# from google.colab import files
# files.download('my_model.h5')

In [None]:
# from tensorflow.keras.models import load_model

# model = load_model('/content/best_model.h5')


## making predictions

In [None]:

def pred(input_, nbest, net=None):
    """Return the vocabulary indices of the `nbest` most likely next words.

    Args:
        input_: Context text. It is lower-cased and split on whitespace; only
            the last `nwords` words are used, and words that are not in the
            vocabulary are left as all-zero vectors instead of raising.
        nbest: How many candidate indices to return (capped at the vocabulary
            size).
        net: Model to query. Defaults to `model1`, the model trained in this
            notebook; pass a loaded model to use that one instead.

    Returns:
        A 1-D integer array of indices into `unique_tokens`, ordered from the
        most to the least probable candidate.
    """
    if net is None:
        # The notebook trains `model1`; the bare name `model` is only defined
        # when the commented-out load_model cell is run.
        net = model1

    # The encoded window has room for `nwords` words only: keep the most recent ones.
    words = input_.lower().split()[-nwords:]

    x = np.zeros((1, nwords, len(unique_tokens)))
    for i, word in enumerate(words):
        idx = unique_tokens_index.get(word)
        if idx is not None:
            x[0, i, idx] = 1

    # verbose=0: this is called once per generated word, so keep the log quiet.
    predict_ = net.predict(x, verbose=0)[0]

    nbest = min(nbest, len(predict_))
    # argpartition finds the top candidates without a full sort; the small
    # argsort afterwards orders just those candidates by probability.
    top = np.argpartition(predict_, -nbest)[-nbest:]
    return top[np.argsort(predict_[top])[::-1]]


In [None]:
# Ask for the 5 strongest candidates for the word following this phrase and
# translate the returned vocabulary indices back into words.
pp = pred('probably chose to be four minutes late', 5)
candidate_words = [unique_tokens[i] for i in pp]
print(candidate_words)

## this gives the 5 most likely candidates for the next word


In [None]:
## making predictions for next 100 words


def generate_text(input_text , text_length , creativity = 3):
    """Extend `input_text` by `text_length` generated words.

    Each new word is drawn uniformly at random from the `creativity` most
    likely candidates returned by `pred` for the current context, so a larger
    `creativity` gives more varied text.

    Args:
        input_text: Seed text; split on whitespace.
        text_length: Number of words to append.
        creativity: Number of top candidates to sample from at every step.

    Returns:
        The seed words followed by the generated words, joined by spaces.
    """
    word_seq = input_text.split()
    for _ in range(text_length):
        # Context = the most recent `nwords` words. The previous version
        # advanced a start offset from the very first step, which dropped the
        # oldest seed word each time and kept the context at the seed's
        # length instead of letting it grow to the full window.
        context = " ".join(word_seq[-nwords:]).lower()
        candidates = pred(context, creativity)
        word_seq.append(unique_tokens[random.choice(candidates)])
    return " ".join(word_seq)


In [None]:

# Generate 100 more words after the seed phrase, sampling each one from the
# 5 best candidates; the notebook displays the returned string.
generate_text('probably chose to be four minutes late',100,5)
