# 1.Build a NLP Language model for text generation involves train a neural network to predict the next word in a sequence of words.

In [3]:
#import libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense

In [4]:
#Sample Data
text_data=["Peter Piper picked a peck of pickled peppers.","Betty Botter bought some butter but she said the butter’s bitter."]
text_data

['Peter Piper picked a peck of pickled peppers.',
 'Betty Botter bought some butter but she said the butter’s bitter.']

In [5]:
#Tokenization
tokenizer=tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words=len(tokenizer.word_index)+1
print(total_words)


20


In [6]:
#Create input sequences and targets
input_sequences=[]

for line in text_data:
    token_list=tokenizer.texts_to_sequences([line])[0]
for i in range(1,len(token_list)):
    n_gram_sequence=token_list[:i+1]
    input_sequences.append(n_gram_sequence)
    
max_sequence_length=max([len(seq) for seq in input_sequences])
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')
x,y=input_sequences[:,:-1],input_sequences[:,-1]
y=tf.keras.utils.to_categorical(y,num_classes=total_words)

In [8]:
from sys import meta_path
#Build the model
model=Sequential()
model.add(Embedding(total_words,100,input_length=max_sequence_length-1)) # input embedd
model.add(LSTM(100)) #hidden layer
model.add(Dense(total_words,activation='softmax')) #output layer
#Compile the model
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
#Train
model.fit(x,y,epochs=100,verbose=2)


Epoch 1/100
1/1 - 2s - loss: 2.9945 - accuracy: 0.1000 - 2s/epoch - 2s/step
Epoch 2/100
1/1 - 0s - loss: 2.9829 - accuracy: 0.3000 - 12ms/epoch - 12ms/step
Epoch 3/100
1/1 - 0s - loss: 2.9711 - accuracy: 0.3000 - 7ms/epoch - 7ms/step
Epoch 4/100
1/1 - 0s - loss: 2.9589 - accuracy: 0.3000 - 8ms/epoch - 8ms/step
Epoch 5/100
1/1 - 0s - loss: 2.9459 - accuracy: 0.3000 - 12ms/epoch - 12ms/step
Epoch 6/100
1/1 - 0s - loss: 2.9316 - accuracy: 0.3000 - 10ms/epoch - 10ms/step
Epoch 7/100
1/1 - 0s - loss: 2.9158 - accuracy: 0.3000 - 8ms/epoch - 8ms/step
Epoch 8/100
1/1 - 0s - loss: 2.8979 - accuracy: 0.3000 - 9ms/epoch - 9ms/step
Epoch 9/100
1/1 - 0s - loss: 2.8773 - accuracy: 0.3000 - 9ms/epoch - 9ms/step
Epoch 10/100
1/1 - 0s - loss: 2.8534 - accuracy: 0.3000 - 9ms/epoch - 9ms/step
Epoch 11/100
1/1 - 0s - loss: 2.8253 - accuracy: 0.3000 - 10ms/epoch - 10ms/step
Epoch 12/100
1/1 - 0s - loss: 2.7920 - accuracy: 0.3000 - 8ms/epoch - 8ms/step
Epoch 13/100
1/1 - 0s - loss: 2.7521 - accuracy: 0.3000

<keras.src.callbacks.History at 0x2091e445290>

In [11]:
#Generate text completion
seed_text="Peter Piper"
next_words=20
for _ in range(next_words):
    token_list=tokenizer.texts_to_sequences([seed_text])[0]
    token_list=tf.keras.preprocessing.sequence.pad_sequences([token_list],maxlen=max_sequence_length-1, padding='pre')
    predicted=np.argmax(model.predict(token_list,verbose=2))
output_word=""
for word,index in tokenizer.word_index.items():
    if index==predicted:
        output_word=word
    break
seed_text +=" "+output_word
print(seed_text)

1/1 - 0s - 23ms/epoch - 23ms/step
1/1 - 0s - 18ms/epoch - 18ms/step
1/1 - 0s - 19ms/epoch - 19ms/step
1/1 - 0s - 31ms/epoch - 31ms/step
1/1 - 0s - 25ms/epoch - 25ms/step
1/1 - 0s - 26ms/epoch - 26ms/step
1/1 - 0s - 20ms/epoch - 20ms/step
1/1 - 0s - 21ms/epoch - 21ms/step
1/1 - 0s - 19ms/epoch - 19ms/step
1/1 - 0s - 21ms/epoch - 21ms/step
1/1 - 0s - 19ms/epoch - 19ms/step
1/1 - 0s - 18ms/epoch - 18ms/step
1/1 - 0s - 18ms/epoch - 18ms/step
1/1 - 0s - 21ms/epoch - 21ms/step
1/1 - 0s - 20ms/epoch - 20ms/step
1/1 - 0s - 20ms/epoch - 20ms/step
1/1 - 0s - 32ms/epoch - 32ms/step
1/1 - 0s - 23ms/epoch - 23ms/step
1/1 - 0s - 18ms/epoch - 18ms/step
1/1 - 0s - 18ms/epoch - 18ms/step
Peter Piper 


In [12]:
seed_text = "Peter Piper picked "
next_words = 10

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = tf.keras.preprocessing.sequence.pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
    predicted = np.argmax(model.predict(token_list, verbose=2))
    
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    
    seed_text += " " + output_word

print(seed_text)

1/1 - 0s - 18ms/epoch - 18ms/step
1/1 - 0s - 20ms/epoch - 20ms/step
1/1 - 0s - 22ms/epoch - 22ms/step
1/1 - 0s - 24ms/epoch - 24ms/step
1/1 - 0s - 26ms/epoch - 26ms/step
1/1 - 0s - 19ms/epoch - 19ms/step
1/1 - 0s - 20ms/epoch - 20ms/step
1/1 - 0s - 22ms/epoch - 22ms/step
1/1 - 0s - 19ms/epoch - 19ms/step
1/1 - 0s - 23ms/epoch - 23ms/step
Peter Piper picked  some butter but she said the butter’s bitter bitter bitter


In [14]:
seed_text = "Betty Botter bought some butter"
next_words = 1

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = tf.keras.preprocessing.sequence.pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
    predicted = np.argmax(model.predict(token_list, verbose=0))  
    
    # Instead of appending the predicted word, create a new sequence with the predicted word
    new_sequence = seed_text + " " + tokenizer.index_word[predicted]
    
    seed_text = new_sequence  # Update seed text for the next iteration

print(seed_text)

Betty Botter bought some butter but


# 2.Build a Speech to Text model.

In [15]:
import speech_recognition as sr

In [16]:
recog=sr.Recognizer()


In [17]:
?recog.record

In [18]:
dir(recog)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'adjust_for_ambient_noise',
 'dynamic_energy_adjustment_damping',
 'dynamic_energy_ratio',
 'dynamic_energy_threshold',
 'energy_threshold',
 'lasttfgraph',
 'listen',
 'listen_in_background',
 'non_speaking_duration',
 'operation_timeout',
 'pause_threshold',
 'phrase_threshold',
 'recognize_amazon',
 'recognize_api',
 'recognize_assemblyai',
 'recognize_azure',
 'recognize_bing',
 'recognize_google',
 'recognize_google_cloud',
 'recognize_houndify',
 'recognize_ibm',
 'recognize_lex',
 'recognize_sphinx',
 'recognize_tensorflow',
 'recognize_vosk',
 'recognize_whisper',
 'reco

In [19]:
samp=sr.AudioFile("nlp.wav")
samp

<speech_recognition.AudioFile at 0x209246755d0>

In [20]:

with samp as source:
    audio=recog.record(samp)


In [21]:
res=recog.recognize_google(audio)
print('Text for the Audio:\n')
print(res)


Text for the Audio:

hello welcome to NLP with py


In [22]:
def speech_to_text(file):
    samp=sr.AudioFile(file)
    with samp as source:
        audio=recog.record(samp)
    return recog.recognize_google(audio)


In [24]:
  op_text=speech_to_text("nlp2.wav")
print(op_text)  

hope you all find the sessions useful


In [25]:
from nltk.tokenize import word_tokenize
tokens=word_tokenize(op_text.lower())
tokens

['hope', 'you', 'all', 'find', 'the', 'sessions', 'useful']

In [28]:
feature_list=['hope','find','sessions']
feature_list

['hope', 'find', 'sessions']

In [29]:
review_features=[term for term in tokens if term in feature_list]
review_features


['hope', 'find', 'sessions']

In [30]:
mic=sr.Microphone()
mic.list_microphone_names()


['Microsoft Sound Mapper - Input',
 'Microphone Array (AMD Audio Dev',
 'Microsoft Sound Mapper - Output',
 'Speaker (Realtek(R) Audio)',
 'Primary Sound Capture Driver',
 'Microphone Array (AMD Audio Device)',
 'Primary Sound Driver',
 'Speaker (Realtek(R) Audio)',
 'Speaker (Realtek(R) Audio)',
 'Microphone Array (AMD Audio Device)',
 'Microphone Array 1 (AMDAfdInstall Wave Microphone - 0)',
 'Microphone Array 2 (AMDAfdInstall Wave Microphone - 0)',
 'Headphones 1 (Realtek HD Audio 2nd output with HAP)',
 'Headphones 2 (Realtek HD Audio 2nd output with HAP)',
 'PC Speaker (Realtek HD Audio 2nd output with HAP)',
 'Speakers 1 (Realtek HD Audio output with HAP)',
 'Speakers 2 (Realtek HD Audio output with HAP)',
 'PC Speaker (Realtek HD Audio output with HAP)',
 'Microphone (Realtek HD Audio Mic input)',
 'Stereo Mix (Realtek HD Audio Stereo input)']

In [33]:
with mic as source:
    audio=recog.listen(source)

In [34]:
recog.recognize_google(audio)

'welcome to day 6 NLP session'

In [35]:
def get_review_features(review_text):
    feature_list=['welcome','day','NLP']
    tokens=word_tokenize(review_text.lower())
    review_features=[term for term in tokens if term in feature_list]
    review_features=list(set(review_features))
    return review_features

In [36]:
res=recog.recognize_google(audio)
get_review_features(res)

['day', 'welcome']

# 3.Build a Text to Speech model.

In [41]:
from gtts import gTTS
import os
def text_to_speech(text,language='en',filename='ww.mp3'):
    tts=gTTS(text=text,lang=language,slow=False)
    tts.save(filename)
    os.system(f"start {filename}")

if __name__=="__main__":
    input_text=input('User text pl >>:')
    text_to_speech(input_text)

User text pl >>:Hello!Welcome to NLP with python


# 4.Build a NLP Language model to detect the sentence/word error in the text corpus.

In [44]:
import re
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [55]:
#sample dataset of correctly spelled and misspelled words
correct_words=['python','is','a','powerful','language','model','check']
misspelled_words=['pythoon','i','A','power','langage','moel','chek']
print(correct_words)
print()
print(misspelled_words)

['python', 'is', 'a', 'powerful', 'language', 'model', 'check']

['pythoon', 'i', 'A', 'power', 'langage', 'moel', 'chek']


In [56]:
#combile correct & misspelled into a single dataset
all_words=correct_words+misspelled_words
print(all_words)


['python', 'is', 'a', 'powerful', 'language', 'model', 'check', 'pythoon', 'i', 'A', 'power', 'langage', 'moel', 'chek']


In [57]:
labels=[1] *len(correct_words)+[0] *len(misspelled_words)
labels

[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]

In [58]:
#Preprocess the data
def preprocess_text(text):
    text=re.sub(r'\b\w\b','',text) #remove single characters
    return text.lower()

all_words=[preprocess_text(word) for word in all_words]
print(all_words)

['python', 'is', '', 'powerful', 'language', 'model', 'check', 'pythoon', '', '', 'power', 'langage', 'moel', 'chek']


In [60]:
#Split the dataset into train & test
xtrain,xtest,ytrain,ytest=train_test_split(all_words,labels,test_size=0.2,random_state=4)
xtrain,ytrain

(['',
  'check',
  'chek',
  '',
  'python',
  'moel',
  '',
  'is',
  'model',
  'pythoon',
  'power'],
 [0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0])

In [61]:
xtest,ytest

(['language', 'powerful', 'langage'], [1, 1, 0])

In [62]:
#Vectorize the words with BOW repr
cv=CountVectorizer()
xtrain_cv=cv.fit_transform(xtrain)
xtest_cv=cv.transform(xtest)
print(xtrain_cv)


  (1, 0)	1
  (2, 1)	1
  (4, 6)	1
  (5, 4)	1
  (7, 2)	1
  (8, 3)	1
  (9, 7)	1
  (10, 5)	1


In [63]:
print(xtest_cv)




In [64]:
#Classifier model with Naive Bayes Algorithm
clf=MultinomialNB()
clf.fit(xtrain_cv,ytrain)
#test
ypred=clf.predict(xtest_cv)
#Evaluate
accuracy=accuracy_score(ytest,ypred)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 33.33%


In [65]:
#Test the spell check
def spell_check(test_word):
    test_word_vector=cv.transform([preprocess_text(test_word)])
    prediction=clf.predict(test_word_vector)
    
    if prediction[0]==1:

        print(f"{test_word} is spelled correctly.")
    else:
        print(f"{test_word} is likely misspelled ")

In [66]:
spell_check('python')

python is spelled correctly.


In [67]:
spell_check('chek')


chek is likely misspelled 
