In [None]:
import nltk 
import spacy

nlp = spacy.load('en_core_web_sm')

doc1 = nlp(u'Apple to build a Hong Kong factory for $6 million')

for token in doc1:
    print(token.text, end= ' | ')
    
for entity in doc1.ents:
    print(entity)
    print(entity.label_)
    print(str(spacy.explain(entity.label_)))
    print('\n')
    
text = """This is Mo's text, isn't it?"""

tokenizer = nltk.tokenize.WhitespaceTokenizer()
tokenizer.tokenize(text)

# ['This', 'is', 'Mo's', 'text,', 'isn't', 'it?']

tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokenizer.tokenize(text)

# ['This', 'is', 'Mo', 's', 'text', ',', 'is', 'n't', 'it', '?']

tokenizer = nltk.tokenize.WordPunctTokenizer()
tokenizer.tokenize(text)

# ['This', 'is', 'Mo', "'", 's', 'text', ',', 'isn', "'", 't', 'it', '?']


    

In [None]:
# nltk.stemPorterStemmer
    #feet -> feet
    #cats -> cat
    #wolves -> wolv
    #--Fails on irregular forms, produces non-words

# WordNet lemmatizer: uses WordNet Database to lookup lemmas(base words)
# nltk.stem.WordNetLemmatizer
    #feet -> feet
    #cats -> cat
    #wolves -> wolf
    #talked -> talked
    #--Not all forms are reduced
    
import nltk
text = 'Feet cats wolves talked'
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokens = tokenizer.tokenize(text)

stemmer = nltk.stem.PorterStemmer()
' '.join(stemmer.stem(token) for token in tokens)




I) Feature Extraction from Text

-High frequency n-grams:
--Articles, prepositions, etc. (and, a, the)
--They are called stop words because they have no significance in the context of the sentence

-Low frequency n-grams:
--Typos, rare n-grams.
--We don't need them; keeping them makes the model likely to overfit.

-Medium frequency n-grams:
--These are the good n-grams: the informative ones we want to keep.

Let's remove some n-grams based on their document frequency in our corpus (the number of documents that contain a particular n-gram divided by the total number of documents). Which ones can be removed?
-High- and low-frequency n-grams -> words that carry little meaning (stop words), and typos/rarely used words

In [None]:
# Word Embedding
"""
import spacy
from scipy.spatial.distance import cosine
from processing import most_common_words, vector_list

# print word and vector representation at index 347
print(most_common_words[500])

# define find_closest_words
def find_closest_words(word_list, vector_list, word_to_check):
    return sorted(word_list,
                  key=lambda x: cosine(vector_list[word_list.index(word_to_check)], vector_list[word_list.index(x)]))[:10]

# find closest words to food
close_to_food = find_closest_words(most_common_words, vector_list, 'food')
print(close_to_food)
    
"""
    
    
# Read the file twice to show that seek(0) rewinds the cursor. The context
# manager closes the handle even if one of the reads raises.
with open(r'Coordinates.txt') as myfile:
    first_read = myfile.read()
    myfile.seek(0)               # rewind; without this the second read returns ''
    second_read = myfile.read()
second_read

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

mystring = """Amazon just bought Ubisoft for $6 billion"""
doc1 = nlp(mystring)

for ent in doc1.ents:
    expLabel = str(spacy.explain(ent.label_))
    # print(ent.text + " - " + ent.label_ + " - " + str(spacy.explain(ent.label_)))
    print(f"{ent.text:20} {ent.label_:20} {expLabel:20}")
    # print(f'{ent.text:10} {ent.label_:8} {expLabel:7}')
    
for ent in doc1.ents:
    explainEnt = str(spacy.explain(ent.label_))
    print(f"{ent.text:10} {ent.label_:10} {explainEnt:10}")
    

In [None]:
import nltk

from nltk.stem.porter import PorterStemmer

p_stemmer = PorterStemmer()
sents = "Running watering fainting faint fate mate mating generation generate"
tokenize = nltk.tokenize.TreebankWordTokenizer()
tokens = tokenize.tokenize(sents)
for sent in tokens:
    print(f'{sent:20} {p_stemmer.stem(sent):20}')



words = ['Running', "Watering", "Fainting", "faint", "fate", "mate", "mating", "generation", "generate"]
print('\n')
for word in words:
    print(f"{word:20} {p_stemmer.stem(word):20}")

In [None]:
import spacy
from spacy.matcher import Matcher


nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

# Years Old
# Years-old

# Each pattern is a list with one attribute dict per token to match.
pattern1 = [{'LOWER': 'years'}, {'LOWER': 'old'}]                        # "years old"
pattern2 = [{'LOWER': 'years'}, {'IS_PUNCT': True}, {'LOWER': 'old'}]    # "years-old"

# spaCy v3 signature is Matcher.add(key, patterns): all patterns go in one list
# and there is no positional callback slot (the v2-style `None` argument fails).
matcher.add('Years Old', [pattern1, pattern2])
doc = nlp(u'Today I just turned 4 Years Old. Next year I will be five-years-old')
found_matches = matcher(doc)
print(found_matches)



In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

doc1 = nlp(u'Wow that was crazy. Apple I think I did pretty well on my physics exam! That cost me $6 Billion')
    
# Count the frequencies of different coarse-grained POS tags:
POS_counts = doc1.count_by(spacy.attrs.POS)

for ent in doc1.ents:
    explainEnt = str(spacy.explain(ent.label_))
    print(f"{ent.text:20} {ent.label_:10} {explainEnt:10}")

print(POS_counts)



for k, v in sorted(POS_counts.items()):
    print(f"{k} {doc1.vocab[k].text:10} {v}")


WITHOUT A PRETRAINED MODEL

In [None]:
from gensim.models import Word2Vec, KeyedVectors
import pandas as pd
import nltk


# WITHOUT A PRETRAINED MODEL

df = pd.read_csv('redditWorldNews.csv')
df.head(10)

newsTitles = df['title'].values
newsTitles

# nltk.download('punkt')
newsVec = [nltk.word_tokenize(title) for title in newsTitles]
newsVec

"""model = Word2Vec(newsVec, min_count=1, size=32)
model.most_similar('men')

vec = model['king'] - model['man'] + model['woman']
model.most_similar([vec])   """

model = Word2Vec(newsVec, min_count=1, vector_size=200)

word = 'polite'
print(model.wv.most_similar(positive=word, topn=6))

WITH A PRETRAINED MODEL

In [None]:
from gensim.models import Word2Vec, KeyedVectors
import pandas as pd
import nltk

model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary = True, limit=10000)

vec = model['king'] - model['man'] + model['woman']
vec
# model.most_similar([vec])




In [None]:
import praw
import pandas as pd
import nltk
from gensim.models import Word2Vec

import os

posts = []
# Credentials are read from the environment so they are never stored in the
# notebook. Set REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET before running; a
# missing variable raises KeyError here instead of failing later inside praw.
# NOTE(review): the id/secret that used to be hard-coded on this line should be
# revoked, since they have been committed in plain text.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent='Webscrape',
)

# Collect up to 100 "hot" posts from r/leagueoflegends, one list per post.
hot_posts = reddit.subreddit('leagueoflegends').hot(limit=100)
for post in hot_posts:
    posts.append([post.title, post.score, post.subreddit, post.id, post.url, post.num_comments, post.selftext])

posts = pd.DataFrame(posts, columns=['title', 'score', 'subreddit',  'id', 'url', 'num_comments', 'body'])

worldTitles = posts['title'].values
worldVec = [nltk.word_tokenize(title) for title in worldTitles]

# similar_words = Word2Vec(worldVec, min_count=1, vector_size=200)
sim_words = []
# for i in worldVec[0]:
    # word = i
    # sim_words.append(similar_words.wv.most_similar(positive=word, topn=1))

print(posts)
# print(sim_words)

# print(posts)

In [None]:
import spacy
from spacy.tokens import Span
from spacy import displacy


nlp = spacy.load('en_core_web_sm')

def show_ents(doc):
    """Print one aligned row per named entity in ``doc``, or 'Not Found' if it has none.

    Each row shows the entity text, its label and ``spacy.explain`` of that
    label, every field padded to a minimum width of 10 characters.
    """
    if not doc.ents:
        print('Not Found')
        return

    for entity in doc.ents:
        label_meaning = str(spacy.explain(entity.label_))
        print(f'{entity.text:10} {entity.label_:10} {label_meaning:10}')
        
doc = nlp(u'Tesla was worth $400 in the 1900s. Crazy that Lincoln, the previous president, banned it.')
show_ents(doc)


# Get the hash for ORG
ORG = doc.vocab.strings[u'ORG']

# Create a Span for the new entity
new_ent = Span(doc, 0, 1, label=ORG)

# Add the entity to the existing DOC object 
doc.ents = list(doc.ents) + [new_ent]


print("\n\n")
# show_ents(doc)


# The trailing space after "cleaner." matters: adjacent string literals are
# concatenated verbatim, so without it the text becomes "...cleaner.This new...".
doc = nlp(u'Our company created a brand new vacuum cleaner. ' u"This new vacuum-cleaner is the best in show.")
show_ents(doc)

from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
# PhraseMatcher patterns are Doc objects, so each phrase is run through nlp first.
phrase_patterns = [nlp(text) for text in phrase_list]
# spaCy v3 signature is PhraseMatcher.add(key, docs): the patterns are passed as
# one list, with no positional callback argument.
matcher.add('newproduct', phrase_patterns)

found_matches = matcher(doc)
found_matches


# from spacy.tokens import Span
PROD = doc.vocab.strings[u'PRODUCT']
new_ents = [Span(doc, match[1], match[2], label=PROD) for match in found_matches]
# print(new_ents)
print(doc.ents)
doc.ents = list(doc.ents) + new_ents
print(doc.ents)

displacy.serve(doc, style='ent')


In [None]:
import spacy
from spacy.language import Language
nlp = spacy.load('en_core_web_sm')

doc = nlp(u'This is my first sentence. This is my second sentence. This is my third sentence.')
# doc_sents = [sent for sent in doc.sents]

doc1 = nlp(u"'Some say that life gives lemons; I say it does not.' - Mohamed Ilaiwi")

for sent in doc1.sents:
    print(sent)




@Language.component('component')
def set_custom_component(doc):
    """Pipeline component that marks the token after every ';' as a sentence start.

    The last token is never tested because no token follows it.
    Returns the same ``doc`` object it was given.
    """
    for position in range(len(doc) - 1):
        if doc[position].text == ";":
            doc[position + 1].is_sent_start = True
    return doc

nlp.add_pipe('component', before='parser')
nlp.pipe_names
    

doc1 = nlp(u"'Some say that life gives lemons; I say it does not.' - Mohamed Ilaiwi")

for sent in doc1.sents:
    print(sent)










In [None]:
nums = [0,0,1,1,1,2,2,3,3,4]
num1 = []
# Scratch experiment: move every element that equals one of its neighbours to
# the end of the list. pop + append keeps the length constant, so the range
# computed up front stays valid while the list is being rearranged.
for i in range(len(nums)):
    try:
        # At i == 0, nums[i-1] wraps around to the last element.
        if nums[i] == nums[i+1] or nums[i] == nums[i-1]:
            nums.append(nums.pop(i))
    except IndexError:
        # Only reached on the final index, where nums[i+1] is out of range.
        # Catching IndexError specifically lets any other error surface.
        print(nums)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import os

# print(os.path.isfile(r'TextFiles\smsspamcollection.tsv'))

df = pd.read_csv(r'TextFiles\smsspamcollection.tsv', sep='\t')
df.head()

# df.isnull().sum()
# df['label'].value_counts()

In [None]:
def binary_search(arr, low, high, x):
    """Locate ``x`` in the sorted sequence ``arr`` between indices ``low`` and ``high``.

    Parameters
    ----------
    arr : sequence sorted in ascending order
    low, high : inclusive index bounds of the slice to search
    x : value to look for

    Returns
    -------
    int
        Index of an element equal to ``x``, or -1 when it is not in the range.
    """
    # Halve the inclusive window [low, high] until it is empty.
    while low <= high:
        middle = (high + low) // 2
        candidate = arr[middle]

        if candidate == x:
            return middle

        if candidate > x:
            # Target can only be in the left half.
            high = middle - 1
        else:
            # Target can only be in the right half.
            low = middle + 1

    # Window is empty: the element is not present.
    return -1

nums = [-1, 0, 3, 5, 9 ,12]
binary_search(nums, 0, len(nums)-1, 2)

In [None]:
intervals = [[1,3],[2,6],[8,10],[15,18]]

# lst = list(intervals[0]) + list(intervals[1])
"""for i in range(len(intervals)-1):
    if (intervals[i][1] - intervals[i+1][0]) >= 0:
        lst.append(intervals[i][1])"""
        

In [None]:
# Perform imports and load the dataset:
import numpy as np
import pandas as pd

df = pd.read_csv(r'TextFiles\smsspamcollection.tsv', sep='\t')
df.head()

from sklearn.model_selection import train_test_split

X = df['message']  # this time we want to look at the text
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# COUNT VECTORIZER
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()


X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

# print(X_train_counts.shape)

# TFIDF TRANSFORMER
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

# print(X_train_tfidf.shape)

# TFIDF VECTORIZER
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)


from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)

# pred = clf.predict(X_test)
# pred.predict(['Baby come home'])

# Instead of having to fit_transform and count vectorizer on your test data to call the predict, you can just call a pipeline step
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])
text_clf.fit(X_train, y_train)
predictions = text_clf.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report
# print(confusion_matrix(y_test, predictions))

# print(classification_report(y_test, predictions))

text_clf.predict(["CALL ME BABY"])



In [None]:
import pandas as pd
import numpy as np

text1 = "Tracy loves writing about data science"
text2 = "Tracy loves posing videos about this"
text3 = "I can't wait to go to the moon!"
text4 = "You might have missed the moon, but we're still headed for mars."

corpus = [text1, text2, text3, text4]


from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
word_count_vec = vect.fit_transform(corpus)
print(word_count_vec.shape)
print(vect.get_feature_names_out())
print("Vocabulary: ", vect.vocabulary_)
print(word_count_vec.toarray())


# vector = vect.transform(corpus)
# print(vector)

# Use the conent column instead of a single text variable
# matrix = vect.fit_transform(corpus) Which is basically just word_count_vec
counts = pd.DataFrame(word_count_vec.toarray(), index=['doc1', 'doc2', 'doc3', 'doc4'], columns=vect.get_feature_names_out())
 
 # Using TFIDF Transformer to focus on more relevant data
from sklearn.feature_extraction.text import TfidfTransformer
 
tf_transformer = TfidfTransformer().fit(word_count_vec)
word_count_vec_tf = tf_transformer.transform(word_count_vec)

df1 = pd.DataFrame(word_count_vec_tf.toarray(), index=['doc1', 'doc2', 'doc3', 'doc4'], columns=vect.get_feature_names_out())
df1







In [None]:
import numpy as np
import pandas as pd


from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

doc1 = "'That is to say,' replied Martin, 'that there is some pleasure having no pleasure'"
doc2 = "'It is always well to hope,' said Martin"
doc3 = "'Whereof one cannot speak, thereof one must be silent' - Ludwig Wittgenstein"
doc4 = "'The unexamined life is not worth living' - Socrates."

corpus = [doc1, doc2, doc3, doc4]

vect = CountVectorizer()
word_count_vect = vect.fit_transform(corpus)
# print(vect.get_feature_names_out())
# print('Vocabulary: ', vect.vocabulary_)



df = pd.DataFrame(word_count_vect.toarray(), index=['doc1', 'doc2', 'doc3', 'doc4'], columns=vect.get_feature_names_out())
df

tfidf_transformers = TfidfTransformer()
tfidf = tfidf_transformers.fit_transform(word_count_vect)


df1 = pd.DataFrame(tfidf.toarray(), index=['doc1', 'doc2', 'doc3', 'doc4'], columns=vect.get_feature_names_out())
df1






In [None]:
import pandas as pd
import numpy as np
import math

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 
from sklearn.model_selection import train_test_split


def clean_dataset(df):
    """Drop rows holding NaN or +/-infinity and return the rest as byte strings.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is not modified; a new frame is returned.

    Returns
    -------
    pd.DataFrame
        Only the rows with no missing or infinite cell, every column cast to
        NumPy byte strings.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # Work on a copy rather than dropna(inplace=True), so the caller's frame
    # is left untouched.
    without_missing = df.dropna()
    # `axis` is passed by keyword: pandas 2.x no longer accepts it positionally.
    has_bad_value = without_missing.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    # np.bytes_ is the same type the np.string_ alias pointed to; the alias
    # itself was removed in NumPy 2.0.
    return without_missing[~has_bad_value].astype(np.bytes_)


df = pd.read_csv(r'TextFiles\complaints.csv')
# df.head()
# n1 = math.nan
df = clean_dataset(df)

# What is our training data -> X = product, y = complaint
X = df['Product']
y = df['Consumer complaint narrative'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.33)

vect = CountVectorizer()
word_count_vec = vect.fit_transform(X_train)
print(word_count_vec.toarray()[:5])

tfidf = TfidfTransformer().fit(word_count_vec)
tfidf_now_vec = tfidf.transform(word_count_vec)

from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

linear_svc = LinearSVC()
clf = linear_svc.fit(tfidf_now_vec, y_train)



to_predict = ["I have outdated information on my credit report that I have previously disputed thas has yet to be removed"]


# y_pred = clf.predict(X_test)
df.head()













Project: Is a movie review positive or negative?


In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv(r'TextFiles\moviereviews.tsv', sep='\t')
df.head()
len(df)

# df['review'][2] --> Positive
df.isnull().sum()
df.dropna(inplace=True)

df.isnull().sum()

# Remove any spaces
blanks = []

# (index, label, review text)
for i,lb,rv, in df.itertuples():
    if rv.isspace():
        blanks.append(i)
# Blanks contains all the indexes of empty strings in df        
df.drop(blanks,inplace=True)
len(df)

# Split into training and test set
from sklearn.model_selection import train_test_split

X = df['review']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, test_size=.33)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])
text_clf.fit(X_train, y_train)

predictions = text_clf.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Class Reports
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

 





Semantics and Sentiment Analysis


In [None]:
import spacy
nlp = spacy.load(r'en_core_web_lg\en_core_web_lg-3.2.0')

nlp(u'The quick brown fox jumped').vector.shape
# Returns 300. 300 means that there are 300 dimensions in the vector in the document. The document is just the average of all the singular vectors

nlp(u'fox').vector.shape
tokens = nlp(u'like love hate')
print(f'{"Tok1":10} {"Tok2":10} {"Similarity":10}')

for token1 in tokens:
    for token2 in tokens:
        # print(token1.text, token2.text, token1.similarity(token2))      
        print(f'{token1.text:10} {token2.text:10} {token1.similarity(token2):10}')

nlp.vocab.vectors.shape
# (684831, 300) --> (amtOfWords, dimensions)

tokens = nlp(u'dog cat John')
print('\n')
# OOV = Out of Vocabulary
for token in tokens:
    print(f"{token.text:10} {token.has_vector:10} {token.vector_norm:10} {token.is_oov:10}")
    
    
from scipy import spatial 
cosine_similarity = lambda vec1, vec2: 1-spatial.distance.cosine(vec1, vec2)

king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

new_vector = king - man + woman
computed_similarities = []

# FOR ALL WORDS IN MY VOCAB. Basically all 684000 words
"""
for loop through nlp.vocab.vectors gives back the hash value of the lexemes
we need to first transform the hash value back to lexeme object
"""
for ID in nlp.vocab.vectors:

    word = nlp.vocab[ID]
    if word.has_vector:
        if word.is_lower:
            # Is the word a number
            if word.is_alpha:
                similarity = cosine_similarity(new_vector, word.vector)
                computed_similarities.append((word, similarity))
 
 
# -item[1] is descending order. Descending gives you most similar, ascending gives you least similar
computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])
sim = [t[0].text for t in computed_similarities[:10]]
sim
new_vector




   

In [None]:
import spacy
import pandas as pd
from scipy import spatial
nlp = spacy.load(r'site-packages\en_core_web_lg\en_core_web_lg-3.2.0')

def most_similar(vect):
    """Return the ten vocabulary words closest to ``vect`` and their similarities.

    Every entry of ``nlp.vocab.vectors`` that has a vector, is lower-case and
    is alphabetic is scored by cosine similarity against the vector of
    ``vect``. The ten best-scoring words are returned together with the
    ``Doc.similarity`` of ``nlp(vect)`` to each of them.

    Returns
    -------
    tuple[list, list]
        ``(words, similarities)`` in descending order of cosine similarity.
    """
    query_vector = nlp.vocab[vect].vector
    query_doc = nlp(vect)

    def cosine_similarity(left, right):
        # scipy returns a cosine *distance*; convert it to a similarity.
        return 1 - spatial.distance.cosine(left, right)

    scored = []
    for key in nlp.vocab.vectors:
        lexeme = nlp.vocab[key]
        if not (lexeme.has_vector and lexeme.is_lower and lexeme.is_alpha):
            continue
        scored.append((lexeme, cosine_similarity(query_vector, lexeme.vector)))

    # Negated key sorts from most to least similar.
    ranked = sorted(scored, key=lambda pair: -pair[1])
    sim = [lexeme.text for lexeme, _ in ranked[:10]]
    get_sim = [query_doc.similarity(nlp(word)) for word in sim]
    return sim, get_sim


most_similar('compuetwe')
    
                    
                    
        


In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv(r'TextFiles\complaints.csv')

from sklearn.model_selection import train_test_split


df.isnull().sum()
df.dropna(inplace=True)

y = df['Product']
X = df['Consumer complaint narrative']


# Sanity check: print the index label of any row whose 'Product' is still missing.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for row_label, product in df['Product'].items():
    if pd.isnull(product):
        print(row_label, product)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])
text_clf.fit(X_train, y_train)

predictions = text_clf.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# print(confusion_matrix(y_test, predictions))
# print(classification_report(y_test, predictions))
# print(accuracy_score(y_test, predictions))
       
print(text_clf.predict(['I have outdated information on my credit report that I have previously disputed thas has yet to be removed']))
df['Product'].value_counts()

In [None]:
import nltk
# nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
a = 'This is a good movie'
sid.polarity_scores(a)

a = 'This was the best, most awesome movie EVER MADE!!!!'
sid.polarity_scores(a)

a = "This was the WORST movie I has ever disgraced the screen."
sid.polarity_scores(a)


import pandas as pd
df = pd.read_csv(r'TextFiles\amazonreviews.tsv', sep='\t')
df.head()

# To see how many positive or negative reviews we have
df['label'].value_counts()
df.dropna(inplace=True)


blanks = []
for i, lb, rv in df.itertuples():
    # (index, label, review)
    if type(rv) == str:
        if rv.isspace():
            blanks.append(i)

# NO blanks but if we did it would be: 
# df.drop(blanks, inplace=True)
sid.polarity_scores(df.iloc[0]['review'])

df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df.head()

df['compound'] = df['scores'].apply(lambda d: d['compound'])
df.head()

df['comp_score'] = df['compound'].apply(lambda score: 'pos' if score >= 0 else 'neg')
df.head()


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print(accuracy_score(df['label'], df['comp_score']))
print(classification_report(df['label'], df['comp_score']))
print(confusion_matrix(df['label'], df['comp_score']))


Applying Sentiment Analysis

In [None]:
import numpy as np
import pandas as pd

df = pd.read_csv(r'TextFiles\moviereviews.tsv', sep='\t')
df.head()
df['label'].value_counts()

df.dropna(inplace=True)

blanks = []
for i, lb, rv in df.itertuples():
    if type(rv) == str:
        if rv.isspace():
            blanks.append(i)
            
df.drop(blanks, inplace=True)

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df.head()

df['compound'] = df['scores'].apply(lambda d: d['compound'])
df.head()

df['comp_score'] = df['compound'].apply(lambda score: 'pos' if score >= 0 else 'neg')


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print(accuracy_score(df['label'], df['comp_score']))
print(classification_report(df['label'], df['comp_score']))
print(confusion_matrix(df['label'], df['comp_score']))







In [None]:
import praw
import pandas as pd
import nltk
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import random

nlp = spacy.load('en_core_web_sm')
sid = SentimentIntensityAnalyzer()

posts = []
reddit = praw.Reddit(client_id='', client_secret='', user_agent='Webscrape')



# Get first 20 hot posts from r/LeagueOfLegends
hot_posts = reddit.subreddit('all').hot(limit=100)
for post in hot_posts:
    # posts.append([post.title, post.score, post.subreddit, post.id, post.url, post.num_comments, post.selftext])
    posts.append([post.title, post.subreddit])
posts

In [None]:
import praw
import pandas as pd
import nltk
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import random

nlp = spacy.load('en_core_web_sm')
sid = SentimentIntensityAnalyzer()

import os

posts = []
# Credentials are read from the environment so they are never stored in the
# notebook. Set REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET before running; a
# missing variable raises KeyError here instead of failing later inside praw.
# NOTE(review): the id/secret that used to be hard-coded on this line should be
# revoked, since they have been committed in plain text.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent='Webscrape',
)

# Collect up to 100 "hot" posts from r/all, keeping only title and subreddit.
hot_posts = reddit.subreddit('all').hot(limit=100)
for post in hot_posts:
    posts.append([post.title, post.subreddit])
        
# posts = pd.DataFrame(posts, columns=['title', 'score', 'subreddit',  'id', 'url', 'num_comments', 'body'])
posts = pd.DataFrame(posts, columns=['title', 'subreddit'])

worldTitles = posts['title'].values
def clean_up(titles=posts['title'].values):
    """Split each title into word tokens and drop spaCy stop words.

    Parameters
    ----------
    titles : iterable of str
        Titles to clean. Defaults to the ``posts['title']`` values captured
        when this function is defined.

    Returns
    -------
    list[list[str]]
        One list per title holding its ``\\w+`` tokens, in their original
        casing, minus any token whose lower-cased form is a stop word.
    """
    stop_words = set(nlp.Defaults.stop_words)
    # Built once: the tokenizer does not depend on the title being processed.
    tokenizer = nltk.RegexpTokenizer(r"\w+")

    # Iterate the `titles` argument; the global `worldTitles` was used here
    # before, so the parameter had no effect.
    return [
        [word for word in tokenizer.tokenize(title) if word.lower() not in stop_words]
        for title in titles
    ]


cv = CountVectorizer(max_df = .9, min_df=2, stop_words='english')
dtm = cv.fit_transform(posts['title'])

# LDA approach    
LDA = LatentDirichletAllocation(n_components=7, random_state=42)
LDA.fit(dtm)    

# NMF approach
# max_df=.9 ignores terms that appear in more than 90% of the titles, matching
# the CountVectorizer above. An integer max_df is an absolute document count,
# so the previous value of 9 dropped every term found in more than 9 titles.
tfidf = TfidfVectorizer(max_df=.9, min_df=2, stop_words='english')
dtm1 = tfidf.fit_transform(posts['title'])

nmf_model = NMF(n_components=7, random_state=42)
nmf_model.fit(dtm1)


# randrange excludes its upper bound, so the id is always a valid vocabulary
# index; randint(0, n) can return n itself, which is one past the end.
random_word_id = random.randrange(len(cv.get_feature_names_out()))


# Get the topics
single_topic = LDA.components_[0]

top_five_words = single_topic.argsort()[-10:]


# For every LDA topic, list its 15 highest-weighted vocabulary terms.
# argsort is ascending, so the last 15 positions hold the largest weights.
lda_vocab = cv.get_feature_names_out()   # fetched once instead of once per word
for index, topic in enumerate(LDA.components_):
    # The header now says 15 to match the number of words actually printed.
    print(f"THE TOP 15 WORDS FOR TOPIC #{index} BY LDA")
    print([lda_vocab[word_id] for word_id in topic.argsort()[-15:]])
    print('\n')



for index, topic in enumerate(nmf_model.components_):
    print(f"THE TOP 15 WORDS FOR TOPIC #{index} BY NMF")
    print([tfidf.get_feature_names_out()[index] for index in topic.argsort()[-15:]])
    print('\n')
   
topic_result = nmf_model.transform(dtm1)

#----------------------------------- FOR DATAFRAME ------------------------------------#
posts['scores'] = posts['title'].apply(lambda title: sid.polarity_scores(title))

# posts['negative'] = posts['scores'].apply(lambda p: p['neg'])
# posts['neutral'] = posts['scores'].apply(lambda p: p['neu'])
# posts['positive'] = posts['scores'].apply(lambda p: p['pos'])
posts['compound'] = posts['scores'].apply(lambda p: p['compound'])


posts['rating'] = posts['compound'].apply(lambda score: 'pos' if score > 0 else 'neg' if score < 0 else 'neutral')
#----------------------------------- FOR DATAFRAME ------------------------------------#



# clean_up(worldTitles)
posts['rating'].value_counts()
pd.set_option('display.max_colwidth', None)

topic_results = LDA.transform(dtm)
posts['Topic By LDA'] = topic_results.argmax(axis=1)
posts['Topic by NMF'] = topic_result.argmax(axis=1)
posts



In [None]:
import pandas as pd

npr = pd.read_csv(r'05-Topic-Modeling\npr.csv')
npr.head()

# npr['Article'][3]
# len(npr)

from sklearn.feature_extraction.text import CountVectorizer

# Removes words that appear in 90% of the documents, words that show up a minimum amout of times (min_df can either be a ratio or has to appear in atleast 2 documents), remove stop_words 
cv = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')
dtm = cv.fit_transform(npr['Article'])
dtm
# <11992x54777> Articles, Terms
from sklearn.decomposition import LatentDirichletAllocation
# n_components, how many topics do I want. International, Local, National Politics
LDA = LatentDirichletAllocation(n_components=7, random_state=42)
LDA.fit(dtm)

# Grab the Vocabulary of Words
len(cv.get_feature_names_out())

import random
# randrange excludes its upper bound, so the id is always a valid index;
# randint(0, n) can return n and make the lookup below raise IndexError.
random_word_id = random.randrange(len(cv.get_feature_names_out()))
cv.get_feature_names_out()[random_word_id]

# Grab the topics
# LDA.components_
# First topic
single_topic = LDA.components_[0]

# Take the single topics and figure out which index position we should be looking at for high probability words in the single topic.
single_topic.argsort()

import numpy as np
arr = np.array([10, 200, 1])
# Gives the index position that would sort this
arr.argsort()

# grab the last 10 values of argsort
# single_topic.argsort()[-10:]
top_ten_words = single_topic.argsort()[-10:]

for index in top_ten_words:
    print(cv.get_feature_names_out()[index])


# Grab the highest probabilty words per topic

# LDA components are just hte topics of the article
# Of each topic, we are gettin g the top 15 words
for index , topic in enumerate(LDA.components_):
    print(f"THE TOP 15 WORDS FOR TOPIC #{index}")
    print([cv.get_feature_names_out()[index] for index in topic.argsort()[-15:]])
    print('\n')
    print('\n')
    
    
topic_results = LDA.transform(dtm)
# Array (11992, 7) Articles, Topics
# Probability that a documents belongs to a particular topic ^^^

npr['Topic'] = topic_results.argmax(axis=1)
npr




In [None]:
import pandas as pd

npr = pd.read_csv(r'05-Topic-Modeling\npr.csv')

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_df=.9, min_df=2, stop_words='english')
dtm = tfidf.fit_transform(npr['Article'])
dtm 

from sklearn.decomposition import NMF
nmf_model = NMF(n_components=7, random_state=42)
nmf_model.fit(dtm)


# LDA words with high probability
# NMF words with high coeffecient values
for index, topic in enumerate(nmf_model.components_):
    print(f'The top 15 results for topic # {index}')
    print([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print('\n')

topic_results = nmf_model.transform(dtm)

topic_results.argmax(axis=1)
npr['Topic'] = topic_results.argmax(axis=1)

mytopic_dict = {0: 'Health', 1: 'Politics', 2:'Legislation', 3:'Foreign Affairs', 4: 'Election', 5: 'Music', 6: 'Education'}
npr['Topic Label'] = npr['Topic'].map(mytopic_dict)
npr.head()


In [None]:
import numpy as np
from sklearn.datasets import load_iris

iris = load_iris()
# print(iris.DESCR)

X = iris.data
X
# (SL, SW, PL, PW)
# X

y = iris.target
# y 

# Want a vector that is zero for every value that the match does not match up
# One Hot Encoding
# class 0 --> [1, 0, 0]
# class 1 --> [0, 1, 0]
# class 2 --> [0, 0, 1]

from tensorflow.keras.utils import to_categorical
y = to_categorical(y)
# y

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.33)

# For Neural networks it is good to scale or standardize your data
from sklearn.preprocessing import MinMaxScaler
# np.array([5, 10, 15, 20])/ 20
scaler_object = MinMaxScaler()
scaler_object.fit(X_train)


scaled_X_train = scaler_object.transform(X_train)
# print(scaled_X_train)
scaled_X_test = scaler_object.transform(X_test)

# scaled_X_train
from keras.models import Sequential
from keras.layers import Dense
from keras import metrics

model = Sequential()
# Dense(neurons, )
model.add(Dense(8, input_dim=4, activation='relu'))
model.add(Dense(8, input_dim=4, activation='relu'))

# Output layer
# 3 Neurons because each neuron will have a probability in each particular class [0, 0, 1] [1,0,0] [0,1,0]
# Will be in probability / percentages
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


# model.summary()

# Train before predicting: with the fit call commented out, the predictions,
# the metrics below and the saved model all come from an untrained network.
# `verbose` controls how much progress information is printed during training.
model.fit(scaled_X_train, y_train, epochs=150, verbose=2)

# Predict on new unseen data.
# New inputs have to go through the same fitted scaler as the training data.
predict_x = model.predict(scaled_X_test)
# The softmax output has one score per class; argmax picks the predicted class index.
classes_x = np.argmax(predict_x, axis=1)
classes_x

# on y_test: the [00000000,11111,222222]
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
print(confusion_matrix(y_test.argmax(axis=1), classes_x))
print(classification_report(y_test.argmax(axis=1), classes_x))

model.save('myfirstmodel.h5')
from keras.models import load_model
new_model = load_model('myfirstmodel.h5')
                                                


In [None]:
def read_file(filepath):
    """Return the complete contents of the text file at ``filepath`` as one string.

    The file is opened in text mode with the platform's default encoding.
    """
    handle = open(filepath)
    try:
        return handle.read()
    finally:
        # Always release the file handle, even if reading fails
        handle.close()

# read_file(r'06-Deep-Learning\moby_dick_four_chapters.txt')

# read_file(r'06-Deep-Learning\melville-moby_dick.txt')


import spacy 
# Load the large English spaCy pipeline from a local directory with the parser,
# tagger and NER components disabled; the code below only reads token.text.
nlp = spacy.load(r'en_core_web_lg\en_core_web_lg-3.2.0', disable=['parser', 'tagger', 'ner'])
# Raise the maximum text length nlp() accepts so a long text can be processed in one call
nlp.max_length = 1198623


def seperate_punc(doc_text):
    """Tokenize ``doc_text`` with the module-level ``nlp`` pipeline and return
    the lower-cased token texts, leaving out punctuation and whitespace tokens.

    A token is dropped when its text occurs as a substring of the ``unwanted``
    string below (this is a substring test, not a per-character membership test).
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '
    kept = []
    for token in nlp(doc_text):
        if token.text in unwanted:
            continue
        kept.append(token.text.lower())
    return kept

d = read_file(r'moby_dick_four_chapters.txt')

tokens = seperate_punc(d)
# len(tokens)

# 25 words --> network predict #26
train_len = 25 + 1

# Slide a window of train_len tokens over the text, moving one token at a time.
# `end` is the exclusive end index of each window, so it has to run up to and
# including len(tokens); stopping at len(tokens) - 1 would silently drop the
# final window (the one that ends on the last token).
text_sequence = [tokens[end - train_len:end] for end in range(train_len, len(tokens) + 1)]
    
# ' '.join(text_sequence[0]) First sentence ->>> call me ishmael
# ' '.join(text_sequence[1]) Second sentence shifted by 1 ->>> me ishmael some 
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
import numpy as np
from tensorflow.keras.utils import to_categorical

# Learn a word -> integer id mapping from the token windows
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequence)

# Dictionary with index and word. The number is not a count, just a unique ID:
# for i in sequences[0]:
#    print(f"{i} : {tokenizer.index_word[i]}")
tokenizer.index_word

# tokenizer.word_counts holds one entry per distinct word
vocabulary_size = len(tokenizer.word_counts)

# Encode every window as a row of integer ids
sequences = np.array(tokenizer.texts_to_sequences(text_sequence))
sequences

# Perform the feature/target split: every column except the last one is the
# input, the last column is the word to predict.
X, y = sequences[:, :-1], sequences[:, -1]

# One-hot encode the targets; +1 because index 0 is not assigned to any word
y = to_categorical(y, num_classes=vocabulary_size + 1)

# Number of input words per sample
seq_len = X.shape[1]
# Sequences, how many words per sequence (11312, 25) X.shape

from keras.models import Sequential
# Dense for layers, LSTM to deal with sequences, embedding for vocabulary
from keras.layers import Dense, LSTM, Embedding


# Input, Output 
def create_model(vocabulary_size, seq_len):
    """Build, compile and summarise the next-word prediction network.

    Args:
        vocabulary_size: number of classes; used as the embedding input size
            and as the width of the softmax output layer.
        seq_len: number of input tokens per sample; also used as the embedding
            width and (doubled) as the size of the first LSTM layer.

    Returns:
        The compiled Sequential model (its summary is printed as a side effect).
    """
    layer_stack = [
        Embedding(vocabulary_size, seq_len, input_length=seq_len),
        # return_sequences=True so the next LSTM receives the whole sequence
        LSTM(seq_len*2, return_sequences=True),
        LSTM(50),
        Dense(50, activation='relu'),
        # One probability per vocabulary entry
        Dense(vocabulary_size, activation='softmax'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

# +1 because index 0 is not assigned to any word by the tokenizer
model = create_model(vocabulary_size+1, seq_len)

# Save the file and load it later
from pickle import dump, load
model.fit(X, y, batch_size=128, epochs=2, verbose=1)

model.save('my_mobydick_model.h5')
# Use a context manager so the tokenizer file is flushed and closed
# deterministically instead of leaking the handle returned by open().
with open('my_simpletokenizer', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)


from keras.preprocessing.sequence import pad_sequences

def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    """Generate ``num_gen_words`` words by repeatedly predicting the next word.

    Args:
        model: trained network whose ``predict`` returns one score per word id.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and ``index_word``.
        seq_len: number of token ids fed to the model per prediction.
        seed_text: starting text; each predicted word is appended to it for the
            next prediction step.
        num_gen_words: how many words to generate.

    Returns:
        The generated words joined by single spaces (the seed text is not included).
    """
    generated = []
    running_text = seed_text

    for _ in range(num_gen_words):
        token_ids = tokenizer.texts_to_sequences([running_text])[0]

        # Force the input to exactly seq_len ids: longer inputs are truncated
        # ('pre'), shorter ones are padded.
        window = pad_sequences([token_ids], maxlen=seq_len, truncating='pre')

        # One score per word id; take the id with the highest score
        probabilities = model.predict(window, verbose=0)[0]
        best_id = np.argmax(probabilities, axis=0)

        next_word = tokenizer.index_word[best_id]
        generated.append(next_word)
        running_text = running_text + ' ' + next_word

    return ' '.join(generated)

# generate_text(model, tokenizer, seq_len, seed_text, num_gen_words)


import random
random.seed(101)
# randrange excludes its upper bound. randint(0, len(text_sequence)) includes
# it and could return len(text_sequence), which is not a valid index.
random_pick = random.randrange(len(text_sequence))

random_seed_text = text_sequence[random_pick]

seed_text = ' '.join(random_seed_text)

generate_text(model, tokenizer, seq_len, seed_text=seed_text, num_gen_words=25)

import pickle
from keras.models import load_model

# Reload the artifacts written after training. The model is an HDF5 file and
# has to be read with load_model (not open()/pickle); the file name must match
# the one passed to model.save above.
filename = r'my_mobydick_model.h5'
loaded_model = load_model(filename)

# NOTE: pickle.load can execute arbitrary code; only load tokenizer files that
# were produced by this notebook.
with open('my_simpletokenizer', 'rb') as f:
    loaded_tokenizer = pickle.load(f)

from keras.datasets import mnist


# X_train, X_test, y_train, y_test
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Keep a reference to the raw image tensor. The preprocessing below rebinds
# train_images to a flattened 2D float array, while the tensor-slicing examples
# at the end of this cell need the original (samples, height, width) layout.
raw_train_images = train_images

# 60,000 images, each 28x28 pixels
train_images.shape

# Feed the neural network the training data -> train_images, train_labels
# It will learn to associate images and labels
# Ask to produce predictions for the testing data

from keras import models, layers

network = models.Sequential()
network.add(layers.Dense(512, activation='relu', input_shape=(28 * 28,)))
network.add(layers.Dense(10, activation='softmax'))


# Layer - data-processing module that acts as a filter for data.
# Deep learning consists of chaining simple layers that implement a form of
# progressive data distillation: each layer extracts a more useful
# representation from the data it is given.

# Make the model ready for training
#    -Loss function: How the network will measure its performance on training data (Steer in right direction)
#    -Optimizer: Mechanism where the network updates itself based on the data it sees and the loss function
#    -Metrics to monitor during training and testing: here, accuracy


# The Compilation step
network.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])


# Previously stored in shape (60000, 28, 28). Transform into shape (60000, 28 * 28) with values between 0 and 1
train_images = train_images.reshape((train_images.shape[0], 28 * 28))
train_images = train_images.astype('float32') / 255

test_images = test_images.reshape((test_images.shape[0], 28 * 28))
test_images = test_images.astype('float32') / 255


# Categorically encode the labels
# One Hot Encoding
from tensorflow.keras.utils import to_categorical

train_labels = to_categorical(train_labels)
test_labels = to_categorical(test_labels)

# Ready to fit the model to its training data
network.fit(train_images, train_labels, epochs=5, batch_size=128)

# Two quantities are displayed during training: the loss of the network over the training data and its accuracy
# In the original run the model reached a training accuracy of .989 (98.9%)
# Check how the model performs on the test set
test_loss, test_acc = network.evaluate(test_images, test_labels)

# print('test_loss:', test_loss)
# print('test_acc:', test_acc)
# Test loss was about .07 in the original run. Model is slightly overfitted: performs worse on test data than train

# ---------------------------------------------------------------
# Tensor = Arrays

# ndim tells you the number of axes of the tensor. For the raw images: 3
# print(raw_train_images.ndim)

# The raw data is a 3D tensor of 8-bit integers: an array of 60000 matrices of 28*28 integers.
# Each matrix is a grayscale image, with values 0-255
# print(raw_train_images.shape) --> (60000, 28, 28)
# print(raw_train_images.dtype) --> uint8


# ------------------------
# Tensor slicing. These examples use the raw 3D tensor, because train_images
# is 2D after the reshape above and cannot be indexed with three axes.
my_slice = raw_train_images[10:100]
# Shape (90, 28, 28)
# Select 14 x 14 pixels in the bottom-right corner of all images:
my_slice = raw_train_images[:, 14:, 14:]




Text Generation:
-LSTM(Long Short-Term Memory)
Random sampling of the next word produces more realistic and less repetitive text.


In [None]:




# pickle is imported here so this cell does not depend on an earlier cell having run
import pickle

import numpy as np

# NOTE: pickle.load can execute arbitrary code; only load these files if they
# come from a trusted source.
with open(r'train_qa.txt', 'rb') as f:
    train_data = pickle.load(f)

with open(r'test_qa.txt', 'rb') as f:
    test_data = pickle.load(f)

# Each entry is a (story, question, answer) triple; story and question are
# lists of word tokens. Quick inspection:
# len(train_data)
# ' '.join(train_data[0][0])
# train_data[0][2]

all_data = test_data + train_data
# len(all_data)


# Collect every distinct word that occurs in any story or question.
# update() adds in place instead of building a new set on every iteration.
vocab = set()

for story, question, answer in all_data:
    vocab.update(story)
    vocab.update(question)

# The two possible answers must be part of the vocabulary as well
vocab.add('no')
vocab.add('yes')

# +1 because index 0 is reserved for padding
vocab_len = len(vocab) + 1

# Longest story / question, used later as padding lengths
all_story_lens = [len(data[0]) for data in all_data]
max_story_len = max(all_story_lens)

max_question_len = max([len(data[1]) for data in all_data])


from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# filters=[] keeps every character, so no token is stripped
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

# tokenizer.word_index maps each word to its integer id

train_story_text = []
train_question_text = []
train_answers = []

for story, question, answers in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answers.append(answers)


# Stories encoded as lists of word ids
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len, max_question_len=max_question_len):
    """Convert (story, question, answer) triples into padded id arrays.

    Args:
        data: iterable of (story, question, answer); story and question are
            sequences of words, answer is a single word.
        word_index: mapping from word to integer id.
        max_story_len: length every story is padded to.
        max_question_len: length every question is padded to.

    Returns:
        A tuple (stories, questions, answers): the padded story ids, the padded
        question ids, and one one-hot row per answer.
    """
    story_ids, question_ids, targets = [], [], []

    # Since we're using pad sequence, index zero is taken, hence the +1
    target_width = len(word_index) + 1

    for story, question, answer in data:
        # Example: [23, 14, 15....]
        story_ids.append([word_index[word.lower()] for word in story])
        question_ids.append([word_index[word.lower()] for word in question])

        # One-hot target: 1 at the id of the answer word, 0 elsewhere
        one_hot = np.zeros(target_width)
        one_hot[word_index[answer]] = 1
        targets.append(one_hot)

    padded_stories = pad_sequences(story_ids, maxlen=max_story_len)
    padded_questions = pad_sequences(question_ids, maxlen=max_question_len)
    return (padded_stories, padded_questions, np.array(targets))

inputs_train, queries_train, answers_train = vectorize_stories(train_data)
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

inputs_test

# 34
tokenizer.word_index['yes']
# 4
tokenizer.word_index['no']
# Column-wise sum of the one-hot answers = number of test answers per word id
sum(answers_test)

# 503: No, 496: Yes


from keras.models import Sequential, Model
# Embedding is imported from keras.layers, like everywhere else in this
# notebook; the keras.layers.embeddings module path no longer exists in newer
# Keras releases.
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM, Embedding

# PLACEHOLDER shape=(max_story_len, batch_size)
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

# vocab_len
vocab_size = len(vocab) + 1

# INPUT ENCODER M
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
input_encoder_m.add(Dropout(0.3))

# (samples, stories_maxlen, embedding_dim)

# Input ENCODER C
# The embedding width is max_question_len so that this output can be added to
# the story/question match matrix below.
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))

# OUTPUT
# (samples, story_maxlen, max_question_len)


# QUESTION ENCODER
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64,input_length=max_question_len))
question_encoder.add(Dropout(0.3))

# (samples, question_maxlen, embedding_dim)


# Result of Encoder -> Encoded
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

# Dot product over the embedding axis of the story and question encodings,
# followed by a softmax
match = dot([input_encoded_m, question_encoded], axes=(2,2))
match = Activation('softmax')(match)


# Converting to have output of samples by question max_len to story max_len
response = add([match, input_encoded_c])
response = Permute((2,1))(response)

answer = concatenate([response, question_encoded])


answer = LSTM(32)(answer)
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer) # (samples,vocab_size) # YES/NO 0000

# Probability matrix
answer = Activation('softmax')(answer)

model = Model([input_sequence, question], answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

        




##   Import the Libraries
*   First step: Import the required libraries

In [None]:
# Keras module for building LSTM
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.callbacks import EarlyStopping
import keras.utils as ku

# Set seeds for reproducibility
from tensorflow.random import set_seed
from numpy.random import seed

# Seed TensorFlow's and NumPy's global random number generators so that
# results that depend on random numbers repeat from run to run.
set_seed(2)
seed(1)

import pandas as pd
import numpy as np
import string, os


# Directory that holds the article CSV files
curr_dir = r'\Desktop\Data'
all_headlines = []

for filename in os.listdir(curr_dir):
    if 'Articles' in filename:
        # os.path.join uses the correct separator for the current platform
        curr_file = os.path.join(curr_dir, filename)
        print(filename, curr_file)
        article_df = pd.read_csv(curr_file)
        all_headlines.extend(list(article_df.headline.values))
        # Only the first matching file is used
        break

# Drop placeholder headlines
all_headlines = [h for h in all_headlines if h != 'Unknown']
# len = 831
# len = 831







In [None]:
# Clear this cell's output area; wait=True delays the clearing until new output is available
from IPython.display import clear_output
clear_output(wait=True)

#  Data Preparation
###   Dataset Cleaning

*   Clean the text: remove punctuation and convert everything to lower case.
   

In [None]:
def clean_text(txt):
    """Return ``txt`` lower-cased, without ASCII punctuation and without
    any non-ASCII characters.

    Characters listed in ``string.punctuation`` are removed first; afterwards
    every character that cannot be represented in ASCII is dropped.
    """
    kept_chars = [ch for ch in txt if ch not in string.punctuation]
    lowered = ''.join(kept_chars).lower()
    # Round-trip through bytes: decoding as ASCII with 'ignore' discards
    # everything outside the ASCII range.
    utf8_bytes = lowered.encode('utf8')
    return utf8_bytes.decode('ascii', 'ignore')

# Clean every headline, then show the first ten raw and cleaned headlines
corpus = list(map(clean_text, all_headlines))
print(all_headlines[:10], '\n\n')
corpus[:10]

### Generating Sequences of N-gram Tokens
*   Language modeling requires sequences as input data: given a sequence of words/tokens, the aim is to predict the next word/token.
*   Next Step is tokenization

In [None]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the module-level ``tokenizer`` on ``corpus`` and build n-gram sequences.

    For every line, each prefix of its token ids with at least two ids is
    emitted as one training sequence.

    Returns:
        (input_sequences, total_words): the list of id prefixes and the
        vocabulary size plus one.
    """
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    # Convert data to sequence of tokens
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        # Prefixes of length 2, 3, ..., len(token_list)
        input_sequences.extend(token_list[:end] for end in range(2, len(token_list) + 1))

    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]


    
    

*   Every list of numbers represents one n-gram phrase generated from the input data.
*   Every integer corresponds to the index of a particular word in the complete vocabulary of words present in the text.

### Padding the Sequences to obtain variables
*   Since every headline may have a different length, the sequences have to be padded so that their lengths are equal.

*   To input the data into a learning model, you have to create predictors and labels

### Headline: they are learning data science
| Predictors            | Label           |
|-----------------------|-----------------|
| they                  | are             |
| they are              | learning        |
| they are learning     | data            |
| they are learning data| science         |

In [None]:
from tensorflow.keras.utils import to_categorical

def generate_padded_sequences(input_sequences):
    """Pad the n-gram sequences to a common length and split them into
    predictors and one-hot labels.

    The last id of every padded row is the label; everything before it is the
    predictor. The label width comes from the module-level ``total_words``.

    Returns:
        (predictors, label, max_sequence_len)
    """
    max_sequence_len = max(len(seq) for seq in input_sequences)

    # Shorter sequences are padded at the front ('pre')
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    input_sequences = np.array(padded)

    predictors = input_sequences[:, :-1]
    label = to_categorical(input_sequences[:, -1], num_classes= total_words)

    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

## LSTMs for Text Generation


In [None]:
def create_model(max_sequence_len, total_words):
    """Build and compile the headline next-word model.

    Args:
        max_sequence_len: length of the padded sequences including the label;
            the model input is one token shorter.
        total_words: number of classes (embedding input size and softmax width).

    Returns:
        The compiled Sequential model.
    """
    # The last token of each padded sequence is the label, not an input
    input_len = max_sequence_len - 1

    layers_in_order = (
        # Input embedding layer
        Embedding(total_words, 10, input_length=input_len),
        # Hidden layer 1 - LSTM layer
        LSTM(100),
        Dropout(.1),
        # Output layer
        Dense(total_words, activation='softmax'),
    )

    model = Sequential()
    for layer in layers_in_order:
        model.add(layer)

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

model = create_model(max_sequence_len, total_words)
model.summary()
    
    
    
    

# Train Model


In [None]:

# verbose=2 prints one summary line per epoch. Keras documents only "auto", 0,
# 1 and 2 for this argument, so the previous value 5 was outside that set.
model.fit(predictors, label, epochs=5, verbose=2)



# Save the trained headline model
model.save('temporary.h5')




# Generating the Text
* Text can be generated once the model has finished training
* The function predicts the next word based on the input words (seed text)
* Tokenize the seed text -> pad the sequence -> pass it into the trained model to get the predicted next word

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by ``next_words`` predicted words.

    Args:
        seed_text: starting text; every predicted word is appended to it and
            becomes part of the input for the next prediction.
        next_words: number of words to generate.
        model: trained model whose ``predict`` returns one score per word id.
        max_sequence_len: padded sequence length used during training
            (the model input is one token shorter).

    Returns:
        The seed text followed by the generated words, in title case.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # verbose=0 keeps predict from printing a progress bar for every word
        probabilities = model.predict(token_list, verbose=0)
        # argmax over the class axis yields a one-element array; take the scalar id
        predicted_index = int(np.argmax(probabilities, axis=1)[0])

        # Direct id -> word lookup. Ids without a word (such as the padding
        # id 0) give an empty string.
        output_word = tokenizer.index_word.get(predicted_index, "")
        seed_text += " "+output_word
    return seed_text.title()



# Use the headline model trained above. `new_model` is the Iris classifier
# loaded from 'myfirstmodel.h5' earlier in the notebook, not a text model.
generate_text('Covid', 8, model, max_sequence_len)
# generate_text('Obama', 8, model, max_sequence_len)