In [1]:
import tensorflow as tf
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import re
import string
import numpy as np
import pandas as pd
import nltk
# Fetch the WordNet corpus; the WordNetLemmatizer imported above is used
# later in this notebook for lemmatization.
nltk.download('wordnet')




[nltk_data] Downloading package wordnet to C:\Users\Manish
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Report, for each lookup resource used by the preprocessing step, whether it
# is already defined in the current namespace.
var_name = ["acronyms_dict", "contractions_dict", "stops"]
for var in var_name:
    print("variable exists" if var in locals() else "variable does not exist")

variable does not exist
variable does not exist
variable does not exist


In [3]:
# Load the acronym / contraction lookups and the stopword list from disk, but
# only when at least one of them is not yet defined in this session (all three
# are reloaded together in that case).
if not all(
    name in globals()
    for name in ("acronyms_dict", "contractions_dict", "stops")
):
    acronyms_dict = pd.read_json("acronym.json", typ="series")
    contractions_dict = pd.read_json("contractions.json", typ="series")
    stopword_frame = pd.read_csv('stopwords.csv')
    stops = list(stopword_frame.values.flatten())

In [4]:
# Defining tokenizer: a token is a run of word characters and apostrophes
# (the apostrophe is kept so contractions survive as a single token).
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
regexp = RegexpTokenizer(r"[\w']+")

# Patterns, translation tables and the lemmatizer are built once here instead
# of on every call to preprocess().
_HTML_TAG_RE = re.compile(r'<.*?>')
# NOTE: the last range (U+24C2..U+1F251) is very broad and also covers many
# non-emoji characters (e.g. CJK ideographs); it is kept unchanged on purpose.
_EMOJI_RE = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002702-\U000027B0"
                       u"\U000024C2-\U0001F251"
                       "]+", flags=re.UNICODE)
# Strings starting with http:// or https://, or with "www."
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
# "@" followed by word characters (\w already includes "_")
_USERNAME_RE = re.compile(r'@\w+')
# Words containing the same character three or more times in a row
_REPEATED_CHAR_WORD_RE = re.compile(r'\b\w*?(.)\1{2,}\w*\b')
# Words of one or two characters
_SHORT_WORD_RE = re.compile(r'\b\w{1,2}\b')

# First pass deletes digits and all punctuation except "'" (needed for the
# contraction lookup) and "-" (replaced by a space afterwards).
_PUNCT_DIGITS_TABLE = str.maketrans(
    '', '',
    (string.punctuation + string.digits).replace("'", "").replace("-", ""),
)
# Second pass deletes all punctuation, including the remaining "'".
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_ALLOWED_CHARS = frozenset(string.ascii_letters + " ")
_LEMMATIZER = WordNetLemmatizer()


def _expand_tokens(text, mapping):
    """Tokenize ``text`` and replace each token present in ``mapping.index``
    by the whitespace-split words of ``mapping[token]``; join with spaces."""
    expanded = []
    for token in regexp.tokenize(text):
        if token in mapping.index:
            expanded.extend(mapping[token].split())
        else:
            expanded.append(token)
    return ' '.join(expanded)


def preprocess(text):
    """Normalize a raw text string into a cleaned, space-separated string.

    Steps, in order: lowercase and strip; remove HTML tags, emoji/symbol
    characters, URLs and @usernames; delete digits and punctuation (keeping
    "'" for now, turning "-" into a space); expand acronyms and contractions
    via the module-level ``acronyms_dict`` / ``contractions_dict`` lookups;
    delete remaining punctuation; lemmatize each token; drop tokens found in
    the module-level ``stops``; keep only ASCII letters and spaces; remove
    words with a character repeated three or more times consecutively; remove
    words of one or two characters.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text. The final short-word removal is not followed by a
        strip or whitespace collapse, so the result may contain leading,
        trailing or doubled spaces where such words were dropped.
    """
    text = text.lower().strip()                      # lowercase, outer whitespace

    text = _HTML_TAG_RE.sub('', text)                # html tags
    text = _EMOJI_RE.sub('', text)                   # emoji / symbol characters
    text = _URL_RE.sub('', text)                     # urls
    text = _USERNAME_RE.sub('', text)                # @twitter usernames

    text = text.translate(_PUNCT_DIGITS_TABLE)       # punctuation and numbers
    text = text.replace("-", " ")                    # "-" becomes a space

    text = _expand_tokens(text, acronyms_dict)       # acronyms
    text = _expand_tokens(text, contractions_dict)   # contractions

    text = text.translate(_PUNCT_TABLE)              # punctuation again to remove "'"

    # lemmatization
    text = " ".join(_LEMMATIZER.lemmatize(token) for token in regexp.tokenize(text))

    # Stopwords removal (a set gives constant-time membership tests)
    stop_words = set(stops)
    text = ' '.join(token for token in regexp.tokenize(text) if token not in stop_words)

    # Removing all characters except alphabets and " " (space)
    text = "".join(char for char in text if char in _ALLOWED_CHARS)

    # Removing words with one character occurring 3 or more times continuously
    text = _REPEATED_CHAR_WORD_RE.sub("", text).strip()

    # Removing words with less than 3 characters
    text = _SHORT_WORD_RE.sub("", text)

    # return final output
    return text

In [5]:
# Quick sanity check of the preprocessing pipeline on a short sentence.
preprocess("cat behind car")

'cat car'

In [6]:
# Load the saved Keras model from the relative path "transfer_tweet" and
# print its layer-by-layer summary.
model = tf.keras.models.load_model("transfer_tweet")
model.summary()



Model: "model_transfer_USE"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 reshape_3 (Reshape)         (None, 1, 512)            0         
                                                                 
 lstm_12 (LSTM)              (None, 1, 64)             147712    
                                                                 
 dropout_13 (Dropout)        (None, 1, 64)             0         
                                                                 
 lstm_13 (LSTM)              (None, 64)                33024     
                                                                 
 dense_17 (Dense)            (None, 32)                2080      
                                                                 
 dropout_14 (Dropout)        (None, 32)       

In [7]:
# Preprocess one example sentence and run it through the model. The cleaned
# text is wrapped in a list so the model receives a batch with a single item.
y_pred = model.predict([preprocess("Hamas bombs hospital, 2000 dead, catastrophic earthquakes")])
# Display the raw model output for that single item.
y_pred



array([[0.9986302]], dtype=float32)

In [8]:
# Round the model output to the nearest integer and drop the size-1
# dimensions to obtain a scalar label.
# NOTE(review): tf.round is documented as rounding half to even, so a score of
# exactly 0.5 would map to 0 - confirm that this threshold behaviour is intended.
tf.squeeze(tf.round(y_pred)).numpy()

1.0