In [31]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import re, string, unicodedata
import nltk
import json
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
import gensim.downloader as gensim_api

In [8]:
lst_dics = []
with open('Downloads/News_Category_Dataset_v2.json', mode ='r', errors ='ignore') as json_file:
         for dic in json_file:
             lst_dics.append(json.loads(dic))

In [9]:
##create dtf
dtf = pd.DataFrame(lst_dics)

#filter categories
dtf = dtf[dtf["category"].isin(['ENTERTAINMENT', 'WORLD NEWS', 'IMPACT', 'POLITICS'
                           , 'WEIRD NEWS', 'BLACK VOICES', 'WOMEN', 'COMEDY'
                           , 'TECH'])][["category", "headline"]]

dtf = dtf.rename(columns={"category":"y", "headline":"text"})

dtf.sample(5)

Unnamed: 0,y,text
36704,POLITICS,Trump's Sons Say Some People May Deserve Twitt...
13248,WORLD NEWS,A B-Minus For UN Conduct?
105037,COMEDY,10 Seasons Of Colbert Pulling Things Out From ...
170167,TECH,"Websites Vary Prices, Deals Based on Users' In..."
27603,POLITICS,Uh Oh -- Most Of Iowa's Obamacare Markets Coul...


In [24]:
'''
Preprocess a string.
:parameter
    :param text: string - name of column containing text
    :param lst_stopwords: list - list of stopwords to remove
    :param flg_stemm: bool - whether stemming is to be applied
    :param flg_lemm: bool - whether lemmitisation is to be applied
:return
    cleaned text
'''
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()    
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [25]:
lst_stopwords = nltk.corpus.stopwords.words("english")
lst_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [27]:
dtf["text_clean"] = dtf["text"].apply(lambda x:utils_preprocess_text(x,False, True
                                                                     , lst_stopwords))

In [28]:
dtf.head()

Unnamed: 0,y,text,text_clean
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,smith join diplo nicky jam 2018 world cup offi...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,hugh grant marries first time age 57
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,jim carrey blast castrato adam schiff democrat...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,julianna margulies us donald trump poop bag pi...
5,ENTERTAINMENT,Morgan Freeman 'Devastated' That Sexual Harass...,morgan freeman devastated sexual harassment cl...


In [32]:
n= gensim_api.load("glove-wiki-gigaword-300")



In [33]:
def Normalize(text):
    """
    normalize these sentences.
    normalization is a process that converts a list of words to a more uniform sequence
    1-transform the words to a standard format 
    2- remove ASCII values
    3- remove tags
    4- part of speech tagging and lemmatization

    Parameters
    ----------
    text : list of words
        DESCRIPTION.

    Returns
    -------
    lemma list

    """
    remove_punct_dict = dict((ord(punct),None) for punct in string.punctuation)
    
    #word tokenization
    word_token = nltk.word_tokenize(text.lower().translate(remove_punct_dict))
    
    #remove ASCII
    new_words = []
    for word in word_token:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
        
        
        
    #remove tags
    
    rmv = []
    for w in new_words:
        text = re.sub("&lt;/?.*?&gt;", "&lt;&gt;", w)
        rmv.append(text)
        
        
    #pos tagging and lemmatization
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    limitizer = WordNetLemmatizer()
    lemma_list = []
    
    rmv = [i for i in rmv if i]
    for token, tag in nltk.pos_tag(rmv):
        lemma = limitizer.lemmatize(token, tag_map[tag[0]])
        lemma_list.append(lemma)
        
        
    return lemma_list

In [37]:
Normalize(dtf["text_clean"][1])

['smith',
 'join',
 'diplo',
 'nicky',
 'jam',
 '2018',
 'world',
 'cup',
 'official',
 'song']

In [34]:
def _similarity(self, a, b):
    if self.similarity_type =='cosine':
        a = tf.nn.l2_normalize(a, -1)
        b = tf.nn.l2_normalize(b, -1)
        
        
    if self.similarity_type =='cosine' or self.similarity_type == 'inner':
        sim = tf.reduce_sum(tf.expand_dims(a,1)*b, -1)
        
        #simiilarity between intent embeddings
        sim_emb = tf.reduce_sum(b[:, 0:1, :]*b[:, 1:, :],-1)
        return sim, sim_emb
    
    
    
    else:
        raise ValueError("Wrong similarity type {}, ""should be 'cosine' or 'inner'""".format(self.similarity_type))
        