In [3]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
import pickle
import pandas as pd

In [5]:
import gensim

word2vec_path = "M:\MAP\GoogleNews-vectors-negative300.bin.gz"
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True,limit=500000)

In [20]:
test_doc='''
It's against nature': illegal wildlife trade casts shadow over traditional Chinese medicine
This is not as much a Chinese medicine practitioner issue, it is more the industry, the people who make money,” Lao said.
Richard Thomas, communications director at Traffic, said: “The issue is very much within the TCM consciousness.”Facebook Twitter Pinterest Traditional Chinese medicine products are dispensed at a hospital in Shanxi province.
Xi is firmly behind the idea of combining traditional Chinese and western medicine, and has encouraged the acceleration of research on TCM drugs.
Facebook Twitter Pinterest Traders wait for customers at a traditional Chinese medicine market in Bozhou, Anhui province, China.
Moves are also afoot globally to stop all use of endangered wildlife in traditional medicine.
'''
test_img='''
person retro business paper sign money travel vintage commerce old people text art dollar currency card vehicle bill symbol signalise
'''

In [21]:
def clean_text(test_doc):
    '''--------Basic clean--------'''
    test_doc=test_doc.replace("\r"," ")
    test_doc=test_doc.replace("\n"," ")
    
    # Remove 's
    test_doc=test_doc.replace('"','')
    test_doc=test_doc.replace("'s","")
    
    # To lowercase
    test_doc=test_doc.lower()
    
    # Remove signa
    punctuation_signs = list("?:!.,;")

    for punct_sign in punctuation_signs:
        test_doc = test_doc.replace(punct_sign, '')
        
    test_doc = test_doc.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    test_doc = test_doc.replace("  "," ")
    
    '''--------lemmatize--------'''
    wordnet_lemmatizer = WordNetLemmatizer()
    
     # Create an empty list containing lemmatized words
    lemmatized_list = []
    
    # Save the text and its words into an object
    text = test_doc
    text_words = text.split(" ")

    # Iterate through every word to lemmatize
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    # Join the list
    test_doc = " ".join(lemmatized_list)
    
    '''--------remove stopwords--------'''
    stop_words=list(stopwords.words('english'))

    for stop_word in stop_words:
        regex_stopword = r"\b" + stop_word + r"\b"
        test_doc = test_doc.replace(regex_stopword, '')
    
    return test_doc

In [22]:
clean_doc=clean_text(test_doc)
clean_img=clean_text(test_img)

In [23]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
tokenized_doc = tokenizer.tokenize(clean_doc)
tokenized_img = tokenizer.tokenize(clean_img)

In [24]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

average_vec_doc=get_average_word2vec(tokenized_doc,word2vec)
average_vec_img=get_average_word2vec(tokenized_img,word2vec)

In [25]:
filename = 'avg_vec.mdl'
news_clf = pickle.load(open(filename, 'rb'))

In [26]:
confidence_doc=news_clf.predict_proba([average_vec_doc])
confidence_img=news_clf.predict_proba([average_vec_img])
categories=['ARTS', 'ARTS & CULTURE', 'BLACK VOICES', 'BUSINESS', 'COLLEGE',
       'COMEDY', 'CRIME', 'CULTURE & ARTS', 'DIVORCE', 'EDUCATION',
       'ENTERTAINMENT', 'ENVIRONMENT', 'FIFTY', 'FOOD & DRINK',
       'GOOD NEWS', 'GREEN', 'HEALTHY LIVING', 'HOME & LIVING', 'IMPACT',
       'LATINO VOICES', 'MEDIA', 'MONEY', 'PARENTING', 'PARENTS',
       'POLITICS', 'QUEER VOICES', 'RELIGION', 'SCIENCE', 'SPORTS',
       'STYLE', 'STYLE & BEAUTY', 'TASTE', 'TECH', 'TRAVEL', 'WEDDINGS',
       'WEIRD NEWS', 'WELLNESS', 'WOMEN', 'WORLD NEWS', 'WORLDPOST']

res=pd.DataFrame(columns=['Categories', 'Confidence'],index=range(40))
pd.options.display.float_format = '{:.2f}%'.format
res['Categories']=categories
res['Confidence']=confidence_doc[0,:]*100

res

Unnamed: 0,Categories,Confidence
0,ARTS,0.94%
1,ARTS & CULTURE,0.55%
2,BLACK VOICES,1.33%
3,BUSINESS,7.16%
4,COLLEGE,0.46%
5,COMEDY,1.61%
6,CRIME,0.50%
7,CULTURE & ARTS,1.07%
8,DIVORCE,0.08%
9,EDUCATION,0.40%


In [27]:
prediction_doc=news_clf.predict([average_vec_doc])
prediction_doc[0]

'POLITICS'

In [28]:
prediction_img=news_clf.predict([average_vec_img])
prediction_img[0]

'BUSINESS'

In [31]:
conf=confidence_doc*0.75+confidence_img*0.25
conf

array([[0.01283173, 0.01781155, 0.01262159, 0.10912828, 0.00362053,
        0.01758421, 0.0056115 , 0.0115328 , 0.0007806 , 0.00418123,
        0.01735995, 0.01555056, 0.00218295, 0.01198729, 0.01262461,
        0.01114563, 0.02460078, 0.01629208, 0.02335077, 0.00196849,
        0.01261346, 0.05506082, 0.01484125, 0.00227819, 0.14140447,
        0.00317981, 0.0106992 , 0.01470305, 0.00994401, 0.00325948,
        0.04609305, 0.00396541, 0.02467936, 0.05409702, 0.01062906,
        0.01044955, 0.11851814, 0.00403936, 0.04280277, 0.08397541]])

In [37]:
categories[np.argmax(confidence_doc)]

'POLITICS'

In [35]:
confidence_doc

array([[0.00937816, 0.00552076, 0.01334466, 0.07164088, 0.00455365,
        0.01613356, 0.00497557, 0.01073036, 0.00084591, 0.00404534,
        0.019876  , 0.01879543, 0.00211127, 0.0144813 , 0.00099326,
        0.01361237, 0.03230748, 0.00936676, 0.02504588, 0.00215045,
        0.01421844, 0.00985106, 0.01667769, 0.00141867, 0.16797334,
        0.00388167, 0.01306042, 0.01916118, 0.01310555, 0.00330953,
        0.05286928, 0.00496727, 0.02664242, 0.04223118, 0.00207595,
        0.00422588, 0.1567152 , 0.00377317, 0.05540889, 0.10852415]])