In [33]:
# Sample news article (headline followed by body paragraphs) used as the single
# input for the inference walkthrough in the cells below.
test_txt='''
Severe mental health problems rise amid pandemic

A small number of people (3.5%) saw an improvement in their symptoms.

But 13% of people surveyed had newly developed symptoms of "moderate to severe" depression over the survey period.

People under 40, women, people with a disability and those who said they would struggle to meet an unexpected cost of £850 were the groups most likely to show symptoms of depression.

Of those surveyed who were experiencing some level of depression, people reported feeling most bothered by feelings of stress or anxiety.

Dr Charley Baker, an associate professor of mental health at the University of Nottingham, said: "It's unsurprising to see these rates of low mood and depressive symptoms emerging...The people highlighted as struggling the most are those who are already more vulnerable to low mood, anxiety and poorer wellbeing.

"It's important though to avoid over-pathologising what might be seen as reasonable responses to the current pandemic," she said.

'''

In [2]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import re
from nltk.stem.snowball import SnowballStemmer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from sklearn.linear_model import SGDClassifier
import logging
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'nltk'

In [35]:
# English stop-word list from NLTK, held as a set; consulted by black_txt.
stop_words_ = set(stopwords.words('english'))
# WordNet lemmatizer instance; clean_txt calls wn.lemmatize(word, pos="v").
wn = WordNetLemmatizer()

# Extra tokens that black_txt rejects in addition to the NLTK stop words.
my_sw = ['make', 'amp',  'news','new' ,'time', 'u','s', 'photos',  'get', 'say']
def black_txt(token):
    """Decide whether ``token`` should be kept.

    Returns True only when the token is not in ``stop_words_``, is not one of
    the single characters in ``string.punctuation``, is longer than two
    characters, and is not listed in ``my_sw``. Returns False otherwise.
    """
    # Checks run in the same order as a left-to-right ``and`` chain.
    if token in stop_words_:
        return False
    if token in list(string.punctuation):
        return False
    if not len(token)>2:
        return False
    return token not in my_sw
  
  
def clean_txt(text):
  """Normalise ``text`` into a space-joined string of filtered verb lemmas.

  Steps, in order:
    1. delete apostrophes;
    2. replace every run of digits / non-word characters with one space;
    3. lower-case and tokenise with ``word_tokenize``;
    4. keep tokens accepted by ``black_txt`` and lemmatise them as verbs;
    5. filter the lemmas through ``black_txt`` once more;
    6. join the survivors with single spaces.
  """
  without_apostrophes = re.sub("'", "", text)
  normalized = re.sub("(\\d|\\W)+", " ", without_apostrophes)

  lemmas = []
  for raw_token in word_tokenize(normalized.lower()):
    if black_txt(raw_token):
      lemmas.append(wn.lemmatize(raw_token, pos="v"))

  # Second pass: a lemma can itself be a token that black_txt rejects.
  kept = filter(black_txt, lemmas)
  return " ".join(kept)

In [36]:
# Run the sample article through clean_txt to get its space-joined token string.
clean_text=clean_txt(test_txt)

In [37]:
import pickle
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    The file is opened in binary write mode (an existing file is overwritten)
    and the object is serialised with ``pickle.HIGHEST_PROTOCOL``.
    Returns None.
    """
    pickle_path = name + '.pkl'
    with open(pickle_path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from the file ``name + '.pkl'``.

    The file is opened in binary read mode and closed before returning.
    """
    pickle_path = name + '.pkl'
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(pickle_path, 'rb') as source:
        restored = pickle.load(source)
    return restored

In [38]:
# Load the pickled object stored in 'word2idx.pkl'; later cells use it as a
# token -> index lookup (presumably the vocabulary saved at training time —
# TODO confirm).
word2idx = load_obj('word2idx')

In [39]:
from nltk.tokenize import RegexpTokenizer
# Tokenizer that extracts runs of word characters (\w+).
tokenizer = RegexpTokenizer(r'\w+')

# Tokens of the cleaned sample article.
words=tokenizer.tokenize(clean_text)

In [40]:
# Map each token to its index; tokens that are not keys of word2idx are
# silently skipped.
indexes = [word2idx[word] for word in words if word in word2idx]
# Display how many tokens were found in the lookup.
len(indexes)

82

In [41]:
from keras.preprocessing.sequence import pad_sequences
# Build a one-row batch of length 60 from the index list. The recorded length
# of `indexes` (82) exceeds maxlen, so the sequence is truncated; Keras
# pad_sequences defaults to truncating='pre', i.e. it drops leading entries —
# TODO confirm this matches how the training data was prepared.
# NOTE(review): 29942 is the fill value for short sequences; presumably the
# padding index used at training time — confirm.
input_data = pad_sequences([indexes], maxlen=60, value=29942)
input_data

array([[19954, 19903, 29508, 19903,  7358, 29590, 25682, 16770, 27948,
         5892, 11452, 15504, 24205, 26191,  7001, 26021,  9132, 15369,
         7001, 19903, 22338,  9550,  3082,  9550, 25638,  1124,  1951,
         1543, 21080, 16865, 11986, 28045, 18544, 28180, 23742, 21745,
        15826, 17539,  7003, 26191,  8450, 19903, 12249, 25682,   766,
        28854, 15826, 17539,  1124, 20594, 29161, 13054, 26816,  1791,
        17068, 23742, 21840, 22465,  6333, 19480]])

In [42]:
def polarity_txt(text):
  """Return the polarity component (first field) of TextBlob's sentiment for ``text``."""
  blob = TextBlob(text)
  return blob.sentiment.polarity

def subj_txt(text):
  """Return the subjectivity component (second field) of TextBlob's sentiment for ``text``."""
  blob = TextBlob(text)
  return blob.sentiment.subjectivity

def len_text(text):
  """Ratio of distinct cleaned tokens to raw whitespace-separated words.

  The numerator is the number of unique tokens in ``clean_txt(text)``; the
  denominator is the number of whitespace-separated words in ``text``.
  Returns 0 when ``text`` contains no words.
  """
  raw_word_count = len(text.split())
  if raw_word_count == 0:
    return 0
  distinct_clean = set(clean_txt(text).split())
  return len(distinct_clean) / raw_word_count

In [43]:
# Compute the three scalar features for the sample article.
polarity=polarity_txt(test_txt)
subj=subj_txt(test_txt)
length=len_text(test_txt)
# The same three values as a plain list, in the order polarity, subj, length.
cat=[polarity, subj, length]


In [44]:
# One-row table of the scalar features; each value is wrapped in a list so it
# becomes a single row.
data ={'polarity':[polarity],
      'subj':[subj],
      'length':[length]}

# Column order is fixed explicitly via `columns`.
df = pd.DataFrame (data, columns=['polarity','subj','length'])
df

Unnamed: 0,polarity,subj,length
0,0.108824,0.523529,0.394904


In [45]:
from keras.models import load_model
# Load the saved Keras model from 'LSTM_with_Multi_Input.h5' (path is relative
# to the working directory).
model=load_model('LSTM_with_Multi_Input.h5')


In [46]:
# Run the two-input model on one sample: the padded index sequence as a
# one-row batch, plus the one-row feature frame.
# reshape(1, -1) lets NumPy infer the row length, so this line follows the
# maxlen used when padding instead of repeating a hard-coded 60.
res=model.predict([input_data[0].reshape(1, -1),df])

In [47]:
# Category names; the prediction cell indexes this list by position.
# NOTE(review): the order presumably mirrors the label encoding used when the
# model was trained — confirm, since a mismatch silently yields wrong names.
labels=['ARTS', 'ARTS & CULTURE', 'BLACK VOICES', 'BUSINESS', 'COLLEGE',
       'COMEDY', 'CRIME', 'CULTURE & ARTS', 'DIVORCE', 'EDUCATION',
       'ENTERTAINMENT', 'ENVIRONMENT', 'FIFTY', 'FOOD & DRINK',
       'GOOD NEWS', 'GREEN', 'HEALTHY LIVING', 'HOME & LIVING', 'IMPACT',
       'LATINO VOICES', 'MEDIA', 'MONEY', 'PARENTING', 'PARENTS',
       'POLITICS', 'QUEER VOICES', 'RELIGION', 'SCIENCE', 'SPORTS',
       'STYLE', 'STYLE & BEAUTY', 'TASTE', 'TECH', 'TRAVEL', 'WEDDINGS',
       'WEIRD NEWS', 'WELLNESS', 'WOMEN', 'WORLD NEWS', 'WORLDPOST']

In [48]:
# Look up the name at the position of the largest value in `res`; with no axis
# argument np.argmax indexes into the flattened array.
# NOTE(review): the recorded answer 'FIFTY' for a mental-health article looks
# off — worth checking label order and preprocessing against training.
labels[np.argmax(res)]

'FIFTY'