In [1]:
# Sample article text that the cells below clean, turn into features and classify.
test_txt='''
Pupils get record GCSE grades as BTecs are pulled
GCSE passes for England's pupils, in the most disrupted academic year in UK history, have risen dramatically.

Grades have been awarded by schools, after exams were cancelled, and data shows 78.8% of papers were rated grade 4 or above. It was 69.9% in 2019.

There was a rise of a quarter in the top grades - a 7 or above, which is equivalent to an A in the old system.

The exams season has been dogged by chaos, with policy changes leading to grades being altered at the 11th hour.

In the latest debacle BTec grades were pulled hours before pupils were to receive them although some schools are giving out grades, which were assessed by schools, anyway.

And universities are still waiting for pupils' adjusted A-level results, while they attempt to squeeze as many as possible into the courses they have qualified for.

Universities and the government have now agreed to honour all degree places - this year or next - to students who have obtained the right grades, but there are concerns about the funding of these.

'''

In [2]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import re
from nltk.stem.snowball import SnowballStemmer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from sklearn.linear_model import SGDClassifier
import logging
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
# English stopwords from NLTK, held in a set; black_txt tests tokens against it.
stop_words_ = set(stopwords.words('english'))
# WordNet lemmatizer used by clean_txt.
wn = WordNetLemmatizer()

# Extra tokens that black_txt rejects in addition to the NLTK stopwords.
my_sw = ['make', 'amp',  'news','new' ,'time', 'u','s', 'photos',  'get', 'say']
# Punctuation characters, built once here instead of as a fresh list on every call.
_PUNCTUATION = frozenset(string.punctuation)


def black_txt(token):
    """Return True when `token` should be kept, False when it should be filtered out.

    A token is kept only if all of the following hold:
      * it is not in the module-level `stop_words_` collection,
      * it is not a single punctuation character,
      * it is longer than two characters,
      * it is not in the module-level `my_sw` list.

    `stop_words_` and `my_sw` are looked up at call time, so later changes to
    those globals are picked up.
    """
    # Every entry of string.punctuation is one character long, so the length
    # test below already rejects them; the explicit check is kept so the rule
    # survives if the length threshold is ever changed.
    return (
        token not in stop_words_
        and token not in _PUNCTUATION
        and len(token) > 2
        and token not in my_sw
    )
  
  
def clean_txt(text):
    """Normalise raw text into a space-separated string of filtered verb lemmas.

    Steps:
      1. Delete apostrophes outright, so "pupil's" becomes "pupils".
      2. Collapse every run of digits and non-word characters into one space.
      3. Lower-case and tokenise with `word_tokenize`.
      4. Drop tokens rejected by `black_txt`, lemmatise the rest as verbs, and
         drop any lemma that `black_txt` rejects as well.

    Returns the surviving lemmas joined by single spaces.
    """
    without_apostrophes = re.sub("'", "", text)
    normalised = re.sub("(\\d|\\W)+", " ", without_apostrophes)

    kept_lemmas = []
    for token in word_tokenize(normalised.lower()):
        if not black_txt(token):
            continue
        lemma = wn.lemmatize(token, pos="v")
        # Lemmatisation can turn an accepted token into a rejected one,
        # so the filter is applied a second time to the lemma.
        if black_txt(lemma):
            kept_lemmas.append(lemma)
    return " ".join(kept_lemmas)

In [4]:
# Normalise the sample article with clean_txt; the result is tokenised in a later cell.
clean_text=clean_txt(test_txt)

In [5]:
import pickle
def save_obj(obj, name):
    """Pickle `obj` into the file `name + '.pkl'`.

    The file is opened in binary write mode (overwriting any existing file)
    and the object is serialised with the highest pickle protocol available.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as out_file:
        pickle.dump(obj, out_file, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from the file `name + '.pkl'`.

    The file is opened in binary read mode and closed before returning.
    """
    source_path = name + '.pkl'
    # SECURITY: unpickling can execute arbitrary code; only load files you trust.
    with open(source_path, 'rb') as in_file:
        loaded = pickle.load(in_file)
    return loaded

In [6]:
# Load the pickled word -> index mapping from 'word2idx.pkl' in the working directory.
# NOTE(review): presumably the vocabulary built at training time — confirm it matches Bi_GRU.h5.
word2idx = load_obj('word2idx')

In [7]:
from nltk.tokenize import RegexpTokenizer
# Split the cleaned text into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

words=tokenizer.tokenize(clean_text)

In [8]:
# Map each token to its vocabulary index; tokens missing from word2idx are silently dropped.
indexes = [word2idx[word] for word in words if word in word2idx]
# Number of tokens that were found in the vocabulary.
len(indexes)

77

In [9]:
from keras.preprocessing.sequence import pad_sequences
# Force the index sequence to exactly 60 entries, filling with 29942 where it is shorter.
# NOTE(review): 29942 is presumably the padding index used at training time — confirm.
# NOTE(review): pad_sequences truncates from the front by default, so with the 77 indexes
# recorded for this article the first 17 tokens are discarded — confirm this matches training.
input_data = pad_sequences([indexes], maxlen=60, value=29942)
input_data

array([[24205, 19526, 21745, 11229, 22747, 21474, 27096, 11229,  8769,
        18844, 26219,  9013, 23689,  7658,  4374, 20514,  4362, 15176,
        11229,   773, 12596, 15078,  6636, 11229, 21335, 12599, 21878,
          782, 23531, 10946, 11229,  1527, 23531,  1136, 28044, 25507,
        28888,   306, 15369, 22493,  1633, 25232, 16282, 20682,  5961,
        21459, 28044, 11206,   518, 12483,  6804, 20321, 29722, 18293,
        25694, 18733, 22702, 11229,  5402, 10465]])

In [10]:
def polarity_txt(text):
    """Return the polarity component of TextBlob's sentiment for `text`.

    Polarity is the first field of the sentiment tuple.
    """
    return TextBlob(text).sentiment.polarity

def subj_txt(text):
    """Return the subjectivity component of TextBlob's sentiment for `text`.

    Subjectivity is the second field of the sentiment tuple.
    """
    return TextBlob(text).sentiment.subjectivity

def len_text(text):
    """Return the ratio of distinct cleaned tokens to whitespace-separated words.

    The numerator is the number of unique tokens in `clean_txt(text)`; the
    denominator is the number of words produced by `text.split()`.
    Returns 0 when `text` contains no words.
    """
    raw_words = text.split()
    if not raw_words:
        return 0
    distinct_tokens = set(clean_txt(text).split())
    return len(distinct_tokens) / len(raw_words)

In [11]:
# Hand-crafted features for the sample: TextBlob polarity and subjectivity, plus the
# ratio of unique cleaned tokens to whitespace-separated words.
polarity=polarity_txt(test_txt)
subj=subj_txt(test_txt)
length=len_text(test_txt)

In [12]:
# Single-row frame holding the three features, in the column order polarity, subj, length.
data ={'polarity':[polarity],
      'subj':[subj],
      'length':[length]}

df = pd.DataFrame (data, columns=['polarity','subj','length'])
df

Unnamed: 0,polarity,subj,length
0,0.216883,0.394156,0.356383


In [13]:
from keras.models import load_model
# Load the saved Keras model from 'Bi_GRU.h5' in the working directory.
model=load_model('Bi_GRU.h5')


In [14]:
# Run the model on two inputs: the index sequence reshaped to (1, 60) and the feature frame.
# NOTE(review): the hard-coded 60 must stay in sync with maxlen in the pad_sequences call.
res=model.predict([input_data[0].reshape(1,60),df])

In [15]:
# Category names indexed by position; the final cell looks up labels[np.argmax(res)].
# NOTE(review): the order presumably has to match the class order used when training — confirm.
labels=['ARTS', 'ARTS & CULTURE', 'BLACK VOICES', 'BUSINESS', 'COLLEGE',
       'COMEDY', 'CRIME', 'CULTURE & ARTS', 'DIVORCE', 'EDUCATION',
       'ENTERTAINMENT', 'ENVIRONMENT', 'FIFTY', 'FOOD & DRINK',
       'GOOD NEWS', 'GREEN', 'HEALTHY LIVING', 'HOME & LIVING', 'IMPACT',
       'LATINO VOICES', 'MEDIA', 'MONEY', 'PARENTING', 'PARENTS',
       'POLITICS', 'QUEER VOICES', 'RELIGION', 'SCIENCE', 'SPORTS',
       'STYLE', 'STYLE & BEAUTY', 'TASTE', 'TECH', 'TRAVEL', 'WEDDINGS',
       'WEIRD NEWS', 'WELLNESS', 'WOMEN', 'WORLD NEWS', 'WORLDPOST']

In [16]:
# Report the category whose position holds the highest value in the model output.
labels[np.argmax(res)]

'EDUCATION'