In [None]:
!pip install transformers

import tensorflow as tf
import numpy as np
import sklearn
from sklearn.metrics import precision_score,balanced_accuracy_score,recall_score,f1_score
import transformers
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import json
#import matplotlib.pyplot as plt
import random
#import seaborn as sn

In [None]:
# Detect the TPU, connect to it, initialise it, and build a distribution
# strategy that train() later uses via `tpu_strategy.scope()`.
# NOTE(review): `tf.distribute.experimental.TPUStrategy` and
# `tf.data.experimental.AUTOTUNE` appear to have non-experimental aliases in
# newer TensorFlow releases - confirm the installed version before switching.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# Global batch size: 32 examples per replica times the number of replicas.
batch_size=32 * tpu_strategy.num_replicas_in_sync
print('Batch size:', batch_size)
# Passed as the prefetch buffer size in train_test_ds().
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
# Mount Google Drive under /content/drive; the dataset path used later in the
# notebook ('/content/drive/MyDrive/...') lives below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Module-level accumulators filled by read_jsondata(); the three lists stay
# index-aligned with each other and with the list that function returns.
labels,headlines,texts = [],[],[]

def read_jsondata(filename):
  """Load a JSON-lines news file, shuffle it, and fill the module-level lists.

  Each non-blank line of ``filename`` must be a JSON object with the keys
  ``category``, ``headline`` and ``short_description``.

  Side effects:
    Appends to the module-level ``labels``, ``headlines`` and ``texts``
    lists (``texts`` holds ``headline + ' ' + short_description``).
    Calling the function twice therefore accumulates both files.

  Returns:
    The list of parsed records, in the shuffled order that was used to
    fill the lists.

  Raises:
    json.JSONDecodeError: if a non-blank line is not valid JSON.
    KeyError: if a record lacks one of the required keys.
  """
  # The context manager guarantees the handle is closed even when a line
  # fails to parse; blank lines (e.g. a trailing newline) are skipped
  # instead of crashing json.loads.
  with open(filename, 'r', encoding='utf-8') as handle:
    data = [json.loads(line) for line in handle if line.strip()]
  # Shuffle once up front: later code takes the first 10% as validation data.
  random.shuffle(data)
  for record in data:
    labels.append(record['category'])
    headlines.append(record['headline'])
    texts.append(record['headline']+' '+record['short_description'])
  return data
## Driver: load the dataset from the mounted Drive and record its size.
# NOTE(review): the path is hard-coded to one Drive layout; consider moving it
# to a configuration cell.
filename = '/content/drive/MyDrive/News_Category_Dataset_v2.json'
data = read_jsondata(filename)
# Number of loaded records (headlines is filled by read_jsondata).
n_elements=len(headlines)

def aggregate_categories(labels):
    """Collapse fine-grained category names into broader merged categories.

    Args:
        labels: iterable of category name strings.

    Returns:
        A new list, same length and order as ``labels``, where every name
        belonging to a merge group is replaced by the group's name and every
        other name is passed through unchanged.
    """
    # merged name -> original names folded into it
    merge_groups = {
        'WORLD NEWS': ('WORLDPOST', 'THE WORLDPOST'),
        'PARENTS & PARENTING': ('PARENTING', 'PARENTS'),
        'ARTS & CULTURE': ('ARTS', 'CULTURE & ARTS'),
        'STYLE & BEAUTY': ('STYLE',),
        'ENVIRONMENT & GREEN': ('GREEN', 'ENVIRONMENT'),
        'WELLNESS & HEALTHY LIVING': ('HEALTHY LIVING', 'WELLNESS', 'HOME & LIVING'),
        'ENTERTAINMENT & FUN': ('COMEDY', 'ENTERTAINMENT', 'MEDIA', 'TRAVEL', 'WEDDINGS'),
        'FOOD, DRINK & TASTE': ('TASTE', 'FOOD & DRINK'),
        'COLLEGE & EDUCATION': ('COLLEGE', 'EDUCATION'),
        'SCIENCE & TECH': ('SCIENCE', 'TECH'),
        'BUSINESS & MONEY': ('BUSINESS', 'MONEY'),
        'IMPACT': ('BLACK VOICES', 'GOOD NEWS', 'LATINO VOICES', 'QUEER VOICES'),
    }
    # Invert into original name -> merged name for a single lookup per label.
    merged_name_of = {
        original: merged
        for merged, originals in merge_groups.items()
        for original in originals
    }
    return [merged_name_of.get(name, name) for name in labels]


## Build the label vocabulary from the merged category names.
aggregated=aggregate_categories(labels)
# Sorted unique names: a name's position in this list is its class index.
categories=sorted(list(set(aggregated)))
n_categories=len(categories)

def indicize_labels(labels, category_names=None):
    """Transform string labels into integer class indices.

    Args:
        labels: sequence of category name strings.
        category_names: optional sequence of unique category names whose
            positions define the indices. Defaults to the module-level
            ``categories`` list.

    Returns:
        A list with the index of each label, in input order. Labels that do
        not occur in the vocabulary are skipped, so the result can be shorter
        than ``labels``.
        NOTE(review): a skipped label silently breaks alignment with any
        parallel list of texts - confirm callers only pass known labels.
    """
    vocabulary = categories if category_names is None else category_names
    # One dict lookup per label instead of scanning the whole vocabulary for
    # every label. Assumes vocabulary entries are unique (the module-level
    # list is built from a set).
    index_of = {name: position for position, name in enumerate(vocabulary)}
    return [index_of[label] for label in labels if label in index_of]

def tokenize(modelname):
  """Tokenize the module-level ``texts`` and pair them with ``indices``.

  Args:
    modelname: name handed to ``AutoTokenizer.from_pretrained``.

  Returns:
    A ``tf.data.Dataset`` of ``(features, label_index)`` elements, where
    ``features`` is the dict form of the tokenizer output for ``texts``
    (padded, truncated, returned as TensorFlow tensors) and the label
    comes from the module-level ``indices`` list.
  """
  hf_tokenizer = AutoTokenizer.from_pretrained(modelname)
  encoded = hf_tokenizer(
      texts,
      padding=True,
      truncation=True,
      return_tensors='tf',
  )
  features = dict(encoded)
  return tf.data.Dataset.from_tensor_slices((features, indices))

## Encode the labels, then build the full tokenized dataset.
indices=indicize_labels(aggregated)
dataset = tokenize("bert-base-cased")

def train_test_ds():
  """Split the module-level ``dataset`` into batched train/validation sets.

  The first 10% of ``n_elements`` examples become the validation set and
  the rest the training set. Both are batched with ``batch_size`` and
  ``drop_remainder=True``; only the training set is prefetched.

  Returns:
    ``(train_ds, val_ds, val_data_size)`` where ``val_data_size`` is the
    number of examples taken for validation before batching.
  """
  n_val = int(0.1*n_elements)
  validation = dataset.take(n_val).batch(batch_size, drop_remainder=True)
  training = (
      dataset.skip(n_val)
      .batch(batch_size, drop_remainder=True)
      .prefetch(buffer_size=AUTOTUNE)
  )
  return training, validation, n_val

## Materialise the split used by the training and evaluation cells.
train_ds,val_ds,val_data_size = train_test_ds()


def train(modelname,train,val):
  """Build and compile a sequence classifier inside the TPU strategy scope.

  Despite its name this function performs no fitting; it only creates the
  model. ``train`` and ``val`` are accepted but not used in the body.

  Args:
    modelname: checkpoint name for
      ``TFAutoModelForSequenceClassification.from_pretrained``.
    train: unused.
    val: unused.

  Returns:
    The compiled model, with ``n_categories`` output labels.
  """
  with tpu_strategy.scope():
    classifier = TFAutoModelForSequenceClassification.from_pretrained(
        modelname, num_labels=n_categories)
    adam = tf.keras.optimizers.Adam(learning_rate=1e-5, clipnorm=1.)
    # The loss is configured with from_logits=True, i.e. it is fed raw logits.
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    tracked_metrics = [
        tf.metrics.SparseCategoricalAccuracy(),
        tf.keras.metrics.SparseTopKCategoricalAccuracy(k=3, name='Sparse_Top_3_Categorical_Accuracy'),
    ]
    classifier.compile(optimizer=adam, loss=cross_entropy, metrics=tracked_metrics)
  return classifier

##


# train() only builds and compiles the model; the actual fitting happens in
# the model.fit call below.
model = train('bert-base-cased',train_ds,val_ds)
model.fit(train_ds, validation_data=val_ds, epochs=10, verbose=1)
# Persist the fine-tuned weights to the local working directory.
model.save_weights('./saved_weights.h5')
# NOTE(review): this reloads the file that was written on the previous line,
# so it looks redundant in a straight top-to-bottom run - confirm it is only
# meant for resuming from a saved checkpoint.
model.load_weights('./saved_weights.h5')

# Module-level accumulators filled by val_data(); all four stay index-aligned.
val_labels,val_texts,val_headlines,val_description = [],[],[],[]
def val_data():
  """Copy the first ``val_data_size`` records of ``data`` into the val_* lists.

  For each record the category goes to ``val_labels``, the headline to
  ``val_headlines``, the short description to ``val_description`` and
  ``headline + ' ' + short_description`` to ``val_texts``.
  Returns nothing; the lists are extended in place.
  """
  for position in range(val_data_size):
    record = data[position]
    val_labels.append(record['category'])
    title = record['headline']
    val_headlines.append(title)
    blurb = record['short_description']
    val_description.append(blurb)
    val_texts.append(title+' '+blurb)
## Populate the validation lists.
val_data()

def predict(modelname,model):
  """Run ``model`` over the module-level ``val_texts``.

  Args:
    modelname: name handed to ``AutoTokenizer.from_pretrained``.
    model: object whose ``predict`` result exposes a ``logits`` attribute.

  Returns:
    ``(predictions, val_label)``: the argmax class index per text, and the
    class indices obtained by merging and indexing ``val_labels``.
  """
  true_indices = indicize_labels(aggregate_categories(val_labels))
  hf_tokenizer = AutoTokenizer.from_pretrained(modelname)
  encoded = hf_tokenizer(val_texts, padding=True, truncation=True, return_tensors='tf')
  output = model.predict(dict(encoded), verbose=1)
  class_probabilities = tf.nn.softmax(output.logits, axis=1).numpy()
  return class_probabilities.argmax(axis=1), true_indices
## Predict on the validation texts with the fine-tuned model.
modelname = 'bert-base-cased'
pred,val_label = predict(modelname,model)

def metrics():
  """Print validation scores for the module-level ``val_label`` and ``pred``.

  Prints balanced accuracy plus weighted precision, recall and F1 on a
  single line. Returns nothing.
  """
  # The confusion matrix is computed here but not used further in this function.
  confusion_matrix = tf.math.confusion_matrix(val_label, pred, num_classes=n_categories)
  scores = (
      balanced_accuracy_score(val_label,pred),
      precision_score(val_label, pred, average='weighted'),
      recall_score(val_label, pred, average='weighted'),
      f1_score(val_label, pred, average='weighted'),
  )
  print('Balanced Accuracy: {}, Precision:{}, Recall:{}, F1 score:{}.'.format(*scores))

## Report the validation scores.
metrics()


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Balanced Accuracy: 0.6759839833339639, Precision:0.7553249014774932, Recall:0.7551904406273339, F1 score:0.7538797109786758.


In [None]:
# Tokenizers already loaded by sample_txt(), keyed by model name.
_TOKENIZER_CACHE = {}

def sample_txt(txt,modelname):
  """Classify ``txt`` with the module-level ``model`` and return its category.

  Args:
    txt: text handed to the tokenizer (a string, or a list of strings of
      which only the first prediction is returned).
    modelname: name handed to ``AutoTokenizer.from_pretrained``.

  Returns:
    The entry of the module-level ``categories`` list at the argmax class
    index of the first prediction.
  """
  # This function is called once per article in a loop, so the tokenizer is
  # loaded only on the first call for a given model name and reused after.
  tokenizer = _TOKENIZER_CACHE.get(modelname)
  if tokenizer is None:
    tokenizer = AutoTokenizer.from_pretrained(modelname)
    _TOKENIZER_CACHE[modelname] = tokenizer
  tokens = tokenizer(txt, padding=True,truncation=True, return_tensors='tf')
  logits=model.predict(dict(tokens), verbose=1).logits
  prob=tf.nn.softmax(logits, axis=1).numpy()
  pred=np.argmax(prob, axis=1)
  return categories[pred[0]]


In [None]:
# txt = ['The Russian defence ministry said in a statement on Wednesday that its forces were regrouping near Kyiv and Chernihiv to focus on other areas in Ukraine and complete the “liberation” of the Donbas region in the east, as the war in Ukraine enters its sixth week.',
#         'treks that focus as much on culture as scenery; and new lodges and homestays for those who want to relax and get beneath the surface of Himalayan life.',
#         'Praising accomplishments, however small, will make them feel proud; letting kids do things independently will make them feel capable and strong.',
#         'Choose your words carefully and be compassionate. Let your kids know that everyone makes mistakes and that you still love them, even when you donot love their behavior.',
#         'greenhouse gas emission',
#        'Securities in the ban period under the F&O segment include companies in which the security has crossed 95 percent of the market-wide position limit.'
# ]

In [None]:
# for i in txt:
#   p = sample_txt(i,modelname)
#   print(p)
  

WORLD NEWS
ENTERTAINMENT & FUN
PARENTS & PARENTING
PARENTS & PARENTING
ENVIRONMENT & GREEN
BUSINESS & MONEY


In [None]:
!pip install newscatcher

In [None]:
from newscatcher import Newscatcher, describe_url
import re
from datetime import datetime
import pandas as pd

# Module-level accumulators filled by fetch_news(); kept index-aligned.
summary = []
published = []
def fetch_news():
    """Fetch current articles from the configured sites.

    For every article that has both a ``summary`` and a ``published`` entry,
    the two values are appended to the module-level ``summary`` and
    ``published`` lists together, so the lists always have equal length.
    Returns nothing.
    """
    websites = ['nytimes.com', 'theguardian.com'] #yahoo.com
    for site in websites:
      results = Newscatcher(website = site).get_news()
      # NOTE(review): newscatcher appears to return None when a site is
      # unsupported or its feed is empty - confirm. Skip such a site rather
      # than crash on subscripting None.
      if not results:
        continue
      for article in results.get('articles') or []:
        text = article.get('summary')
        stamp = article.get('published')
        # Append both or neither: an article missing one field must not
        # shift one list against the other.
        if text is None or stamp is None:
          continue
        summary.append(text)
        published.append(stamp)

##
fetch_news()


# Retained so that any code referencing these module-level names keeps
# working; pub_time() itself no longer needs them.
fmt = '%d%b%Y%H:%M:%S'
pat = r'\b[G]\w+'
# Parsed publication times, filled by pub_time().
pub_tim = []
def pub_time(dates=None):
  """Parse RFC 2822 style date strings into naive local-time datetimes.

  Args:
    dates: optional iterable of date strings such as
      ``'Mon, 04 Apr 2022 09:00:25 +0000'`` or ``'... GMT'``. Defaults to
      the module-level ``published`` list.

  Side effects:
    Appends one ``datetime`` per input to the module-level ``pub_tim``.
    Each value is converted to the system's local time zone and then made
    naive, so it can be subtracted from ``datetime.now()``.

  Raises:
    ValueError or TypeError (depending on the Python version) if a string
    cannot be parsed as a date.
  """
  from datetime import timezone
  from email.utils import parsedate_to_datetime

  source = published if dates is None else dates
  for stamp in source:
    parsed = parsedate_to_datetime(stamp)
    # A result without tzinfo carries no usable offset; treat it as UTC.
    if parsed.tzinfo is None:
      parsed = parsed.replace(tzinfo=timezone.utc)
    # Honour the UTC offset (positive or negative) instead of discarding it,
    # then express the instant as naive local time.
    pub_tim.append(parsed.astimezone().replace(tzinfo=None))

## Parse the publication timestamps, then capture the reference time.
pub_time()
# datetime.now() yields a naive datetime in local time; recency() subtracts
# each pub_tim entry from it.
now_time = datetime.now()

# Article ages in whole minutes, filled by recency().
time_diff = []
def recency():
  """Append the age of every ``pub_tim`` entry, in minutes, to ``time_diff``.

  The age is ``now_time - published_time`` converted to minutes and
  truncated to an int. Returns nothing.
  """
  for published_at in pub_tim:
    elapsed = now_time - published_at
    time_diff.append(int(elapsed.total_seconds() / 60))

## Compute the article ages.
recency()

In [None]:
# Cleaned article texts, filled by clean_content().
clean_txt = []
def clean_content(items=None):
  """Strip HTML markup and punctuation from article summaries.

  Args:
    items: optional iterable of raw summaries. Defaults to the module-level
      ``summary`` list. Each item is converted with ``str()`` first.

  Side effects:
    Appends one cleaned string per input to the module-level ``clean_txt``.
    In the cleaned string, whole HTML tags (opening and closing), HTML
    entities and every character outside ``[a-zA-Z0-9]`` are replaced by
    spaces.
  """
  import html

  source = summary if items is None else items
  for raw in source:
    # Remove complete tags, so '</p>' does not leave a stray 'p' behind and
    # attributes such as href="..." do not leak into the text.
    without_tags = re.sub(r"</?[A-Za-z][^>]*>", " ", str(raw))
    # Decode entities ('&amp;' -> '&') so they do not survive as 'amp'.
    decoded = html.unescape(without_tags)
    clean_txt.append(re.sub(r"[^a-zA-Z0-9]", " ", decoded))

## Clean every fetched summary; the results accumulate in clean_txt.
clean_content()



In [None]:
# Predicted category for each cleaned article, in the same order as clean_txt.
all_cat = []
# One sample_txt call (and therefore one model.predict call) per article.
for i in clean_txt:
  cat = sample_txt(i,modelname)
  all_cat.append(cat)




In [None]:
all_cat

In [None]:
# Per-article frequency of that article's predicted category; set by impact().
cat_count = []
def impact():
  """Build a per-article table with category frequency and its rank.

  Reads the module-level ``summary``, ``all_cat`` and ``time_diff`` lists,
  which must all have the same length.

  Side effects:
    Replaces the contents of the module-level ``cat_count`` with, for each
    article, the number of articles sharing its predicted category, so
    calling the function again does not accumulate stale counts.

  Returns:
    A DataFrame with the columns ``Content``, ``categories``, ``Age``,
    ``Count`` and ``inf`` (the rank of ``Count``).

  Raises:
    ValueError: if the three input lists differ in length.
  """
  from collections import Counter

  sizes = {'summary': len(summary), 'all_cat': len(all_cat), 'time_diff': len(time_diff)}
  if len(set(sizes.values())) > 1:
    raise ValueError('impact() inputs have mismatched lengths: {}'.format(sizes))

  # Count every category once instead of rescanning the list per article.
  frequency = Counter(all_cat)
  cat_count[:] = [frequency[category] for category in all_cat]

  df_impact = pd.DataFrame({
      'Content': summary,
      'categories': all_cat,
      'Age': time_diff,
      'Count': cat_count,
  })
  df_impact['inf'] = df_impact["Count"].rank()
  return df_impact
 ##
impact()

Unnamed: 0,Content,categories,Age,Count,inf
0,The apparent execution of Ukrainian civilians ...,WORLD NEWS,288,10,24.5
1,A local official said he was detained and inte...,WORLD NEWS,448,10,24.5
2,"Having failed to score a quick triumph, Russia...",WORLD NEWS,737,10,24.5
3,There is little sign that a recession is immin...,BUSINESS & MONEY,168,16,60.5
4,European companies that relied on Russian cust...,BUSINESS & MONEY,168,16,60.5
...,...,...,...,...,...
103,<p>Run out of cyan? Help could be at hand as t...,WELLNESS & HEALTHY LIVING,9977,17,77.0
104,<p>The technology behind autonomous vehicles h...,SCIENCE & TECH,12888,23,97.0
105,<p>He created the image file format that defin...,SCIENCE & TECH,16668,23,97.0
106,<p>Data brokers collect our personal data and ...,BUSINESS & MONEY,18668,16,60.5


In [None]:
len(time_diff)

0

In [None]:
time_diff

[]

In [None]:
# Sanity check: these three lists feed impact() and must have equal lengths.
len(summary), len(all_cat), len(time_diff)

In [None]:
# cat_count = []
# def impact():
#   for i in categories:
#     a = aggregated.count(i)
#     cat_count.append(a)

#   df_impact = pd.DataFrame()
#   df_impact['categories'] = categories
#   df_impact['Count'] = cat_count
#   df_impact['inf'] = df_impact["Count"].rank()

#  ##
# impact() 