<a href="https://colab.research.google.com/github/Horcruxno13/Multi-Class-Text-Classification-using-BERT-and-PyTorch/blob/main/News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the dataset stored under "My Drive" can be read below.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the news dataset from Drive; lines=True makes pandas parse the file as one JSON object per line.
news_df = pd.read_json('/content/drive/My Drive/Datasets/News_Category_Dataset_v3.json', lines = True)

In [None]:
# Preview the first rows to check which columns were loaded.
news_df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [None]:
# Number of distinct category labels in the raw data (missing values are not counted).
# nunique() gives the same figure as grouping by the column and counting the groups,
# without building the intermediate group table.
news_df["category"].nunique()

42

In [None]:
# Fold the "THE WORLDPOST" label into "WORLDPOST" so both are treated as a single category.
# Series.replace matches whole values (not substrings), so every other label is left unchanged.
# Bracket assignment is used because attribute-style assignment only works for columns that
# already exist and whose names do not clash with DataFrame attributes.
news_df["category"] = news_df["category"].replace("THE WORLDPOST", "WORLDPOST")

In [None]:
# Report how many distinct category labels the frame currently holds.
n_categories = news_df.category.nunique()
print(f"The dataset contains { n_categories } unique categories")

The dataset contains 41 unique categories


In [None]:
from sklearn.preprocessing import LabelEncoder
# Turn each category name into an integer id stored in a new 'categoryEncoded' column.
# The fitted `encoder` is kept at module level; it holds the id -> name mapping
# (encoder.classes_ / encoder.inverse_transform), which is not stored in the DataFrame itself.
encoder = LabelEncoder()
news_df['categoryEncoded'] = encoder.fit_transform(news_df['category'])

In [None]:
# Prepend the headline to the description so both texts go through the cleaning step together.
# The explicit space matters: without it the last word of the headline and the first word of
# the description are glued into one meaningless token (e.g. "...boosters" + "health...").
# NOTE: this overwrites 'short_description', so re-running the cell prepends the headline again;
# reload news_df before re-running.
news_df['short_description'] = news_df['headline'] + ' ' + news_df['short_description']

In [None]:
import string, re, nltk
from string import punctuation
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
import spacy
from nltk.stem import WordNetLemmatizer
# Download the NLTK stop-word corpus (see the log output below this cell).
nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` from the imported NLTK corpus object to a plain set of
# English stop words; remove_stopwords() does its membership tests against this set.
stopwords = set(stopwords.words('english'))
# spaCy English pipeline used only for lemmatization, loaded with the 'parser' and 'ner'
# components disabled.
spacy_lemmatizer = spacy.load("en_core_web_sm", disable = ['parser', 'ner'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def convert_to_lowercase(text):
    """Return a copy of ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

def remove_whitespace(text):
    """Return ``text`` without leading or trailing whitespace.

    Whitespace inside the string is left untouched.
    """
    trimmed = text.lstrip().rstrip()
    return trimmed

def remove_punctuation(text):
    """Delete ASCII punctuation characters from ``text``, except the apostrophe.

    The apostrophe is spared so that contractions such as "don't" stay intact.
    """
    # Every character of string.punctuation other than the apostrophe.
    to_delete = "".join(ch for ch in string.punctuation if ch != "'")
    deletion_table = str.maketrans("", "", to_delete)
    return text.translate(deletion_table)

def remove_html(text):
    """Strip every ``<...>`` span (HTML-tag-shaped text) from ``text``."""
    # Non-greedy so each tag is matched on its own instead of swallowing the text between tags.
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, '', text)

def remove_emoji(text):
    """Remove emoji and pictographic symbols from ``text``.

    Only the explicit symbol blocks listed below are stripped. An earlier form of this
    pattern used the single range U+24C2..U+1F251, which also covers CJK ideographs, kana,
    Hangul and many other scripts, so ordinary non-Latin text was deleted along with the
    emoji. The ranges here keep the pictographic parts of that span and leave letters alone.
    The list is not an exhaustive emoji definition.

    Args:
        text: Input string.

    Returns:
        ``text`` with every run of matching characters removed.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
        "\U0001F200-\U0001F251"  # enclosed ideographic supplement
        "\U00002600-\U000026FF"  # miscellaneous symbols (sun, umbrella, ...)
        "\U00002702-\U000027B0"  # dingbats
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (star, heavy arrows)
        "\U000024C2"             # circled latin capital letter M
        "\U0000FE0F"             # variation selector-16 that trails many emoji
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r'', text)

def remove_http(text):
    """Remove URLs from ``text``.

    A URL is either an ``http://`` / ``https://`` link or a bare ``www.`` link; in both cases
    the match runs up to the next whitespace character. The word "http" on its own is kept.

    Args:
        text: Input string.

    Returns:
        ``text`` with every matched URL replaced by the empty string.
    """
    # Raw string: "\S" and "\." are regex escapes, not Python string escapes. In a normal
    # string literal they are invalid escape sequences and trigger a warning at compile time.
    url_pattern = r"(https?://\S+|www\.\S+)"
    return re.sub(url_pattern, "", text)

def remove_stopwords(text):
    """Drop stop words from ``text`` and join the remaining tokens with single spaces.

    Tokens are maximal runs of word characters and apostrophes. A token is dropped when it is
    a member of the module-level ``stopwords`` set; the comparison is exact (case-sensitive).

    Args:
        text: Input string.

    Returns:
        The surviving tokens joined by one space each.
    """
    # Raw pattern avoids the invalid "\w" string escape. re.findall yields the same tokens as
    # a non-gap RegexpTokenizer for this pattern, without building a tokenizer on every call.
    tokens = re.findall(r"[\w']+", text)
    return " ".join(token for token in tokens if token not in stopwords)

def discard_non_alpha(text):
    """Keep only the purely alphabetic tokens of ``text``, joined with single spaces.

    Tokens are maximal runs of word characters and apostrophes. Any token for which
    ``str.isalpha()`` is false is dropped: numbers, alphanumeric mixes, tokens containing an
    underscore, and tokens containing an apostrophe (so contractions are discarded here).

    Args:
        text: Input string.

    Returns:
        The alphabetic tokens joined by one space each.
    """
    # Raw pattern avoids the invalid "\w" string escape. re.findall yields the same tokens as
    # a non-gap RegexpTokenizer for this pattern, without building a tokenizer on every call.
    tokens = re.findall(r"[\w']+", text)
    return " ".join(token for token in tokens if token.isalpha())

def text_lemmatizer(text):
    """Replace each token of ``text`` with its lemma and join the lemmas with single spaces.

    Tokenization and lemmas come from the module-level ``spacy_lemmatizer`` pipeline.
    """
    doc = spacy_lemmatizer(text)
    lemmas = [token.lemma_ for token in doc]
    return " ".join(lemmas)

In [None]:
def cleaning(text):
    """Normalize one raw text string into a space-separated sequence of lemmas.

    Steps, in order:
      1. lowercase and trim surrounding whitespace;
      2. turn newlines into spaces so the text is a single line;
      3. strip HTML tags, ``[...]`` spans (brackets and their content) and URLs;
      4. strip punctuation (apostrophes are kept) and emoji;
      5. drop stop words, then drop every token that is not purely alphabetic;
      6. lemmatize what is left.

    Markup and URLs are removed before punctuation on purpose: remove_punctuation() deletes
    ``<``, ``>``, ``:``, ``/`` and ``.``, after which neither tags nor URLs can be recognized.
    Tags are removed before URLs so a link inside an attribute cannot swallow the closing tag.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lemmatized string.
    """
    text = convert_to_lowercase(text)
    text = remove_whitespace(text)
    # Replace (not delete) newlines, otherwise the words on either side are fused together.
    text = re.sub(r'\n', ' ', text)
    text = remove_html(text)
    text = re.sub(r'\[.*?\]', '', text)  # drop square-bracketed spans, content included
    text = remove_http(text)
    text = remove_punctuation(text)
    text = remove_emoji(text)
    text = remove_stopwords(text)
    text = discard_non_alpha(text)
    text = text_lemmatizer(text)
    return text

In [None]:
# Run the cleaning pipeline over every row of 'short_description', one string at a time.
# NOTE: the column is overwritten, so re-running this cell cleans already-cleaned text;
# reload news_df to start again from the raw data.
news_df["short_description"] = news_df["short_description"].apply(cleaning)

In [None]:
# Preview the frame again to inspect the cleaned text and the new 'categoryEncoded' column.
news_df.head()

Unnamed: 0,link,headline,category,short_description,authors,date,categoryEncoded
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,million americans roll sleeve omicrontargete c...,"Carla K. Johnson, AP",2022-09-23,34
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,american airlines flyer charge ban life punch ...,Mary Papenfuss,2022-09-23,34
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,funniest tweet cat dog week sept dog understan...,Elyse Wanshel,2022-09-23,5
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,funniest tweet parent week sept put grownup to...,Caroline Bologna,2022-09-23,22
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,woman call cop black birdwatcher lose lawsuit ...,Nina Golgowski,2022-09-22,34


In [None]:
# Keep only the two columns that get exported: the cleaned text and its integer label.
export_columns = ['short_description', 'categoryEncoded']
cleaned_df = news_df[export_columns]

In [None]:
# Write the cleaned text/label pairs to Drive as CSV.
# NOTE(review): with default arguments to_csv also writes the DataFrame index as an unnamed
# first column, so readers of this file get an extra column unless they pass index_col=0.
# Confirm what the consumers of this CSV expect before switching to index=False.
cleaned_df.to_csv('/content/drive/My Drive/Datasets/News_Category_Dataset.csv')