In [1]:
import sys
sys.path.append("../")
sys.path.append("../../../")

In [2]:
from features import build_features
from sklearn.model_selection import train_test_split
import pandas
import pickle
import os
from text_classification import text_encoding

Defining constants

In [3]:
DATA_PATH = "../data"

Loading raw dataset

In [4]:
applications = pickle.load(open(os.path.join(DATA_PATH, "compressed_data/category_dataset.pkl"), mode='rb'))

In [5]:
applications

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28


Splitting dataset

In [7]:
X, Y = applications.drop(columns=['category']), applications['category']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
x_test, x_validation, y_test, y_validation = train_test_split(x_train, y_train, test_size=0.4)

training_set = pandas.concat([x_train, y_train], axis=1)
validation_set = pandas.concat([x_validation, y_validation], axis=1)
testing_set = pandas.concat([x_test, y_test], axis=1)

Detecting anomalies

In [8]:
training_set.isna().sum()

link                 0
headline             0
short_description    0
authors              0
date                 0
ARTS                 0
ARTS & CULTURE       0
BLACK VOICES         0
BUSINESS             0
COLLEGE              0
COMEDY               0
CRIME                0
CULTURE & ARTS       0
DIVORCE              0
EDUCATION            0
ENTERTAINMENT        0
ENVIRONMENT          0
FIFTY                0
FOOD & DRINK         0
GOOD NEWS            0
GREEN                0
HEALTHY LIVING       0
HOME & LIVING        0
IMPACT               0
LATINO VOICES        0
MEDIA                0
MONEY                0
PARENTING            0
PARENTS              0
POLITICS             0
QUEER VOICES         0
RELIGION             0
SCIENCE              0
SPORTS               0
STYLE                0
STYLE & BEAUTY       0
TASTE                0
TECH                 0
THE WORLDPOST        0
TRAVEL               0
U.S. NEWS            0
WEDDINGS             0
WEIRD NEWS           0
WELLNESS   

In [9]:
validation_set.isna().sum()

link                 0
headline             0
short_description    0
authors              0
date                 0
ARTS                 0
ARTS & CULTURE       0
BLACK VOICES         0
BUSINESS             0
COLLEGE              0
COMEDY               0
CRIME                0
CULTURE & ARTS       0
DIVORCE              0
EDUCATION            0
ENTERTAINMENT        0
ENVIRONMENT          0
FIFTY                0
FOOD & DRINK         0
GOOD NEWS            0
GREEN                0
HEALTHY LIVING       0
HOME & LIVING        0
IMPACT               0
LATINO VOICES        0
MEDIA                0
MONEY                0
PARENTING            0
PARENTS              0
POLITICS             0
QUEER VOICES         0
RELIGION             0
SCIENCE              0
SPORTS               0
STYLE                0
STYLE & BEAUTY       0
TASTE                0
TECH                 0
THE WORLDPOST        0
TRAVEL               0
U.S. NEWS            0
WEDDINGS             0
WEIRD NEWS           0
WELLNESS   

In [10]:
testing_set.isna().sum()

link                 0
headline             0
short_description    0
authors              0
date                 0
ARTS                 0
ARTS & CULTURE       0
BLACK VOICES         0
BUSINESS             0
COLLEGE              0
COMEDY               0
CRIME                0
CULTURE & ARTS       0
DIVORCE              0
EDUCATION            0
ENTERTAINMENT        0
ENVIRONMENT          0
FIFTY                0
FOOD & DRINK         0
GOOD NEWS            0
GREEN                0
HEALTHY LIVING       0
HOME & LIVING        0
IMPACT               0
LATINO VOICES        0
MEDIA                0
MONEY                0
PARENTING            0
PARENTS              0
POLITICS             0
QUEER VOICES         0
RELIGION             0
SCIENCE              0
SPORTS               0
STYLE                0
STYLE & BEAUTY       0
TASTE                0
TECH                 0
THE WORLDPOST        0
TRAVEL               0
U.S. NEWS            0
WEDDINGS             0
WEIRD NEWS           0
WELLNESS   

Building Features

In [11]:
training_set = build_features.build_features(training_set)
validation_set = build_features.build_features(validation_set)
testing_set = build_features.build_features(testing_set)

Encoding training set using TF IDF Vectors

In [15]:
training_categories = training_set['category'].unique()
encoder = text_encoding.TFIDFVectorizedDataset(training_set)

for category in training_categories:
    encoder.encode_categorical_documents(category=category)
    print('encoded category: %s' % category)
training_set = encoder.get_dataframe()

encoded category: WORLDPOST
encoded category: COMEDY
encoded category: STYLE & BEAUTY
encoded category: WELLNESS
encoded category: SCIENCE
encoded category: POLITICS
encoded category: SPORTS
encoded category: DIVORCE
encoded category: PARENTING
encoded category: HEALTHY LIVING
encoded category: STYLE
encoded category: BUSINESS
encoded category: ENTERTAINMENT
encoded category: RELIGION
encoded category: QUEER VOICES
encoded category: WEIRD NEWS
encoded category: TECH
encoded category: TRAVEL
encoded category: WOMEN
encoded category: PARENTS
encoded category: U.S. NEWS
encoded category: WORLD NEWS
encoded category: THE WORLDPOST
encoded category: CULTURE & ARTS
encoded category: IMPACT
encoded category: HOME & LIVING
encoded category: FOOD & DRINK
encoded category: TASTE
encoded category: CRIME
encoded category: FIFTY
encoded category: GOOD NEWS
encoded category: ARTS & CULTURE
encoded category: WEDDINGS
encoded category: BLACK VOICES
encoded category: EDUCATION
encoded category: MEDIA
e

Encoding testing set using TF/IDF Vectors

In [16]:
testing_categories = testing_set['category'].unique()
encoder = text_encoding.TFIDFVectorizedDataset(testing_set)

for category in testing_categories:
    encoder.encode_categorical_documents(category=category)
    print('encoded category: %s' % category)
testing_set = encoder.get_dataframe()

encoded category: SCIENCE
encoded category: POLITICS
encoded category: STYLE & BEAUTY
encoded category: DIVORCE
encoded category: TASTE
encoded category: ARTS
encoded category: WOMEN
encoded category: ENTERTAINMENT
encoded category: WELLNESS
encoded category: RELIGION
encoded category: CRIME
encoded category: FOOD & DRINK
encoded category: TRAVEL
encoded category: HEALTHY LIVING
encoded category: WORLD NEWS
encoded category: PARENTING
encoded category: COMEDY
encoded category: BUSINESS
encoded category: QUEER VOICES
encoded category: IMPACT
encoded category: COLLEGE
encoded category: BLACK VOICES
encoded category: ENVIRONMENT
encoded category: SPORTS
encoded category: HOME & LIVING
encoded category: GREEN
encoded category: WEDDINGS
encoded category: FIFTY
encoded category: WORLDPOST
encoded category: STYLE
encoded category: MONEY
encoded category: PARENTS
encoded category: THE WORLDPOST
encoded category: LATINO VOICES
encoded category: WEIRD NEWS
encoded category: GOOD NEWS
encoded cat

Encoding validation set using TF/IDF Vectors

In [17]:
validation_categories = validation_set['category'].unique()
encoder = text_encoding.TFIDFVectorizedDataset(validation_set)

for category in testing_categories:
    encoder.encode_categorical_documents(category=category)
    print('encoded category: %s' % category)
validation_set = encoder.get_dataframe()

encoded category: SCIENCE
encoded category: POLITICS
encoded category: STYLE & BEAUTY
encoded category: DIVORCE
encoded category: TASTE
encoded category: ARTS
encoded category: WOMEN
encoded category: ENTERTAINMENT
encoded category: WELLNESS
encoded category: RELIGION
encoded category: CRIME
encoded category: FOOD & DRINK
encoded category: TRAVEL
encoded category: HEALTHY LIVING
encoded category: WORLD NEWS
encoded category: PARENTING
encoded category: COMEDY
encoded category: BUSINESS
encoded category: QUEER VOICES
encoded category: IMPACT
encoded category: COLLEGE
encoded category: BLACK VOICES
encoded category: ENVIRONMENT
encoded category: SPORTS
encoded category: HOME & LIVING
encoded category: GREEN
encoded category: WEDDINGS
encoded category: FIFTY
encoded category: WORLDPOST
encoded category: STYLE
encoded category: MONEY
encoded category: PARENTS
encoded category: THE WORLDPOST
encoded category: LATINO VOICES
encoded category: WEIRD NEWS
encoded category: GOOD NEWS
encoded cat

Removing redundant features

In [18]:
for dataset in [training_set, validation_set, testing_set]:
    dataset.drop(
        columns=[
            "link",
            "authors",
            "date",
            "label",
            "category",
        ],
        inplace=True
    )

Visualizing datasets

In [19]:
training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146668 entries, 0 to 146667
Data columns (total 90 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   ARTS                     146668 non-null  bool   
 1   ARTS & CULTURE           146668 non-null  bool   
 2   BLACK VOICES             146668 non-null  bool   
 3   BUSINESS                 146668 non-null  bool   
 4   COLLEGE                  146668 non-null  bool   
 5   COMEDY                   146668 non-null  bool   
 6   CRIME                    146668 non-null  bool   
 7   CULTURE & ARTS           146668 non-null  bool   
 8   DIVORCE                  146668 non-null  bool   
 9   EDUCATION                146668 non-null  bool   
 10  ENTERTAINMENT            146668 non-null  bool   
 11  ENVIRONMENT              146668 non-null  bool   
 12  FIFTY                    146668 non-null  bool   
 13  FOOD & DRINK             146668 non-null  bool   
 14  GOOD

In [20]:
validation_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58668 entries, 0 to 58667
Data columns (total 90 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ARTS                     58668 non-null  bool   
 1   ARTS & CULTURE           58668 non-null  bool   
 2   BLACK VOICES             58668 non-null  bool   
 3   BUSINESS                 58668 non-null  bool   
 4   COLLEGE                  58668 non-null  bool   
 5   COMEDY                   58668 non-null  bool   
 6   CRIME                    58668 non-null  bool   
 7   CULTURE & ARTS           58668 non-null  bool   
 8   DIVORCE                  58668 non-null  bool   
 9   EDUCATION                58668 non-null  bool   
 10  ENTERTAINMENT            58668 non-null  bool   
 11  ENVIRONMENT              58668 non-null  bool   
 12  FIFTY                    58668 non-null  bool   
 13  FOOD & DRINK             58668 non-null  bool   
 14  GOOD NEWS             

In [21]:
testing_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88000 entries, 0 to 87999
Data columns (total 90 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ARTS                     88000 non-null  bool   
 1   ARTS & CULTURE           88000 non-null  bool   
 2   BLACK VOICES             88000 non-null  bool   
 3   BUSINESS                 88000 non-null  bool   
 4   COLLEGE                  88000 non-null  bool   
 5   COMEDY                   88000 non-null  bool   
 6   CRIME                    88000 non-null  bool   
 7   CULTURE & ARTS           88000 non-null  bool   
 8   DIVORCE                  88000 non-null  bool   
 9   EDUCATION                88000 non-null  bool   
 10  ENTERTAINMENT            88000 non-null  bool   
 11  ENVIRONMENT              88000 non-null  bool   
 12  FIFTY                    88000 non-null  bool   
 13  FOOD & DRINK             88000 non-null  bool   
 14  GOOD NEWS             

In [24]:
import numpy

def set_datatypes(dataset: pandas.DataFrame):
    """
    Function sets appropriate datatypes 
    for the dataset fields
    """
    dataset['average_headline_len'] = dataset['average_headline_len'].astype(numpy.float16)
    dataset['average_description_len'] = dataset['average_description_len'].astype(numpy.float16)
    dataset['headline_len'] = dataset['headline_len'].astype(numpy.int16)
    dataset['description_len'] = dataset['description_len'].astype(numpy.int16)

set_datatypes(training_set)
set_datatypes(validation_set)
set_datatypes(testing_set)

Saving datasets

In [25]:
training_set.to_csv(os.path.join(DATA_PATH, 'processed_data/training_set.csv'))
validation_set.to_csv(os.path.join(DATA_PATH, 'processed_data/validation_set.csv'))
testing_set.to_csv(os.path.join(DATA_PATH, 'processed_data/testing_set.csv'))