In [1]:
import sys
# Extend the module search path with two ancestor directories, presumably so
# the project-local packages imported in the next cell (`features`,
# `text_classification`) can be found — confirm where those packages live.
# NOTE(review): these are relative entries, so they depend on the kernel's
# working directory being this notebook's folder.
sys.path.append("../")
sys.path.append("../../../")

In [2]:
from features import build_features
from sklearn.model_selection import train_test_split
import pandas
import pickle
import os
from text_classification import text_encoding

Defining constants

In [3]:
# Root data directory (relative path); joined with sub-paths below to read the
# raw pickle and to write the processed CSV files.
DATA_PATH = "../data"

Loading raw dataset

In [4]:
# Load the raw category dataset from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code during deserialization;
# only load this file if it comes from a trusted source.
dataset_path = os.path.join(DATA_PATH, "compressed_data/category_dataset.pkl")

# The context manager guarantees the file handle is closed after loading
# (the previous bare open() call left it open).
with open(dataset_path, mode='rb') as dataset_file:
    applications = pickle.load(dataset_file)

In [5]:
# Display the loaded dataset as the cell output for a quick visual check.
applications

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28


Splitting dataset

In [6]:
# Separate the target column ('category') from the remaining feature columns.
X, Y = applications.drop(columns=['category']), applications['category']

# Fixed seed so that a fresh "Restart & Run All" reproduces the same splits.
SPLIT_SEED = 42

# First split: 70% training, 30% held out for evaluation.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    X, Y, test_size=0.3, random_state=SPLIT_SEED
)

# Second split: divide the held-out 30% into testing (60%) and validation (40%).
# This must split the hold-out, NOT the training data; splitting x_train again
# would make the test and validation sets subsets of the training set and leak
# training rows into evaluation.
x_test, x_validation, y_test, y_validation = train_test_split(
    x_holdout, y_holdout, test_size=0.4, random_state=SPLIT_SEED
)

# Re-attach the labels so each split is a single frame with a 'category' column.
training_set = pandas.concat([x_train, y_train], axis=1)
validation_set = pandas.concat([x_validation, y_validation], axis=1)
testing_set = pandas.concat([x_test, y_test], axis=1)

Detecting anomalies (missing values)

In [7]:
# Count missing entries per column in the training split.
# isna() flags only NaN/None-like values, so empty strings are not counted.
training_na_mask = training_set.isna()
training_na_mask.sum()

link                 0
headline             0
short_description    0
authors              0
date                 0
category             0
dtype: int64

In [8]:
# Count missing entries per column in the validation split.
validation_na_mask = validation_set.isna()
validation_na_mask.sum()

link                 0
headline             0
short_description    0
authors              0
date                 0
category             0
dtype: int64

In [9]:
# Count missing entries per column in the testing split.
testing_na_mask = testing_set.isna()
testing_na_mask.sum()

link                 0
headline             0
short_description    0
authors              0
date                 0
category             0
dtype: int64

Building Features

In [10]:
# Run the project's feature builder over every split, in the order
# training -> validation -> testing, rebinding each name to its result.
training_set, validation_set, testing_set = (
    build_features.build_features(split)
    for split in (training_set, validation_set, testing_set)
)

Encoding training set using TF/IDF Vectors

In [11]:
# Distinct category labels present in the training split.
training_categories = training_set['category'].unique()
# Wrap the training split in the project's TF-IDF dataset encoder.
# NOTE(review): a separate encoder is constructed for each split in this
# notebook; confirm that the resulting feature columns are comparable across
# training, validation and testing data.
encoder = text_encoding.TFIDFVectorizedDataset(training_set)

# Encode the documents one category at a time; the arrays shown in the cell
# output presumably come from this call — confirm in text_encoding.
for category in training_categories:
    encoder.encode_categorical_documents(category=category)
# Replace the training split with the frame produced by the encoder.
training_set = encoder.get_dataframe()

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ...

Encoding testing set using TF/IDF Vectors

In [12]:
# Distinct category labels present in the testing split.
testing_categories = testing_set['category'].unique()
# Wrap the testing split in the project's TF-IDF dataset encoder
# (this rebinds `encoder`, discarding the previous instance).
encoder = text_encoding.TFIDFVectorizedDataset(testing_set)

# Encode the documents one category at a time.
for category in testing_categories:
    encoder.encode_categorical_documents(category=category)
# Replace the testing split with the frame produced by the encoder.
testing_set = encoder.get_dataframe()

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.48998941 0.         0.         ... 0.         0.         0.        ]
 [0.5224073  0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.19043701 0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.    

Encoding validation set using TF/IDF Vectors

In [13]:
# Distinct category labels present in the validation split.
validation_categories = validation_set['category'].unique()
# Wrap the validation split in the project's TF-IDF dataset encoder.
encoder = text_encoding.TFIDFVectorizedDataset(validation_set)

# Iterate over the VALIDATION categories (the earlier copy-pasted loop used
# testing_categories, which may not match the labels present in this split).
for category in validation_categories:
    encoder.encode_categorical_documents(category=category)
# Store the result in validation_set. Assigning it to testing_set would
# overwrite the encoded test data and leave validation_set un-encoded.
validation_set = encoder.get_dataframe()

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.25536517]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.29264992 0.         ... 0.         0.         0.        ]
 [0.         0.35199595 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
[[0.         0.38816474 0.         ... 0.41616686 0.         0.        ]
 [0.         0.         0.         ... 

Saving datasets

In [14]:
# Map each output path (relative to DATA_PATH) to the split written there.
processed_outputs = {
    'processed_data/training_set.csv': training_set,
    'processed_data/validation_set.csv': validation_set,
    'processed_data/testing_set.csv': testing_set,
}

# Write the splits in the same order as before: training, validation, testing.
for relative_path, split_frame in processed_outputs.items():
    split_frame.to_csv(os.path.join(DATA_PATH, relative_path))