In [1]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd

In [2]:
df = pd.read_csv("huffdata_clean_v2.csv")

In [48]:
df = df[['category', 'content']]

In [49]:
df['category'].value_counts()

category
POLITICS          32568
WELLNESS          16240
ENTERTAINMENT     15847
TRAVEL             8999
STYLE & BEAUTY     8340
PARENTING          7257
HEALTHY LIVING     6359
FOOD & DRINK       5733
QUEER VOICES       5479
BUSINESS           5113
SPORTS             4391
PARENTS            3652
BLACK VOICES       3638
COMEDY             3548
HOME & LIVING      3467
IMPACT             3330
THE WORLDPOST      3278
WORLD NEWS         3271
CRIME              3123
WOMEN              3112
WEDDINGS           2950
DIVORCE            2768
MEDIA              2544
WORLDPOST          2442
RELIGION           2370
WEIRD NEWS         2231
GREEN              2169
STYLE              2123
TASTE              2031
SCIENCE            1994
TECH               1749
MONEY              1612
ARTS               1484
U.S. NEWS          1371
FIFTY              1363
ARTS & CULTURE     1300
GOOD NEWS          1227
ENVIRONMENT        1220
COLLEGE             999
EDUCATION           994
LATINO VOICES       964
CULTURE

In [50]:
df_other = df[(df['category'] != "BUSINESS" )  & (df['category'] != "SPORTS") & (df['category'] != "ENTERTAINMENT")]

In [51]:
df_other.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160221 entries, 0 to 185571
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   category  160221 non-null  object
 1   content   160221 non-null  object
dtypes: object(2)
memory usage: 3.7+ MB


In [52]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.025, random_state=0)

In [53]:
sss.get_n_splits(df_other['content'],df_other['category'])

1

In [54]:
for train_index, test_index in sss.split(df_other['content'],df_other['category']):
    X_test = df['content'][test_index]
    y_test = df['category'][test_index]

In [55]:
X_test

126825    By now it's pretty clear that we really, reall...
150214    The Astor family's home once embodied the Amer...
72741     They say dog is man’s best friend and for one ...
150415    If you just need to take an island trip and le...
83558     First there was The Secret Life of Walter Mitt...
                                ...                        
22826     WASHINGTON ― The Justice Department announced ...
46991     WASHINGTON – Secretary of State John Kerry on ...
146176    Marcial Verdejo had no contact with his childr...
146485    Prince Harry stunned crowds with his daredevil...
51836     Hillary Clinton forcefully defended her suppor...
Name: content, Length: 4006, dtype: object

In [56]:
y_test

126825      FOOD & DRINK
150214     HOME & LIVING
72741          GOOD NEWS
150415            TRAVEL
83558              GREEN
               ...      
22826           POLITICS
46991           POLITICS
146176           DIVORCE
146485    STYLE & BEAUTY
51836           POLITICS
Name: category, Length: 4006, dtype: object

In [57]:
df_other_stratified = pd.DataFrame({'text' : X_test, 'labels' : y_test})

In [58]:
df_other_stratified

Unnamed: 0,text,labels
126825,"By now it's pretty clear that we really, reall...",FOOD & DRINK
150214,The Astor family's home once embodied the Amer...,HOME & LIVING
72741,They say dog is man’s best friend and for one ...,GOOD NEWS
150415,If you just need to take an island trip and le...,TRAVEL
83558,First there was The Secret Life of Walter Mitt...,GREEN
...,...,...
22826,WASHINGTON ― The Justice Department announced ...,POLITICS
46991,WASHINGTON – Secretary of State John Kerry on ...,POLITICS
146176,Marcial Verdejo had no contact with his childr...,DIVORCE
146485,Prince Harry stunned crowds with his daredevil...,STYLE & BEAUTY


In [59]:
df_other_stratified.describe()

Unnamed: 0,text,labels
count,4006,4006
unique,4006,42
top,"By now it's pretty clear that we really, reall...",POLITICS
freq,1,816


In [60]:
df_other_stratified['labels'].value_counts()

labels
POLITICS          816
ENTERTAINMENT     431
WELLNESS          249
HEALTHY LIVING    160
TRAVEL            159
QUEER VOICES      130
PARENTING         127
BUSINESS          110
FOOD & DRINK      103
SPORTS             97
THE WORLDPOST      93
STYLE & BEAUTY     92
COMEDY             90
WOMEN              87
WORLD NEWS         82
HOME & LIVING      73
CRIME              70
PARENTS            67
WORLDPOST          64
BLACK VOICES       62
MEDIA              61
IMPACT             59
GREEN              58
WEIRD NEWS         53
STYLE              52
RELIGION           51
WEDDINGS           50
DIVORCE            46
TECH               45
TASTE              39
SCIENCE            37
FIFTY              37
EDUCATION          32
ARTS & CULTURE     32
U.S. NEWS          32
ARTS               31
ENVIRONMENT        31
LATINO VOICES      31
GOOD NEWS          30
COLLEGE            18
CULTURE & ARTS     13
MONEY               6
Name: count, dtype: int64

In [61]:
df_other_stratified['labels'] = "OTHER"

In [62]:
df_other_stratified['labels'].value_counts()

labels
OTHER    4006
Name: count, dtype: int64

In [63]:
df_main = df[(df['category'] == "BUSINESS" )  | (df['category'] == "SPORTS") | (df['category'] == "ENTERTAINMENT")]

In [64]:
df_main = df_main.rename(columns={'category' : "labels", 'content' : 'text'})

In [65]:
df_main = df_main[['labels', 'text']]

In [66]:
df_main['labels'].value_counts()

labels
ENTERTAINMENT    15847
BUSINESS          5113
SPORTS            4391
Name: count, dtype: int64

In [67]:
df_train = pd.concat([df_other_stratified, df_main], ignore_index=True)

In [68]:
df_train['labels'].value_counts()

labels
ENTERTAINMENT    15847
BUSINESS          5113
SPORTS            4391
OTHER             4006
Name: count, dtype: int64

In [71]:
entertainment = df_train[df_train.labels.eq("ENTERTAINMENT")].sample(5000, random_state=0)
other = df_train[df_train.labels.eq("OTHER")]
business = df_train[df_train.labels.eq("BUSINESS")]
sports = df_train[df_train.labels.eq("SPORTS")]

In [72]:
df_under = pd.concat([entertainment, other, business, sports], ignore_index=True)

In [74]:
df_under['labels'].value_counts()

labels
BUSINESS         5113
ENTERTAINMENT    5000
SPORTS           4391
OTHER            4006
Name: count, dtype: int64

In [75]:
df_under.to_csv("huffdata_under18k.csv", index = False)