In [11]:
# install needed packages
import fasttext
import pandas as pd
import numpy as np
from sklearn.utils import shuffle


In [28]:
# Load datasets
def load_data(file_path):
    df = pd.read_json(file_path, lines = True)
    df['category'] = pd.Categorical(df['category'])
    df = df[df["short_description"] != ""]
    df['headlineDesc'] = df['headline']+ ' ' + df['short_description']
    return df[["headlineDesc", "category"]]

df = load_data('Data/News_Category_Dataset_v2.json')
df_orig = df
df.head()


Unnamed: 0,headlineDesc,category
0,There Were 2 Mass Shootings In Texas Last Week...,CRIME
1,Will Smith Joins Diplo And Nicky Jam For The 2...,ENTERTAINMENT
2,Hugh Grant Marries For The First Time At Age 5...,ENTERTAINMENT
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,ENTERTAINMENT
4,Julianna Margulies Uses Donald Trump Poop Bags...,ENTERTAINMENT


In [36]:
# Preprocessing
df = df_orig
column_names = ["category", "headlineDesc"]
df = df.reindex(columns=column_names)
num_records = df.size
print(num_records)
def shuffle_data(data):
    data = shuffle(data)
    print(data.head())
    data.reset_index(inplace=True, drop=True)
    return data
    
def fasttext_format(df_basic, per_train=0.7):
    def take_percent_rows(data, perc=0.7, headortail="head"):
        if(headortail=="head"):
            return data.head(int(len(data)*(perc)))
        else:
            return data.tail(int(len(data)*(perc)))
    def format_text(data):
        data['category_formatted'] ='__label__' + data['category'].astype(str)
        return data[['category_formatted', 'headlineDesc']]
        
    df_train = take_percent_rows(df_basic, perc=per_train, headortail="head")
    df_valid = take_percent_rows(df_basic, perc=(1-per_train), headortail="tail")
    
    df_train = format_text(df_train)
    df_valid = format_text(df_valid)
    
    return df_train, df_valid

df = shuffle_data(df)
print(df.head())
df_train, df_valid = fasttext_format(df)

print(len(df_train.index))
print(len(df_valid.index))

df_train.to_csv('Data/news_titles.train', header=None, index=None, sep='\t')
df_valid.to_csv('Data/news_titles.valid', header=None, index=None, sep='\t')


362282
         category                                       headlineDesc
15197    POLITICS  The Number Of Puerto Ricans Without Water Grew...
3483       COMEDY  Stephen Colbert Gets ‘Stupid Strong’ In Epic W...
165915   WELLNESS  Brain Stimulation, With Meds, Could Relieve De...
113392  WORLDPOST  Mass Urbanization Could Lead to Unprecedented ...
161151   WELLNESS  Epilepsy: What's The Condition That Caused Lil...
    category                                       headlineDesc
0   POLITICS  The Number Of Puerto Ricans Without Water Grew...
1     COMEDY  Stephen Colbert Gets ‘Stupid Strong’ In Epic W...
2   WELLNESS  Brain Stimulation, With Meds, Could Relieve De...
3  WORLDPOST  Mass Urbanization Could Lead to Unprecedented ...
4   WELLNESS  Epilepsy: What's The Condition That Caused Lil...
126798
54342


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['category_formatted'] ='__label__' + data['category'].astype(str)


In [16]:
# Load Processed Data
# df_formatted = pd.read_csv('Data/fasttextformatted.txt')
# df_formatted.head()

In [39]:
# Training
import time

start = time.time()
model = fasttext.train_supervised(input="Data/news_titles.train")
end = time.time()
print(end - start)

4.056777715682983


In [57]:
# Testing
model.test("Data/news_titles.valid", k=1)

(54342, 0.7037834455853668, 0.7037834455853668)

In [45]:
model.predict("Artificial Intelligence is making huge steps in improving computer decision making", k=5)

(('__label__BUSINESS',
  '__label__POLITICS',
  '__label__TECH',
  '__label__IMPACT',
  '__label__GREEN'),
 array([0.34070903, 0.23925512, 0.12437441, 0.06793961, 0.06319401]))

In [53]:
# Training with Other Options
model_25_epochs = fasttext.train_supervised(input="Data/news_titles.train", epoch=25, lr=1.0, wordNgrams=2)
model_25_epochs.test("Data/news_titles.valid", k=5)

(54342, 0.18538515328843252, 0.9269257664421626)

In [55]:
model_25_epochs.test("Data/news_titles.valid", k=1)

(54342, 0.7325273269294469, 0.7325273269294469)