In [None]:
# The following code uses FastText official Tutorials and docs in https://fasttext.cc/

In [1]:
# install needed packages
import fasttext
import pandas as pd
import numpy as np
from sklearn.utils import shuffle


In [2]:
# Load datasets
def load_data(file_path):
    df = pd.read_json(file_path, lines = True)
    df['category'] = pd.Categorical(df['category'])
    df = df[df["short_description"] != ""]
    df['headlineDesc'] = df['headline']+ ' ' + df['short_description']
    return df[["headlineDesc", "category"]]

df = load_data('Data/News_Category_Dataset_v2.json')
df_orig = df
df.head()


Unnamed: 0,headlineDesc,category
0,There Were 2 Mass Shootings In Texas Last Week...,CRIME
1,Will Smith Joins Diplo And Nicky Jam For The 2...,ENTERTAINMENT
2,Hugh Grant Marries For The First Time At Age 5...,ENTERTAINMENT
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,ENTERTAINMENT
4,Julianna Margulies Uses Donald Trump Poop Bags...,ENTERTAINMENT


In [3]:
# Preprocessing
df = df_orig
column_names = ["category", "headlineDesc"]
df = df.reindex(columns=column_names)
num_records = df.size
print(num_records)
def shuffle_data(data):
    data = shuffle(data)
    print(data.head())
    data.reset_index(inplace=True, drop=True)
    return data
    
def fasttext_format(df_basic, per_train=0.7):
    def take_percent_rows(data, perc=0.7, headortail="head"):
        if(headortail=="head"):
            return data.head(int(len(data)*(perc)))
        else:
            return data.tail(int(len(data)*(perc)))
    def format_text(data):
        data['category_formatted'] ='__label__' + data['category'].astype(str)
        return data[['category_formatted', 'headlineDesc']]
        
    df_train = take_percent_rows(df_basic, perc=per_train, headortail="head")
    df_valid = take_percent_rows(df_basic, perc=(1-per_train), headortail="tail")
    
    df_train = format_text(df_train)
    df_valid = format_text(df_valid)
    
    return df_train, df_valid

df = shuffle_data(df)
print(df.head())
df_train, df_valid = fasttext_format(df)

print(len(df_train.index))
print(len(df_valid.index))

df_train.to_csv('Data/news_titles.train', header=None, index=None, sep='\t')
df_valid.to_csv('Data/news_titles.valid', header=None, index=None, sep='\t')


362282
              category                                       headlineDesc
21744   HEALTHY LIVING  Over One Third Of U.S. Adults Have Advanced Me...
175408   ENTERTAINMENT  Paul McCartney Says Yoko Ono Wasn't Responsibl...
4238     ENTERTAINMENT  Colin Firth’s Wife Reveals She Had An Affair W...
49571           COMEDY  Tim Kaine Blasts Donald Trump's Immigration Po...
4473          POLITICS  George W. Bush Reportedly Sounds Off On Trump:...
         category                                       headlineDesc
0  HEALTHY LIVING  Over One Third Of U.S. Adults Have Advanced Me...
1   ENTERTAINMENT  Paul McCartney Says Yoko Ono Wasn't Responsibl...
2   ENTERTAINMENT  Colin Firth’s Wife Reveals She Had An Affair W...
3          COMEDY  Tim Kaine Blasts Donald Trump's Immigration Po...
4        POLITICS  George W. Bush Reportedly Sounds Off On Trump:...
126798
54342


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['category_formatted'] ='__label__' + data['category'].astype(str)


In [5]:
# Load Processed Data
# df_formatted = pd.read_csv('Data/fasttextformatted.txt')
# df_formatted.head()

In [22]:
# Training
import time

start = time.time()
model = fasttext.train_supervised(input="Data/news_titles.train")
end = time.time()
print(end - start)

3.9161806106567383


In [7]:
# Testing
model.test("Data/news_titles.valid", k=1)

(54342, 0.7021824739612086, 0.7021824739612086)

In [8]:
model.predict("Artificial Intelligence is making huge steps in improving computer decision making", k=5)

(('__label__BUSINESS',
  '__label__POLITICS',
  '__label__TECH',
  '__label__WELLNESS',
  '__label__IMPACT'),
 array([0.36344281, 0.21214932, 0.10956781, 0.09298611, 0.05803214]))

In [15]:
# Training with Other Options
model_25_epochs = fasttext.train_supervised(input="Data/news_titles.train", epoch=25, lr=1.0, wordNgrams=2)
model_25_epochs.test("Data/news_titles.valid", k=5)

(54342, 0.18536307092120274, 0.9268153546060137)

In [16]:
model_25_epochs.test("Data/news_titles.valid", k=1)

(54342, 0.7327849545471274, 0.7327849545471274)

In [10]:
model_autotuned = fasttext.train_supervised(input='Data/news_titles.train', autotuneValidationFile='Data/news_titles.valid', autotuneDuration=600)

In [11]:
model_autotuned.test("Data/news_titles.valid")

(54342, 0.7242096352729013, 0.7242096352729013)

In [12]:
model_autotuned.test_label("Data/news_titles.valid")['__label__EDUCATION']

{'precision': 0.32460732984293195,
 'recall': nan,
 'f1score': 0.6492146596858639}

In [13]:
model_autotuned.test_label("Data/news_titles.valid")['__label__POLITICS']

{'precision': 0.7615439746083752, 'recall': nan, 'f1score': 1.5230879492167504}

In [14]:
model_autotuned.test_label("Data/news_titles.valid")['__label__COLLEGE']

{'precision': 0.3893805309734513, 'recall': nan, 'f1score': 0.7787610619469026}

In [4]:
model_autotuned2 = fasttext.train_supervised(input='Data/news_titles.train', autotuneValidationFile='Data/news_titles.valid', autotuneDuration=300, autotuneMetric="f1:__label__EDUCATION")

In [5]:
model_autotuned2.test("Data/news_titles.valid")

(54342, 0.719995583526554, 0.719995583526554)

In [7]:
model_autotuned2.test_label("Data/news_titles.valid")['__label__EDUCATION']

{'precision': 0.366120218579235, 'recall': nan, 'f1score': 0.73224043715847}

In [8]:
model_autotuned2.test_label("Data/news_titles.valid")['__label__POLITICS']

{'precision': 0.7612295590974953, 'recall': nan, 'f1score': 1.5224591181949907}

In [9]:
model_autotuned2.test_label("Data/news_titles.valid")['__label__COLLEGE']

{'precision': 0.3723849372384937, 'recall': nan, 'f1score': 0.7447698744769874}

In [19]:
model_25_epochs.test_label("Data/news_titles.valid")['__label__EDUCATION']

{'precision': 0.45255474452554745,
 'recall': nan,
 'f1score': 0.9051094890510949}

In [20]:
model_25_epochs.test_label("Data/news_titles.valid")['__label__POLITICS']

{'precision': 0.7536645351697, 'recall': nan, 'f1score': 1.5073290703394}

In [23]:
model.test_label("Data/news_titles.valid")['__label__EDUCATION']

{'precision': 0.16666666666666666,
 'recall': nan,
 'f1score': 0.3333333333333333}

In [24]:
model.test_label("Data/news_titles.valid")['__label__POLITICS']

{'precision': 0.7306756503836795, 'recall': nan, 'f1score': 1.461351300767359}