
# NEWS category detector

This dataset contains around 200k news headlines from the year 2012 to 2018 obtained from HuffPost. The model trained on this dataset could be used to identify tags for untracked news articles or to identify the type of language used in different news articles.

https://www.kaggle.com/rmisra/news-category-dataset

In [1]:
import json
import pandas as pd
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import string
import pickle as pk

from ipywidgets import IntProgress
from IPython.display import display
import time
from time import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

## Data pre-processing
### Importing data and creating a parsed out database

In [50]:
df = pd.read_json('News_Category_Dataset_v2.json', lines=True)
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [51]:
# Check size of each category
df.groupby('category').size()

category
ARTS               1509
ARTS & CULTURE     1339
BLACK VOICES       4528
BUSINESS           5937
COLLEGE            1144
COMEDY             5175
CRIME              3405
CULTURE & ARTS     1030
DIVORCE            3426
EDUCATION          1004
ENTERTAINMENT     16058
ENVIRONMENT        1323
FIFTY              1401
FOOD & DRINK       6226
GOOD NEWS          1398
GREEN              2622
HEALTHY LIVING     6694
HOME & LIVING      4195
IMPACT             3459
LATINO VOICES      1129
MEDIA              2815
MONEY              1707
PARENTING          8677
PARENTS            3955
POLITICS          32739
QUEER VOICES       6314
RELIGION           2556
SCIENCE            2178
SPORTS             4884
STYLE              2254
STYLE & BEAUTY     9649
TASTE              2096
TECH               2082
THE WORLDPOST      3664
TRAVEL             9887
WEDDINGS           3651
WEIRD NEWS         2670
WELLNESS          17827
WOMEN              3490
WORLD NEWS         2177
WORLDPOST          2579
dtype: 

In [52]:
# Merging categories
df.category = df.category.map(lambda x: "ARTS & CULTURE" if x == "ARTS" else x)
df.category = df.category.map(lambda x: "ARTS & CULTURE" if x == "CULTURE & ARTS" else x)
df.category = df.category.map(lambda x: "STYLE & BEAUTY" if x == "STYLE" else x)
df.category = df.category.map(lambda x: "WORLDPOST" if x == "THE WORLDPOST" else x)
df.category = df.category.map(lambda x: "ENVIRONMENT" if x == "GREEN" else x)
df.category = df.category.map(lambda x: "PARENTS" if x == "PARENTING" else x)
df.category = df.category.map(lambda x: "EDUCATION" if x == "COLLEGE" else x)
df.category = df.category.map(lambda x: "HOME & LIVING" if x == "HEALTHY LIVING" else x)
df.category = df.category.map(lambda x: "FOOD & DRINK" if x == "TASTE" else x)
df.category = df.category.map(lambda x: "WORLD NEWS" if x == "WORLDPOST" else x)

# Remove categories to lower dataset size
df = df[df['category'] != 'COMEDY']
df = df[df['category'] != 'FIFTY']
df = df[df['category'] != 'BLACK VOICES']
df = df[df['category'] != 'IMPACT']
df = df[df['category'] != 'GOOD NEWS']
df = df[df['category'] != 'LATINO VOICES']
df = df[df['category'] != 'PARENTS']
df = df[df['category'] != 'QUEER VOICES']
df = df[df['category'] != 'WEIRD NEWS']

In [53]:
# Re-check size
df.groupby('category').size()

category
ARTS & CULTURE     3878
BUSINESS           5937
CRIME              3405
DIVORCE            3426
EDUCATION          2148
ENTERTAINMENT     16058
ENVIRONMENT        3945
FOOD & DRINK       8322
HOME & LIVING     10889
MEDIA              2815
MONEY              1707
POLITICS          32739
RELIGION           2556
SCIENCE            2178
SPORTS             4884
STYLE & BEAUTY    11903
TECH               2082
TRAVEL             9887
WEDDINGS           3651
WELLNESS          17827
WOMEN              3490
WORLD NEWS         8420
dtype: int64

In [54]:
df.shape

(162147, 6)

In [55]:
# merge headline and description
df['text'] = df['headline']+' '+df['short_description']
df.drop(['short_description','headline'],axis=1,inplace=True)

In [56]:
df

Unnamed: 0,category,authors,link,date,text
0,CRIME,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...
1,ENTERTAINMENT,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 5...
3,ENTERTAINMENT,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...
...,...,...,...,...,...
200848,TECH,"Reuters, Reuters",https://www.huffingtonpost.com/entry/rim-ceo-t...,2012-01-28,RIM CEO Thorsten Heins' 'Significant' Plans Fo...
200849,SPORTS,,https://www.huffingtonpost.com/entry/maria-sha...,2012-01-28,Maria Sharapova Stunned By Victoria Azarenka I...
200850,SPORTS,,https://www.huffingtonpost.com/entry/super-bow...,2012-01-28,"Giants Over Patriots, Jets Over Colts Among M..."
200851,SPORTS,,https://www.huffingtonpost.com/entry/aldon-smi...,2012-01-28,Aldon Smith Arrested: 49ers Linebacker Busted ...


In [57]:
# Load english stopwords
import nltk
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\laure\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
### TO BE COPIED IN DATA PREPARATION SCRIPT ###

# Parsing text + cleaning + stemm
def parse_out_text(all_text):
    # clean punctuation, make lower case and remove stopwords
    text_string = all_text.translate(str.maketrans("", "", string.punctuation)).split(" ")
    text_string = [word.lower() for word in text_string if word.lower() not in stopwords.words('english')]
    # Stemm text
    stemmer = SnowballStemmer("english")
    stemmed = [stemmer.stem(word) for  word in text_string]
    words = " ".join(stemmed) 
    return words

In [59]:
# Text example
text_test = df['text'].iloc[42]
text_test

'Harvey Weinstein Accusers Say They Never Thought He Would Be Arrested Some of the actresses who came forward with sexual assault claims against the once powerful Hollywood mogul responded with both surprise and relief.'

In [60]:
# Parsed text example
parse_out_text(text_test)

'harvey weinstein accus say never thought would arrest actress came forward sexual assault claim power hollywood mogul respond surpris relief'

In [61]:
# Resetting index (num rows = id)
df = df.reset_index()
df.head()

Unnamed: 0,index,category,authors,link,date,text
0,0,CRIME,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...
1,1,ENTERTAINMENT,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...
2,2,ENTERTAINMENT,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 5...
3,3,ENTERTAINMENT,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,4,ENTERTAINMENT,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...


In [62]:
# Create a column with parsed-out text and transform whole dataset

def create_parsed_df(extract_df):

    pd.options.mode.chained_assignment = None  # default='warn'

    extract_df['parsed_text'] = 'NaN'
    extract_df.drop(['index','authors','link','date'],axis=1, inplace=True)

    max_count = extract_df.shape[0]

    f = IntProgress(min=0, max=max_count) # instantiate the bar
    display(f) # display the bar

    for i in range(len(extract_df['text'])):
        f.value += 1
        item = parse_out_text(extract_df['text'].iloc[i])
        extract_df['parsed_text'][i] = item
    
    extract_df = extract_df[['parsed_text','category']]
    
    # dump dataset in a pickle file 
    with open("df_parsed_text.pkl", 'wb') as file:
        pk.dump(extract_df, file)
    
    extract_df.to_csv('data.csv')
    
    return extract_df

In [63]:
## TAKES SOME TIME AND REPLACES df_parsed_text.pkl -> DON'T TOUCH WITHOUT GOOD REASON !
create_parsed_df(df)

IntProgress(value=0, max=162147)

Unnamed: 0,parsed_text,category
0,2 mass shoot texa last week 1 tv left husband ...,CRIME
1,smith join diplo nicki jam 2018 world cup offi...,ENTERTAINMENT
2,hugh grant marri first time age 57 actor longt...,ENTERTAINMENT
3,jim carrey blast castrato adam schiff democrat...,ENTERTAINMENT
4,julianna marguli use donald trump poop bag pic...,ENTERTAINMENT
...,...,...
162142,rim ceo thorsten hein signific plan blackberri...,TECH
162143,maria sharapova stun victoria azarenka austral...,SPORTS
162144,giant patriot jet colt among improb super bow...,SPORTS
162145,aldon smith arrest 49er lineback bust dui corr...,SPORTS


### Dumping an extract of the full db

In [99]:
# Load full parsed dataset
with open("df_parsed_text.pkl", 'rb') as fid:
    df_full = pk.load(fid)
df_full

Unnamed: 0,parsed_text,category
0,2 mass shoot texa last week 1 tv left husband ...,CRIME
1,smith join diplo nicki jam 2018 world cup offi...,ENTERTAINMENT
2,hugh grant marri first time age 57 actor longt...,ENTERTAINMENT
3,jim carrey blast castrato adam schiff democrat...,ENTERTAINMENT
4,julianna marguli use donald trump poop bag pic...,ENTERTAINMENT
...,...,...
162142,rim ceo thorsten hein signific plan blackberri...,TECH
162143,maria sharapova stun victoria azarenka austral...,SPORTS
162144,giant patriot jet colt among improb super bow...,SPORTS
162145,aldon smith arrest 49er lineback bust dui corr...,SPORTS


In [108]:
# Reduce number of lines (123 GiB needed else...)
def reduce_df(df, fraction):
    df_reduced = pd.DataFrame(columns=['parsed_text','category'])
    for cat in set(df_full['category']):
        # Randomly sample x% of your dataframe
        df_reduced = df_reduced.append(df[df['category'] == cat].sample(frac=fraction))
    return df_reduced.reset_index()

df_parsed = reduce_df(df_full,0.1)
df_parsed

Unnamed: 0,index,parsed_text,category
0,155157,kathi griffin divorc comedian talk tattoo wed ...,DIVORCE
1,120377,live world need help liter stuck foyer two ent...,DIVORCE
2,101466,difficult part second marriag us betray past t...,DIVORCE
3,118563,divorc women truth husband 401k asset marriag ...,DIVORCE
4,157378,perfect divorc toolkit divorc toolkit use hamm...,DIVORCE
...,...,...,...
16211,75408,eight delici way day matzah shakshuka mani rec...,FOOD & DRINK
16212,101956,sanitari cute way dri silverwar form ceram eleph,FOOD & DRINK
16213,110899,eat tell peopl around photo gem need get away ...,FOOD & DRINK
16214,149754,make cocktail without recip finish touch top c...,FOOD & DRINK


## Transform data to numerical
### Reduce dataset size

In [109]:
df_parsed.drop('index',axis=1, inplace=True)
df_parsed.shape

(16216, 2)

In [110]:
# Check number of points per category
df_parsed.groupby('category').size()

category
ARTS & CULTURE     388
BUSINESS           594
CRIME              340
DIVORCE            343
EDUCATION          215
ENTERTAINMENT     1606
ENVIRONMENT        394
FOOD & DRINK       832
HOME & LIVING     1089
MEDIA              282
MONEY              171
POLITICS          3274
RELIGION           256
SCIENCE            218
SPORTS             488
STYLE & BEAUTY    1190
TECH               208
TRAVEL             989
WEDDINGS           365
WELLNESS          1783
WOMEN              349
WORLD NEWS         842
dtype: int64

In [111]:
# Check for NaN and drop them
print(df_parsed.isna().sum())
df_parsed = df_parsed.dropna()

parsed_text    0
category       0
dtype: int64


#### Check for most common words for a category

In [112]:
# Loop over all the words in all the texts and increment the counts in the appropriate counter objects
def count_most_common(cat):
    count_word = Counter()
    df_cat = df_parsed[df_parsed['category'] == cat]
    for article in df_cat['parsed_text']:
        for word in article.split(" "):
            count_word[word] += 1
    return count_word.most_common(100)

In [113]:
#count_most_common('TECH')

In [114]:
%%time
def keep_only_most_common(x):
    word_count = count_most_common(x['category'])
    word_list = [tpl[0] for tpl in word_count]
    keep_word = [word for word in x['parsed_text'].split(" ") if word in word_list]
    return " ".join(keep_word)

df_parsed['parsed_text'] = df_parsed.apply(keep_only_most_common, axis=1)
df_parsed

Wall time: 6min 7s


Unnamed: 0,parsed_text,category
0,divorc talk wed one like wed divorc,DIVORCE
1,live need help two one,DIVORCE
2,marriag us,DIVORCE
3,divorc women husband marriag divorc time work ...,DIVORCE
4,divorc divorc thing life divorc life,DIVORCE
...,...,...
16211,delici way day mani recip dish way make say be...,FOOD & DRINK
16212,way,FOOD & DRINK
16213,eat peopl photo need get,FOOD & DRINK
16214,make cocktail recip top cocktail,FOOD & DRINK


In [116]:
df_parsed = df_parsed[df_parsed['parsed_text'] != '']
df_parsed.head()

Unnamed: 0,parsed_text,category
0,divorc talk wed one like wed divorc,DIVORCE
1,live need help two one,DIVORCE
2,marriag us,DIVORCE
3,divorc women husband marriag divorc time work ...,DIVORCE
4,divorc divorc thing life divorc life,DIVORCE


### Extract features and labels

In [117]:
# List of features and labels
parsed_text = list(df_parsed['parsed_text'])
categories = list(df_parsed['category'])
set(categories)

{'ARTS & CULTURE',
 'BUSINESS',
 'CRIME',
 'DIVORCE',
 'EDUCATION',
 'ENTERTAINMENT',
 'ENVIRONMENT',
 'FOOD & DRINK',
 'HOME & LIVING',
 'MEDIA',
 'MONEY',
 'POLITICS',
 'RELIGION',
 'SCIENCE',
 'SPORTS',
 'STYLE & BEAUTY',
 'TECH',
 'TRAVEL',
 'WEDDINGS',
 'WELLNESS',
 'WOMEN',
 'WORLD NEWS'}

#### Features

In [118]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(parsed_text)
X = X.toarray()

In [119]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [120]:
words = vectorizer.get_feature_names()
words[100:110]

['care',
 'carpet',
 'case',
 'cast',
 'catch',
 'cathol',
 'caus',
 'celebr',
 'ceo',
 'challeng']

In [121]:
# dump classifier in a pickle file
with open("vectorizer.pkl", 'wb') as file:
    pk.dump(vectorizer, file)

In [122]:
print(X.shape)

(15552, 814)


#### Labels

In [123]:
# Convert categrory words into integers
word_to_int = dict((c, i) for i, c in enumerate(set(df_parsed['category'])))
labels = [word_to_int[cat] for cat in categories]
y = np.asarray(labels)
y

array([ 0,  0,  0, ..., 21, 21, 21])

In [124]:
int_to_word = {v: k for k, v in word_to_int.items()}
int_to_word

{0: 'DIVORCE',
 1: 'CRIME',
 2: 'MONEY',
 3: 'HOME & LIVING',
 4: 'WORLD NEWS',
 5: 'WELLNESS',
 6: 'RELIGION',
 7: 'WEDDINGS',
 8: 'TECH',
 9: 'ENVIRONMENT',
 10: 'ENTERTAINMENT',
 11: 'SPORTS',
 12: 'SCIENCE',
 13: 'WOMEN',
 14: 'EDUCATION',
 15: 'TRAVEL',
 16: 'POLITICS',
 17: 'BUSINESS',
 18: 'MEDIA',
 19: 'STYLE & BEAUTY',
 20: 'ARTS & CULTURE',
 21: 'FOOD & DRINK'}

In [125]:
# dump categories in a pickle file
with open("categ.pkl", 'wb') as file:
    pk.dump((int_to_word), file)

## Machine learning
### Genetrate train and test datasets

In [126]:
# create train test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=None)

In [127]:
# dump the sets to pkl files in a pickle file
with open("X_train.pkl", 'wb') as file:
    pk.dump((X_train), file)
    
with open("X_test.pkl", 'wb') as file:
    pk.dump((X_test), file)

with open("y_train.pkl", 'wb') as file:
    pk.dump((y_train), file)
    
with open("y_test.pkl", 'wb') as file:
    pk.dump((y_test), file)

### Checkpoint for machine learning

In [128]:
with open("X_train.pkl", 'rb') as fid:
    X_train = pk.load(fid)

with open("X_test.pkl", 'rb') as fid:
    X_test = pk.load(fid)

with open("y_train.pkl", 'rb') as fid:
    y_train = pk.load(fid)
    
with open("y_test.pkl", 'rb') as fid:
    y_test = pk.load(fid)

In [129]:
X_train.shape

(12441, 814)

In [130]:
y_train.shape

(12441,)

In [131]:
# cross-validator : ShuffleSplit 
sss = StratifiedShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 42) # To avoid over-fitting

### Import classifier ###
from sklearn.svm import LinearSVC
clf = LinearSVC()

# definition of the pipeline
pipeline = Pipeline(steps = [
    ("LSVC",clf)
])

# parameters to tune 
param_grid = {
    'LSVC__C' : [1],
    'LSVC__class_weight' : ['balanced'],
    'LSVC__multi_class' : ['ovr'],
    'LSVC__random_state' : [42],
    'LSVC__max_iter' : [10000],
}  

# exhaustive search over specified parameter
grid = GridSearchCV(pipeline, param_grid, verbose = 1, cv = sss)

In [132]:
# training classifier
print (" > training classifier:")
t0 = time()
grid.fit(X_train, y_train)
print ("training time: ", round(time()-t0, 3), "s")

# best classifier using the cross-validator and the Stratified Shuffle Split 
clf = grid.best_estimator_

 > training classifier:
Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   31.7s finished


training time:  36.199 s


In [133]:
%%time
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.90      0.84      0.87        68
           1       0.89      0.85      0.87        66
           2       0.63      0.91      0.75        34
           3       0.67      0.66      0.66       205
           4       0.79      0.80      0.79       157
           5       0.83      0.79      0.81       352
           6       0.76      0.78      0.77        49
           7       0.96      0.89      0.92        73
           8       0.80      0.80      0.80        41
           9       0.84      0.68      0.75        76
          10       0.70      0.82      0.76       290
          11       0.85      0.77      0.81        92
          12       0.52      0.64      0.57        42
          13       0.66      0.72      0.69        67
          14       0.69      0.88      0.77        41
          15       0.92      0.83      0.88       193
          16       0.91      0.88      0.89       628
          17       0.69    

In [134]:
# dump classifier in a pickle file
with open("classifier.pkl", 'wb') as file:
    pk.dump(clf, file)

## Test model

In [None]:
import json
import pandas as pd
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import string
import pickle as pk

from ipywidgets import IntProgress
from IPython.display import display
import time
from time import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [13]:
import pandas as pd
import pickle as pk
import string
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [14]:
# Load vectorizer
with open("vectorizer.pkl", 'rb') as fid:
    vectorizer_trained = pk.load(fid)

# Load categories
with open("categ.pkl", 'rb') as fid:
    categ = pk.load(fid)
    
# Load categories
with open("classifier.pkl", 'rb') as fid:
    trained_clf = pk.load(fid)

# Parsing text + cleaning + stemm
def parse_out_text(all_text):
    # clean punctuation, make lower case and remove stopwords
    text_string = all_text.translate(str.maketrans("", "", string.punctuation)).split(" ")
    text_string = [word.lower() for word in text_string if word.lower() not in stopwords.words('english')]
    # Stemm text
    stemmer = SnowballStemmer("english")
    stemmed = [stemmer.stem(word) for  word in text_string]
    words = " ".join(stemmed) 
    return words

In [21]:
text_to_predict = 'If you think you might be ready to fly again, we can help plan your trip'
text_to_predict = 'Google just created the most detailed image of a brain yet'
text_to_predict = 'The mysterious skeleton of a woman who died more than 4,000 years ago has been discovered in Germany.'

In [22]:
prepared_text = parse_out_text(text_to_predict)
feat_array = vectorizer_trained.transform([prepared_text]).toarray()
result_int = trained_clf.predict(feat_array)[0]
output = categ[result_int]

In [23]:
output

'SCIENCE'