In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a unigram + bigram bag-of-words model on a single sentence and show
# the term -> column-index mapping it learned.
v = CountVectorizer(ngram_range=(1, 2)).fit(
    ['Thor Hathodwala is looking for a job']
)
v.vocabulary_

{'thor': 9,
 'hathodwala': 2,
 'is': 4,
 'looking': 7,
 'for': 0,
 'job': 6,
 'thor hathodwala': 10,
 'hathodwala is': 3,
 'is looking': 5,
 'looking for': 8,
 'for job': 1}

In [5]:
# Tiny toy corpus used to demonstrate preprocessing and n-gram vectorization.
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza",
]

In [7]:
import spacy

# Load spaCy's English pipeline once and keep it in `nlp` so that
# preprocess() can reuse it for every text.
nlp = spacy.load('en_core_web_sm')

def preprocess(text):
    """Return the lemmas of `text` joined by single spaces.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are dropped; every remaining token
    contributes its lemma, in original order.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return ' '.join(kept_lemmas)

# Quick check of preprocess() on a single sentence
preprocess('Loki is eating pizza')
            

'Loki eat pizza'

In [9]:
# Run every document of the toy corpus through preprocess(), keeping order.
corpus_processed = list(map(preprocess, corpus))
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [10]:
# Re-fit the unigram + bigram vectorizer, this time on the preprocessed
# corpus, and show the vocabulary it learned.
v = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [11]:
# Vectorize a sentence whose unigrams and bigrams were all seen during fitting
v.transform(['Thor eat pizza']).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [12]:
# 'hulk' is not in the fitted vocabulary, so 'hulk' and 'hulk eat' add no counts
v.transform(['Hulk eat pizza']).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [13]:
import pandas as pd

# Read the news dataset from a JSON file located in the working directory
df = pd.read_json('news_dataset.json')

# (number of rows, number of columns)
print(df.shape)

# Preview the first rows
df.head()

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [14]:
# Count rows per category to see how balanced the classes are
df.category.value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [17]:
# To tackle class imbalance, undersample every category down to the size of
# the rarest one. The target size is derived from the data instead of being
# hard-coded (it was 1381), so the cell stays correct if the dataset changes.
min_samples = int(df.category.value_counts().min())

# One sample per category; the fixed random_state makes each draw reproducible.
df_business, df_sports, df_crime, df_science = (
    df[df.category == label].sample(min_samples, random_state=1)
    for label in ('BUSINESS', 'SPORTS', 'CRIME', 'SCIENCE')
)

In [21]:
# Stack the per-category samples into one frame and confirm that every
# category now contributes the same number of rows.
balanced_parts = [df_business, df_sports, df_crime, df_science]
df_balanced = pd.concat(balanced_parts, axis=0)
df_balanced['category'].value_counts()

BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: category, dtype: int64

In [22]:
# Encode each category label as an integer class id in a new column.
category_to_num = {'BUSINESS': 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3}
df_balanced['category_num'] = df_balanced['category'].map(category_to_num)

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced data for testing. Stratifying on the label
# keeps class proportions the same in both splits, and the fixed seed makes
# the split reproducible.
labels = df_balanced['category_num']
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['text'],
    labels,
    test_size=0.2,
    random_state=1,
    stratify=labels,
)

In [24]:
# Size of the training split, followed by a peek at its first entries
print(X_train.shape)
X_train.head()

(4419,)


5471    Stanford Wins Rose Bowl With 45-16 Victory Ove...
9214    Waymo Says Uber Stole Critical Self-Driving Te...
1770    The NHL Should Not Make Definitive Statements ...
4794    Police Raid Pot Club Of Reporter Who Quit Her ...
7720               SOLVED? Massive Magnetic Star Mystery 
Name: text, dtype: object

In [25]:
# Class counts in the test split; expected to be near-equal because the split was stratified
y_test.value_counts()

1    277
3    276
0    276
2    276
Name: category_num, dtype: int64