In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
v = CountVectorizer(ngram_range=(1, 1))  # build a bag-of-words model: count single tokens (unigrams) only
                                         # ngram_range=(1, 1) is also the default value of this parameter

In [6]:
v.fit(["Thor Hathodawala is looking for a job"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [7]:
v.vocabulary_

{'thor': 5, 'hathodawala': 1, 'is': 2, 'looking': 4, 'for': 0, 'job': 3}

In [8]:
# Bigrams only: every vocabulary entry is a pair of adjacent tokens.
# The default token_pattern keeps tokens of 2+ word characters, so the
# one-letter word "a" is dropped and "for job" becomes a bigram.
v = CountVectorizer(ngram_range=(2, 2)).fit(
    ["Thor Hathodawala is looking for a job"]
)
v.vocabulary_

{'thor hathodawala': 4,
 'hathodawala is': 1,
 'is looking': 2,
 'looking for': 3,
 'for job': 0}

In [9]:
# Unigrams and bigrams together: ngram_range=(1, 2).
v = CountVectorizer(ngram_range=(1, 2)).fit(
    ["Thor Hathodawala is looking for a job"]
)
v.vocabulary_

{'thor': 9,
 'hathodawala': 2,
 'is': 4,
 'looking': 7,
 'for': 0,
 'job': 6,
 'thor hathodawala': 10,
 'hathodawala is': 3,
 'is looking': 5,
 'looking for': 8,
 'for job': 1}

In [10]:
# Unigrams, bigrams and trigrams together: ngram_range=(1, 3).
v = CountVectorizer(ngram_range=(1, 3)).fit(
    ["Thor Hathodawala is looking for a job"]
)
v.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [154]:
# Tiny toy corpus used to demonstrate preprocessing and n-gram vectorization.
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza",
]

In [64]:
import spacy

In [65]:
# Load the English language model "en_core_web_sm" and create the `nlp` object from it.

nlp = spacy.load("en_core_web_sm")

In [106]:
# nlp() takes a single string, not a list of strings, so join the corpus
# sentences into one comma-separated text before building the Doc.
doc = nlp(", ".join(corpus))

In [123]:
def preprocess(text):
    """Return the tokens of ``text`` that are not stop words.

    Args:
        text: Raw string to run through the module-level ``nlp`` pipeline.

    Returns:
        A list with the tokens (in document order) whose ``is_stop`` flag
        is false. Previously the list was built but never returned, so the
        function always produced ``None``.
    """
    doc = nlp(text)

    # Keep every token that is not flagged as a stop word.
    return [token for token in doc if not token.is_stop]

In [124]:
preprocess("Loki is eating Pizza")

In [125]:
doc

Thor ate pizza, Loki is tall, Loki is eating pizza

In [126]:
[token for token in doc if not token.is_stop]

[Thor, ate, pizza, ,, Loki, tall, ,, Loki, eating, pizza]

In [129]:
doc[5].is_stop

True

In [136]:
[token for token in doc if not token.is_punct]

[Thor, ate, pizza, Loki, is, tall, Loki, is, eating, pizza]

In [137]:
[token for token in doc if not token.is_stop]

[Thor, ate, pizza, ,, Loki, tall, ,, Loki, eating, pizza]

In [142]:
[token.lemma_ for token in doc if not token.is_punct and not token.is_stop]

['Thor', 'eat', 'pizza', 'Loki', 'tall', 'Loki', 'eat', 'pizza']

### preprocessing function

In [150]:
import spacy

#Load english language model and create nlp object from it

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Strip stop words and punctuation from ``text`` and lemmatize the rest.

    Args:
        text: Raw string to run through the module-level ``nlp`` pipeline.

    Returns:
        The lemmas of the remaining tokens, joined by single spaces.
    """
    # Running the pipeline on the text yields its tokens one by one.
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(lemmas)

In [151]:
preprocess("Thor ate pizza")

'Thor eat pizza'

In [152]:
preprocess("Thor is eating pizza")

'Thor eat pizza'

In [158]:
# Apply the preprocessing function to every document of the corpus.
corpus_processed = list(map(preprocess, corpus))
corpus_processed

['Thor eat pizza', 'Loki tall', 'Loki eat pizza']

### Generating the vocabulary by fitting

In [157]:
v = CountVectorizer(ngram_range=(1,2))

In [159]:
v.fit(corpus_processed) # when we fit, it will generate vocabulary. 

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [160]:
v.vocabulary_  # maps each n-gram to its column index in the count matrix

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

### Converting new texts to vectors for ML

In [161]:
v.transform(["Thor eat pizza"])

<1x9 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [163]:
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [165]:
v.transform(["Hulk eat pizza"]).toarray()  # "hulk" is not in the fitted vocabulary (out-of-vocabulary, OOV), so it is ignored: the "thor" and "thor eat" columns are 0

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

### news category 

### Common NLP problem: auto-tagging categories

### Assigning categories to new text/sentiments

In [166]:
import pandas as pd

In [193]:
# NOTE: this is an absolute, machine-specific path; point it at your own
# copy of news_dataset.json before running the notebook elsewhere.
df = pd.read_json(
    "C:/Users/Owner/nlp-tutorials/11_bag_of_n_grams/news_dataset.json"
)

In [194]:
df.shape

(12695, 2)

In [195]:
df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [196]:
df.category.value_counts() # we see the balance of dataset

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [197]:
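# Handling an imbalanced dataset: common options are oversampling and undersampling (the simplest one).
# With undersampling, every class keeps only as many rows as the smallest class has (1381); the rest are ignored.
# In real life, throwing away training data is a sin; we undersample here only for teaching purposes.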

In [198]:
# Undersample every class down to the size of the smallest class. The size is
# derived from the data instead of hard-coding 1381, so the cell keeps working
# if the dataset changes.
min_samples = df.category.value_counts().min()

# random_state provides consistency in case you re-run the notebook multiple times.
df_business = df[df.category == "BUSINESS"].sample(min_samples, random_state=2022)
df_sports = df[df.category == "SPORTS"].sample(min_samples, random_state=2022)
df_crime = df[df.category == "CRIME"].sample(min_samples, random_state=2022)
df_science = df[df.category == "SCIENCE"].sample(min_samples, random_state=2022)

In [199]:
# Stack the four equally sized per-class samples into one balanced frame,
# then confirm that every category has the same number of rows.
df_balanced = pd.concat(
    [df_business, df_sports, df_crime, df_science],
    axis="index",
)
df_balanced["category"].value_counts()

SPORTS      1381
BUSINESS    1381
CRIME       1381
SCIENCE     1381
Name: category, dtype: int64

In [200]:
# Numeric code for each news category (the classifier is trained on these codes).
my_dict = {"BUSINESS": 1, "SPORTS": 2, "CRIME": 3, "SCIENCE": 4}

# Encode only while the column still holds the string labels. Without this
# guard, re-running the cell would push the already-encoded integers through
# my_dict again and turn every category into NaN.
if df_balanced["category"].isin(list(my_dict)).all():
    df_balanced["category"] = df_balanced["category"].map(my_dict)
df_balanced.head()

Unnamed: 0,text,category
11967,GCC Business Leaders Remain Confident in the F...,1
2912,From the Other Side; an Honest Review from Emp...,1
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",1
502,How to Market Your Business While Traveling th...,1
5279,How to Leverage Intuition in Decision-making I...,1


In [202]:
from sklearn.model_selection import train_test_split

In [204]:
# Hold out 20% of the balanced data for testing. Stratifying on the category
# keeps the class proportions the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced.text,
    df_balanced.category,
    test_size=0.2,
    random_state=2022,
    stratify=df_balanced["category"],
)

In [205]:
X_train.shape

(4419,)

In [206]:
X_train.head()

7589     Ovulating Women Prefer Images of Penetration O...
10442    Scientists Discover Spooky Influence On Baby N...
8792     Olympic Race Walker Steps Up To Propose To His...
1733     Beloved Bipedal Bear Named Pedals Believed Kil...
2526     Elizabeth Smart Gave Birth To Baby Girl, Fathe...
Name: text, dtype: object

In [208]:
y_train.value_counts()  # the stratify parameter did its job: the classes stay balanced in the training split, so the model is not biased toward any class

3    1105
1    1105
4    1105
2    1104
Name: category, dtype: int64

### building BOW Model

In [209]:
from sklearn.feature_extraction.text import CountVectorizer

In [210]:
from sklearn.naive_bayes import MultinomialNB # naive bayes is recommended for text based problems

In [212]:
from sklearn.pipeline import Pipeline

In [213]:
from sklearn.metrics import classification_report

In [216]:
# Baseline model: unigram bag-of-words counts (CountVectorizer defaults)
# fed into a multinomial Naive Bayes classifier.
clf = Pipeline(
    steps=[
        ("vectorizer_bow", CountVectorizer()),
        ("Multi NB", MultinomialNB()),
    ]
)
clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_preds = clf.predict(X_test)
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           1       0.75      0.87      0.81       276
           2       0.93      0.80      0.86       277
           3       0.83      0.90      0.86       276
           4       0.90      0.80      0.85       276

    accuracy                           0.84      1105
   macro avg       0.85      0.84      0.84      1105
weighted avg       0.85      0.84      0.84      1105



In [217]:
# Same pipeline, but with ngram_range=(1, 2): unigrams AND bigrams as features.
# On this dataset it scores slightly lower (accuracy 0.82) than the plain
# unigram bag-of-words model (0.84), so the simpler model is preferable.
clf = Pipeline(
    steps=[
        ("vectorizer_bow", CountVectorizer(ngram_range=(1, 2))),
        ("Multi NB", MultinomialNB()),
    ]
)
clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_preds = clf.predict(X_test)
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           1       0.69      0.90      0.78       276
           2       0.95      0.74      0.83       277
           3       0.82      0.88      0.85       276
           4       0.92      0.78      0.84       276

    accuracy                           0.82      1105
   macro avg       0.85      0.82      0.83      1105
weighted avg       0.85      0.82      0.83      1105



In [218]:
X_test[:5]

3716     African Nation Slaps Exxon With Fine Nearly 7 ...
608      These Cringe-Worthy Stories Show It Can Be Har...
11172    LISTEN: The Accidental Discovery That Proved T...
1346     Build Loyalty -- The Cost -- $00.00 Remember y...
1356     Man Killed By Michigan Police Wasn't Targeting...
Name: text, dtype: object

In [219]:
y_test[:5]

3716     1
608      4
11172    4
1346     1
1356     3
Name: category, dtype: int64

In [222]:
y_preds[:5]

array([1, 1, 4, 1, 3], dtype=int64)

Category codes, for reading the predictions: {"BUSINESS":1, "SPORTS":2, "CRIME":3, "SCIENCE":4}

In [225]:
clf.predict(["Jordan scored high"])

array([2], dtype=int64)