# Bag Of Words by `Mr. Harshit Dawar!`

In [1]:
import goose3
import nltk
import spacy

In [2]:
# Download the Wikipedia article on NLP and let goose3 extract its content.
source_url = "https://en.wikipedia.org/wiki/Natural_language_processing"
extractor = goose3.Goose()
article = extractor.extract(source_url)

In [3]:
# Inspect the plain text that goose3 extracted from the page.
article.cleaned_text

'Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.\n\nNatural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, a task that involves the automated interpretation and generation of natural language, but at the time not articulated as a problem separate from artificial intelligence.\n\nThe premise 

In [4]:
# Split the article text into sentences. nltk.sent_tokenize already returns a
# list of sentence strings, so no wrapping comprehension is needed.
sentences = nltk.sent_tokenize(article.cleaned_text)

In [5]:
# Show the list of sentences produced by the tokenizer.
sentences

['Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.',
 'The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them.',
 'The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.',
 'Natural language processing has its roots in the 1950s.',
 'Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, a task that involves the automated interpretation and generation of natural language, but at the time not articulated as a problem separate from artificial intelligence.',

In [6]:
# Look at the first sentence on its own.
sentences[0]

'Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.'

## Preprocessing the Data

In [47]:
# Load spaCy's small English pipeline with the "ner" and "parser" components
# disabled.
English_Model = spacy.load(
    "en_core_web_sm",
    disable=["ner", "parser"],
)

In [105]:
def Process_Data(sentence):
    """Reduce one sentence to a space-separated string of lowercase lemmas.

    The sentence is run through the module-level spaCy pipeline
    ``English_Model``. Tokens flagged as stop words, punctuation, number-like,
    or whitespace are dropped; every remaining token contributes its lemma,
    lowercased.

    Args:
        sentence: Raw sentence text.

    Returns:
        The kept lemmas joined by single spaces (an empty string if no token
        is kept).
    """
    def is_noise(token):
        # True for any token that should not reach the bag-of-words model.
        return token.is_stop or token.is_punct or token.like_num or token.is_space

    doc = English_Model(sentence)
    return " ".join(token.lemma_.lower() for token in doc if not is_noise(token))

In [106]:
# Clean every sentence with Process_Data, keeping the original order.
processed_sentences = [Process_Data(sentence) for sentence in sentences]

In [107]:
# Show the cleaned sentences produced by Process_Data.
processed_sentences

['natural language processing nlp subfield linguistic computer science artificial intelligence concern interaction computer human language particular program computer process analyze large amount natural language datum',
 'goal computer capable understand content document include contextual nuance language',
 'technology accurately extract information insight contain document categorize organize document',
 'natural language processing root 1950s',
 'alan turing publish article title computing machinery intelligence propose call turing test criterion intelligence task involve automated interpretation generation natural language time articulate problem separate artificial intelligence',
 'premise symbolic nlp summarize john searle chinese room experiment give collection rule e.g. chinese phrasebook question match answer computer emulate natural language understanding nlp task apply rule datum confront',
 '1950 georgetown experiment involve fully automatic translation russian sentence en

In [58]:
# Number of processed sentences (one entry per input sentence).
len(processed_sentences)

87

## Bag Of Words/Count Vectorization Creation!

In [60]:
from sklearn.feature_extraction.text import CountVectorizer

In [108]:
# Bag-of-words vectorizer; no arguments are passed, so scikit-learn's defaults apply.
vectorizer = CountVectorizer()

In [109]:
# Learn the vocabulary from the processed sentences and build the
# sentence-by-term count matrix in a single step.
vectorized_sentences = vectorizer.fit_transform(processed_sentences)

In [110]:
# Display the matrix summary (shape, dtype, number of stored elements, storage format).
vectorized_sentences

<87x624 sparse matrix of type '<class 'numpy.int64'>'
	with 1231 stored elements in Compressed Sparse Row format>

In [111]:
# (number of sentences, vocabulary size). The sparse matrix exposes .shape
# directly, so there is no need to build a dense copy just to read it.
vectorized_sentences.shape

(87, 624)

In [112]:
# Count vector of the first sentence. Slice the row while it is still sparse
# and densify only that row instead of the whole matrix.
vectorized_sentences[0].toarray().ravel()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,

In [113]:
# The first processed sentence, for comparison with its count vector.
processed_sentences[0]

'natural language processing nlp subfield linguistic computer science artificial intelligence concern interaction computer human language particular program computer process analyze large amount natural language datum'

In [114]:
# Number of unique features (vocabulary terms) learned by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
len(vectorizer.get_feature_names_out())

624

In [115]:
import numpy as np

In [116]:
# Distinct count values in the first sentence's vector and how often each
# occurs. Only the first row is densified, not the whole matrix.
np.unique(vectorized_sentences[0].toarray().ravel(), return_counts=True)

(array([0, 1, 2, 3]), array([604,  17,   1,   2]))

In [117]:
# Print the vocabulary indices whose count in the first sentence is 3, then
# those whose count is 2. The row is densified once up front instead of
# converting the whole sparse matrix to a dense array on every loop iteration.
first_row_counts = vectorized_sentences[0].toarray().ravel()

for target_count in (3, 2):
    for feature_index in np.flatnonzero(first_row_counts == target_count):
        print(feature_index)

123
316
374


In [118]:
# Look up the terms behind the indices printed by the previous cell.
# NOTE: 123, 316 and 374 are copied from that output; they depend on the
# fitted vocabulary and will shift if the article text changes.
# get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out() and fetch the name array only once.
feature_names = vectorizer.get_feature_names_out()
tuple(str(feature_names[index]) for index in (123, 316, 374))

('computer', 'language', 'natural')