# TF-IDF by `Mr. Harshit Dawar!`

In [1]:
import math
import goose3
import spacy
import nltk

In [2]:
# Download and parse the Wikipedia article on NLP.
# Goose is used as a context manager so that its network session and any
# temporary resources are released as soon as the extraction finishes.
with goose3.Goose() as extractor:
    article = extractor.extract(url="https://en.wikipedia.org/wiki/Natural_language_processing")

In [3]:
# Split the cleaned article text into sentences; the tokenizer's result is
# used directly as the list of sentences.
sentences = nltk.sent_tokenize(article.cleaned_text)

In [4]:
# Inspect the sentences produced by nltk.sent_tokenize from the article text.
sentences

['Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.',
 'The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them.',
 'The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.',
 'Natural language processing has its roots in the 1950s.',
 'Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, a task that involves the automated interpretation and generation of natural language, but at the time not articulated as a problem separate from artificial intelligence.',

## Processing the Data

In [5]:
# Load spaCy's small English pipeline with the NER and parser components
# switched off; Process_Data only reads lemmas and per-token flags.
English_Model = spacy.load(
    "en_core_web_sm",
    disable=["ner", "parser"],
)

In [6]:
def Process_Data(sentence):
    """Normalize one sentence into a space-separated string of lemmas.

    The sentence is run through the module-level ``English_Model`` pipeline.
    Tokens flagged as stop words, punctuation, number-like, or whitespace are
    dropped; every remaining token contributes its lower-cased lemma.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces (empty string if none remain).
    """
    def _is_informative(tok):
        # Equivalent to requiring every one of the four flags to be False.
        return not (tok.is_stop or tok.is_punct or tok.like_num or tok.is_space)

    lemmas = [
        tok.lemma_.lower()
        for tok in English_Model(sentence)
        if _is_informative(tok)
    ]
    return " ".join(lemmas)

In [7]:
# Run every raw sentence through Process_Data, keeping the original order.
processed_sentences = [Process_Data(raw_sentence) for raw_sentence in sentences]

In [8]:
# Inspect the normalized sentences returned by Process_Data.
processed_sentences

['natural language processing nlp subfield linguistic computer science artificial intelligence concern interaction computer human language particular program computer process analyze large amount natural language datum',
 'goal computer capable understand content document include contextual nuance language',
 'technology accurately extract information insight contain document categorize organize document',
 'natural language processing root 1950s',
 'alan turing publish article title computing machinery intelligence propose call turing test criterion intelligence task involve automated interpretation generation natural language time articulate problem separate artificial intelligence',
 'premise symbolic nlp summarize john searle chinese room experiment give collection rule e.g. chinese phrasebook question match answer computer emulate natural language understanding nlp task apply rule datum confront',
 '1950 georgetown experiment involve fully automatic translation russian sentence en

## TF-IDF Implementation

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Create the TF-IDF vectorizer; no arguments are passed, so the library's
# default settings are used.
vectorizer = TfidfVectorizer()

In [12]:
# Fit the vectorizer on the preprocessed sentences and transform them in one
# step; each sentence is treated as a separate document.
vectorized_sentences = vectorizer.fit_transform(processed_sentences)

In [14]:
# (number of sentences, vocabulary size). Read the shape straight from the
# sparse matrix instead of building a full dense copy just to inspect it.
vectorized_sentences.shape

(87, 624)

In [17]:
# TF-IDF weights of the second sentence (row 1) as a flat dense vector.
vectorized_sentences[1].toarray().ravel()

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [19]:
# Shape of the fitted vectorizer's idf_ array; compare its length with the
# number of columns of the TF-IDF matrix.
vectorizer.idf_.shape

(624,)

In [20]:
# Mapping from each vocabulary term to its column index in the TF-IDF matrix
# (these integers are feature indices, not word counts).
vectorizer.vocabulary_

{'natural': 374,
 'language': 316,
 'processing': 439,
 'nlp': 380,
 'subfield': 544,
 'linguistic': 334,
 'computer': 123,
 'science': 496,
 'artificial': 68,
 'intelligence': 296,
 'concern': 126,
 'interaction': 299,
 'human': 272,
 'particular': 407,
 'program': 441,
 'process': 438,
 'analyze': 52,
 'large': 317,
 'amount': 50,
 'datum': 150,
 'goal': 247,
 'capable': 90,
 'understand': 598,
 'content': 135,
 'document': 174,
 'include': 283,
 'contextual': 138,
 'nuance': 387,
 'technology': 565,
 'accurately': 31,
 'extract': 214,
 'information': 288,
 'insight': 291,
 'contain': 134,
 'categorize': 96,
 'organize': 399,
 'root': 491,
 '1950s': 11,
 'alan': 44,
 'turing': 590,
 'publish': 452,
 'article': 66,
 'title': 577,
 'computing': 124,
 'machinery': 342,
 'propose': 447,
 'call': 88,
 'test': 568,
 'criterion': 146,
 'task': 560,
 'involve': 308,
 'automated': 73,
 'interpretation': 304,
 'generation': 239,
 'time': 576,
 'articulate': 67,
 'problem': 435,
 'separate': 50

In [21]:
# Show the vectorizer's configuration; it was constructed without arguments,
# so these are the library defaults.
vectorizer.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [26]:
# Column index holding the largest TF-IDF weight in the second sentence (row 1).
vectorized_sentences[1].toarray().argmax()

90

In [27]:
# Term with the highest TF-IDF weight in the second sentence (row 1).
# get_feature_names() was removed from scikit-learn in favour of
# get_feature_names_out(); the index is computed rather than hard-coded so the
# lookup stays correct when the downloaded article (and hence the vocabulary)
# changes.
vectorizer.get_feature_names_out()[vectorized_sentences[1].toarray().argmax()]

'capable'