# Feature Extraction

- Transform text into numerical features
- Feature extraction techniques:
    - Bag of Words
    - N-Grams
    - TF-IDF

## Required libraries

In [9]:
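# Uncomment to install the dependencies into the kernel's environment.
# The PyPI package is named "scikit-learn"; the "sklearn" alias is deprecated.
# %pip install pandas
# %pip install scikit-learn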

## Bag of Words

Count how many times each word occurs in each document of the corpus.

In [13]:
# Bag of Words with stop words kept (no stop-word filtering).
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

texts = ["good movie", "not a good movie", "did not like"]
vectorizer = CountVectorizer()
# Learn the vocabulary and build the document-term count matrix in one pass.
# Note: "a" never becomes a column because the default token pattern only
# keeps tokens of two or more characters.
x = vectorizer.fit_transform(texts)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement (available since 1.0).
columns = vectorizer.get_feature_names_out()
# toarray() returns a plain ndarray instead of the discouraged np.matrix
# produced by todense(); the displayed table is the same.
pd.DataFrame(x.toarray(), columns=columns, index=texts)



Unnamed: 0,did,good,like,movie,not
good movie,0,1,0,1,0
not a good movie,0,1,0,1,1
did not like,1,0,1,0,1


In [14]:
# Bag of Words with English stop words removed.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

texts = ["good movie", "not a good movie", "did not like"]
# Caveat: the built-in English stop list drops "not", so "good movie" and
# "not a good movie" end up with identical feature rows.
vectorizer = CountVectorizer(stop_words='english')
# Learn the vocabulary and build the document-term count matrix in one pass.
x = vectorizer.fit_transform(texts)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement (available since 1.0).
columns = vectorizer.get_feature_names_out()
# toarray() returns a plain ndarray instead of the discouraged np.matrix
# produced by todense(); the displayed table is the same.
pd.DataFrame(x.toarray(), columns=columns, index=texts)



Unnamed: 0,did,good,like,movie
good movie,0,1,0,1
not a good movie,0,1,0,1
did not like,1,0,1,0


## N-Grams

In [17]:
# N = 1: unigrams only (ngram_range=(1, 1)), English stop words removed.
texts = ["good movie", "not a good movie", "did not like"]
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))
# Learn the vocabulary and build the document-term count matrix in one pass.
x = vectorizer.fit_transform(texts)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement (available since 1.0).
columns = vectorizer.get_feature_names_out()
# toarray() returns a plain ndarray instead of the discouraged np.matrix
# produced by todense(); the displayed table is the same.
pd.DataFrame(x.toarray(), columns=columns, index=texts)



Unnamed: 0,did,good,like,movie
good movie,0,1,0,1
not a good movie,0,1,0,1
did not like,1,0,1,0


In [18]:
# N up to 2: ngram_range=(1, 2) extracts both unigrams and bigrams
# (use ngram_range=(2, 2) for bigrams only).
texts = ["good movie", "not a good movie", "did not like"]
# Stop words are removed before n-grams are formed, which is why
# "did not like" yields the bigram "did like".
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2))
# Learn the vocabulary and build the document-term count matrix in one pass.
x = vectorizer.fit_transform(texts)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement (available since 1.0).
columns = vectorizer.get_feature_names_out()
# toarray() returns a plain ndarray instead of the discouraged np.matrix
# produced by todense(); the displayed table is the same.
pd.DataFrame(x.toarray(), columns=columns, index=texts)



Unnamed: 0,did,did like,good,good movie,like,movie
good movie,0,0,1,1,0,1
not a good movie,0,0,1,1,0,1
did not like,1,1,0,0,1,0


## TF-IDF

- TF-IDF stands for term frequency–inverse document frequency.
- It combines two parts:
    - Term Frequency (TF): how often a term occurs in a document
    - Inverse Document Frequency (IDF): down-weights terms that occur in many documents

In [20]:
# TF-IDF weights over unigrams and bigrams, English stop words removed.
from sklearn.feature_extraction.text import TfidfVectorizer

texts = ["good movie", "not a good movie", "did not like"]
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
# Learn vocabulary + IDF weights and build the TF-IDF matrix in one pass.
# Rows are L2-normalised by default, so a document with three equally
# weighted terms shows 1/sqrt(3) ≈ 0.57735 for each of them.
x = vectorizer.fit_transform(texts)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement (available since 1.0).
columns = vectorizer.get_feature_names_out()
# toarray() returns a plain ndarray instead of the discouraged np.matrix
# produced by todense(); the displayed table is the same.
pd.DataFrame(x.toarray(), columns=columns, index=texts)



Unnamed: 0,did,did like,good,good movie,like,movie
good movie,0.0,0.0,0.57735,0.57735,0.0,0.57735
not a good movie,0.0,0.0,0.57735,0.57735,0.0,0.57735
did not like,0.57735,0.57735,0.0,0.0,0.57735,0.0
