<a href="https://colab.research.google.com/github/Nekokan1500/Machine-Learning/blob/main/NLP/Example_Text_Mining_Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Tokenizing input text

# Three sample documents: a sentence with an embedded newline, a sentence
# containing a URL, and a tweet-like string with a number and a hashtag.
lines = [
    'How to tokenize?\nLike a boss.',
    'Google is accessible via http://www.google.com',
    '1000 new followers! #TwitterFamous',
]

# Tokenizing with string split
# split() without arguments cuts on any run of whitespace (the newline included)
# and leaves punctuation attached to the neighbouring word.
for tokens in map(str.split, lines):
  print(tokens)

['How', 'to', 'tokenize?', 'Like', 'a', 'boss.']
['Google', 'is', 'accessible', 'via', 'http://www.google.com']
['1000', 'new', 'followers!', '#TwitterFamous']


In [2]:
# Tokenizing using regular expressions
import re

# r"\w+" matches each maximal run of word characters (letters, digits and
# underscores); everything else acts as a separator and is dropped.
_token_pattern = r"\w+"
token_pattern = re.compile(_token_pattern)

for tokens in map(token_pattern.findall, lines):
  print(tokens)

['How', 'to', 'tokenize', 'Like', 'a', 'boss']
['Google', 'is', 'accessible', 'via', 'http', 'www', 'google', 'com']
['1000', 'new', 'followers', 'TwitterFamous']


In [3]:
# Using placeholders before tokenizing
def tokenizer(line):
  """Lowercase `line`, replace URLs, hashtags and numbers with placeholders, then tokenize.

  The substitutions run in a fixed order: URLs first, so that digits or a
  '#fragment' inside a URL are consumed with it instead of being rewritten
  as '_num_' / '_hashtag_' by the later steps.

  Args:
    line: the text to tokenize (str).

  Returns:
    list of str: every match of the module-level `token_pattern` in the
    rewritten text. The placeholders are '_url_', '_hashtag_' and '_num_'.
  """
  line = line.lower()
  # Match the URL up to the next whitespace or HTML delimiter so that query
  # strings, ports and fragments ('?q=1&p=2', ':8080', '#top') are covered too.
  line = re.sub(r'https?://[^\s<>"]+', '_url_', line)
  line = re.sub(r'#\w+', '_hashtag_', line)
  line = re.sub(r'\d+', '_num_', line)
  return token_pattern.findall(line)

# Show the placeholder-aware tokens for every sample line
for tokens in map(tokenizer, lines):
  print(tokens)

['how', 'to', 'tokenize', 'like', 'a', 'boss']
['google', 'is', 'accessible', 'via', '_url_']
['_num_', 'new', 'followers', '_hashtag_']


In [16]:
# Vectorization
# Bag of words
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# A custom tokenizer takes the place of CountVectorizer's own token_pattern.
# Passing token_pattern=None says so explicitly and avoids scikit-learn's
# "token_pattern will not be used" UserWarning.
vec = CountVectorizer(lowercase=True, tokenizer=tokenizer, token_pattern=None)
x = vec.fit_transform(lines)
# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
# Rows are documents, columns are vocabulary terms, cells are term counts.
pd.DataFrame(x.toarray(), columns=vec.get_feature_names_out())

Unnamed: 0,_hashtag_,_num_,_url_,a,accessible,boss,followers,google,how,is,like,new,to,tokenize,via
0,0,0,0,1,0,1,0,0,1,0,1,0,1,1,0
1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,1
2,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0


In [17]:
# Limitation of bag of words method
flight_delayed_lines = [
    'Flight was delayed, I am not happy',
    'Flight was not delayed, I am happy',
]

# Build the vectorizer in this cell rather than refitting whatever `vec`
# currently points to: other cells rebind `vec`, so reusing it would make this
# cell's output depend on the order in which cells were executed.
vec = CountVectorizer(lowercase=True, tokenizer=tokenizer, token_pattern=None)
x1 = vec.fit_transform(flight_delayed_lines)
# Both sentences contain exactly the same words, so their count rows are
# identical even though the meanings are opposite.
pd.DataFrame(x1.toarray(), columns=vec.get_feature_names_out())


Unnamed: 0,am,delayed,flight,happy,i,not,was
0,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1


In [15]:
# N-grams
# ngram_range=(2, 2) sets both the smallest and the largest n to 2: bigrams only
vec = CountVectorizer(ngram_range=(2, 2))
x2 = vec.fit_transform(flight_delayed_lines)
bigram_names = vec.get_feature_names_out()
pd.DataFrame(data=x2.todense(), columns=bigram_names)

Unnamed: 0,am happy,am not,delayed am,flight was,not delayed,not happy,was delayed,was not
0,0,1,1,1,0,1,1,0
1,1,0,1,1,1,0,0,1


In [18]:
# Vectorization based on characters

# analyzer='char' makes the features character n-grams rather than words;
# ngram_range=(4, 4) keeps only sequences of exactly four characters.
vec = CountVectorizer(ngram_range=(4, 4), analyzer='char')
x3 = vec.fit_transform(flight_delayed_lines)
char_ngram_names = vec.get_feature_names_out()
pd.DataFrame(data=x3.todense(), columns=char_ngram_names)

Unnamed: 0,am,del,hap,i a,not,was,", i",am h,am n,appy,as d,as n,ayed,"d, i",dela,"ed,",elay,flig,ght,happ,ht w,i am,ight,laye,ligh,m ha,m no,not.1,ot d,ot h,s de,s no,t de,t ha,t wa,was.1,"yed,"
0,1,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,0,1,1,1,1
1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,1,1,0,1,1,1


In [19]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Three sentences that differ only in their last word
lines_fruits = [
    'I like apples',
    'I like oranges',
    'I like pears',
]

# r'\w+' accepts tokens of any length, so the one-letter word "I" gets a column
vec = TfidfVectorizer(token_pattern=r'\w+')
x = vec.fit_transform(lines_fruits)
tfidf_terms = vec.get_feature_names_out()
# Words shared by every sentence ("i", "like") end up with a lower weight than
# the fruit name that is unique to each sentence.
pd.DataFrame(data=x.todense(), columns=tfidf_terms)

Unnamed: 0,apples,i,like,oranges,pears
0,0.767495,0.453295,0.453295,0.0,0.0
1,0.0,0.453295,0.453295,0.767495,0.0
2,0.0,0.453295,0.453295,0.0,0.767495


In [1]:
import spacy

nlp = spacy.load('en_core_web_lg')
# Words to look up: the distinct words of the lines_fruits sentences
terms = ['I','like','apples','oranges','pears']