# Natural Language Processing

In [1]:
# Sample paragraph reused by the tokenization and stop-word cells below.
# NOTE(review): the stray period after "providing" looks like a typo; it is why
# sent_tokenize later reports three sentences instead of two.
var = "Natural language processing (NLP) is a subfield of computer science and especially artificial intelligence. It is primarily concerned with providing. computers with the ability to process data encoded in natural language and is thus closely related to information retrieval, knowledge representation and computational linguistics, a subfield of linguistics."

In [2]:
var

'Natural language processing (NLP) is a subfield of computer science and especially artificial intelligence. It is primarily concerned with providing. computers with the ability to process data encoded in natural language and is thus closely related to information retrieval, knowledge representation and computational linguistics, a subfield of linguistics.'

## Tokenization
### Word Tokenization & Sentence Tokenization

In [3]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [4]:
# Split the paragraph into a list of sentence strings.
sent = sent_tokenize(var)

In [5]:
sent

['Natural language processing (NLP) is a subfield of computer science and especially artificial intelligence.',
 'It is primarily concerned with providing.',
 'computers with the ability to process data encoded in natural language and is thus closely related to information retrieval, knowledge representation and computational linguistics, a subfield of linguistics.']

In [6]:
# Show each detected sentence on its own line.
for sentence in sent:
    print(sentence)

Natural language processing (NLP) is a subfield of computer science and especially artificial intelligence.
It is primarily concerned with providing.
computers with the ability to process data encoded in natural language and is thus closely related to information retrieval, knowledge representation and computational linguistics, a subfield of linguistics.


In [7]:
# Tokenize the same paragraph into a flat sequence of word tokens
# (consumed by the stop-word filter further down).
word = word_tokenize(var)

## Stop Word Removal

In [8]:
from nltk.corpus import stopwords
from string import punctuation

In [9]:
# NLTK's English stop-word list; needs the "stopwords" corpus to be available
# locally (e.g. after nltk.download("stopwords")).
stop = stopwords.words("english")

In [10]:
# Tokens to discard: every punctuation character followed by the stop words.
stop_word_list = [*punctuation, *stop]

In [11]:
# Keep only the tokens that are neither punctuation nor stop words.
# NLTK's stop-word list is lowercase, so each token is lowercased for the
# membership test (otherwise capitalised words such as "It" slip through);
# the token itself is kept in its original case.
stop_word_set = set(stop_word_list)  # O(1) lookups instead of scanning the list
new_var = [token for token in word if token.lower() not in stop_word_set]


## Stemming and Lemmatization
### Stemming

In [12]:
from nltk.stem import LancasterStemmer, RegexpStemmer, PorterStemmer, SnowballStemmer

In [13]:
# One instance of each stemmer so their output can be compared on the same word.
l = LancasterStemmer()
# Anchor the pattern with `$` so only a trailing "ing" is stripped; an
# unanchored 'ing' would also delete it from the middle of words
# (e.g. "kingdom" -> "kdom").
r = RegexpStemmer('ing$')
p = PorterStemmer()
s = SnowballStemmer('english')

In [14]:
l.stem('changing')

'chang'

In [15]:
r.stem('changing')

'chang'

In [16]:
p.stem('changing')

'chang'

In [17]:
s.stem('changing')

'chang'

### Lemmatization

In [18]:
from nltk.stem import WordNetLemmatizer

In [19]:
wl = WordNetLemmatizer()

In [20]:
wl.lemmatize('mice')

'mouse'

## N-grams

In [64]:
x = 'i am Haroon Rasheed i am computer science student i am good boy i am best'

In [65]:
from nltk.tokenize import word_tokenize

In [66]:
w = word_tokenize(x)

In [67]:
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder, ngrams

In [68]:
# Frequency-counting finders for adjacent word pairs and word triples.
b = BigramCollocationFinder.from_words(w)
t = TrigramCollocationFinder.from_words(w)
# ngrams() hands back a one-shot iterator; materialise it so the 4-grams can be
# iterated more than once (e.g. when the printing cell is re-run).
n = list(ngrams(w, 4))

In [69]:
b.ngram_fd

FreqDist({('i', 'am'): 4, ('am', 'Haroon'): 1, ('Haroon', 'Rasheed'): 1, ('Rasheed', 'i'): 1, ('am', 'computer'): 1, ('computer', 'science'): 1, ('science', 'student'): 1, ('student', 'i'): 1, ('am', 'good'): 1, ('good', 'boy'): 1, ...})

In [70]:
t.ngram_fd

FreqDist({('i', 'am', 'Haroon'): 1, ('am', 'Haroon', 'Rasheed'): 1, ('Haroon', 'Rasheed', 'i'): 1, ('Rasheed', 'i', 'am'): 1, ('i', 'am', 'computer'): 1, ('am', 'computer', 'science'): 1, ('computer', 'science', 'student'): 1, ('science', 'student', 'i'): 1, ('student', 'i', 'am'): 1, ('i', 'am', 'good'): 1, ...})

In [71]:
# Print every 4-gram tuple on its own line.
for gram in n:
    print(gram)

('i', 'am', 'Haroon', 'Rasheed')
('am', 'Haroon', 'Rasheed', 'i')
('Haroon', 'Rasheed', 'i', 'am')
('Rasheed', 'i', 'am', 'computer')
('i', 'am', 'computer', 'science')
('am', 'computer', 'science', 'student')
('computer', 'science', 'student', 'i')
('science', 'student', 'i', 'am')
('student', 'i', 'am', 'good')
('i', 'am', 'good', 'boy')
('am', 'good', 'boy', 'i')
('good', 'boy', 'i', 'am')
('boy', 'i', 'am', 'best')


## Count Vectorizer

In [72]:
# Toy corpus: one short document per list entry.
# NOTE: this rebinds `l`, which held the LancasterStemmer instance earlier in the notebook.
l = ['i am Haroon Rasheed', 'i am computer science student', 'i am good boy', 'i am best']

In [73]:
import pandas as pd

In [74]:
df = pd.DataFrame({'name': l})

In [75]:
df

Unnamed: 0,name
0,i am Haroon Rasheed
1,i am computer science student
2,i am good boy
3,i am best


In [76]:
from sklearn.feature_extraction.text import CountVectorizer

In [77]:
cv = CountVectorizer()

In [81]:
# Learn the vocabulary and build the document-term count matrix, converted to a
# dense array for display. Despite the name, `new_df` is a NumPy array, not a DataFrame.
# With the default settings the text is lowercased and one-character tokens are
# dropped, which is why 'i' is missing from the vocabulary shown below.
new_df = cv.fit_transform(df['name']).toarray()

In [82]:
new_df

array([[1, 0, 0, 0, 0, 1, 1, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 1, 1],
       [1, 0, 1, 0, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0]])

In [84]:
cv.vocabulary_

{'am': 0,
 'haroon': 5,
 'rasheed': 6,
 'computer': 3,
 'science': 7,
 'student': 8,
 'good': 4,
 'boy': 2,
 'best': 1}

## Word Sense Disambiguation

In [100]:
x = "Sunrise (or sunup) is the moment when the upper rim of the Sun appears on the horizon in the morning,[1] at the start of the Sun path. The term can also refer to the entire process of the solar disk crossing the horizon."

In [101]:
y = "A computer mouse (plural mice; also mouses) is a hand-held pointing device that detects two-dimensional motion relative to a surface. This motion is typically translated into the motion of the pointer (called a cursor) on a display, which allows a smooth control of the graphical user interface of a computer."

In [102]:
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize

In [103]:
# Apply the Lesk algorithm to pick a WordNet sense of 'mouse' using the
# tokenized text `y` as context.
# NOTE: `l` is rebound again here (it previously held the sample-document list).
# NOTE(review): presumably lesk() can return None when no sense is found —
# verify before chaining calls on the result.
l = lesk(word_tokenize(y), 'mouse')

In [99]:
l.definition()

'a hand-operated electronic device that controls the coordinates of a cursor on your computer screen as you move it around on a pad; on the bottom of the device is a ball that rolls on the surface of the pad'