<a href="https://colab.research.google.com/github/LameesKadhim/NLTK-library-python/blob/main/nltk_library_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **NLTK is a popular Python library used for natural language processing (NLP)**

### **Separating a text into a group of sentences or words**

In [1]:
pip install nltk



In [2]:
import nltk

In [3]:


# Fetch the Punkt sentence-tokenizer models. Newer NLTK releases load them from
# the 'punkt_tab' package instead of the pickled 'punkt' package, so request
# both to keep this cell working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk import sent_tokenize
from nltk.tokenize import word_tokenize

# The same English sample is split first into sentences, then into words.
mytext = 'Hello Mr. Adam, how are you? I hope everything is going well. Today is a good day, see you.'

# Separate by sentence (the abbreviation "Mr." does not end a sentence).
print('\nsent_tokenize: ', sent_tokenize(mytext))

# Separate by words (punctuation marks become tokens of their own).
print('\nword_tokenize', word_tokenize(mytext))

# Using a language other than English: name the language explicitly.
mytext = 'Hallo Herr Adam, wie geht es Ihnen? Ich hoffe alles läuft gut. Heute ist ein guter Tag, wir sehen uns.'
print('\nsent_tokenize', sent_tokenize(mytext, language='german'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.

sent_tokenize:  ['Hello Mr. Adam, how are you?', 'I hope everything is going well.', 'Today is a good day, see you.']

word_tokenize ['Hello', 'Mr.', 'Adam', ',', 'how', 'are', 'you', '?', 'I', 'hope', 'everything', 'is', 'going', 'well', '.', 'Today', 'is', 'a', 'good', 'day', ',', 'see', 'you', '.']

sent_tokenize ['Hallo Herr Adam, wie geht es Ihnen?', 'Ich hoffe alles läuft gut.', 'Heute ist ein guter Tag, wir sehen uns.']


### **Definitions, antonyms, synonyms & stemming**

In [4]:
# Download the WordNet corpus and import its corpus reader.
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [5]:
# Look up the definition (and usage examples) of a word in WordNet.
separator = '--------------------------------------------'

pain_senses = wordnet.synsets('pain')
pain_sense = pain_senses[2]  # the third synset returned for 'pain'
print('pain is', pain_sense.definition())
print('an example is ', pain_sense.examples())
print(separator)

nlp_sense = wordnet.synsets('NLP')[0]  # the first synset returned for 'NLP'
print('NLP is', nlp_sense.definition())
print(separator)

pain is a somatic sensation of acute discomfort
an example is  ['as the intensity increased the sensation changed from tickle to pain']
--------------------------------------------
NLP is the branch of information science that deals with natural language information
--------------------------------------------


In [6]:
# Collect the lemma names of every synset of the word; duplicates are kept.
synonyms = [
  lemma.name()
  for synset in wordnet.synsets('Computer')
  for lemma in synset.lemmas()
]
print('synonyms of computer is', synonyms)
print('--------------------------------------------')

synonyms of computer is ['computer', 'computing_machine', 'computing_device', 'data_processor', 'electronic_computer', 'information_processing_system', 'calculator', 'reckoner', 'figurer', 'estimator', 'computer']
--------------------------------------------


In [7]:
# Gather antonyms: for each lemma that has any, keep the first one listed.
Opposite = [
  lemma.antonyms()[0].name()
  for synset in wordnet.synsets("small")
  for lemma in synset.lemmas()
  if lemma.antonyms()
]
print('antonyms of small is', Opposite)
print('-------------------------------------------')

antonyms of small is ['large', 'big', 'big']
-------------------------------------------


In [8]:
# Stemming removes the affixes of a word and returns its stem.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stemmed_word = stemmer.stem('computation')
print('word stemming for computation is', stemmed_word)
print('-------------------------------------------')

word stemming for computation is comput
-------------------------------------------


### **stemming words in other languages**

In [9]:
from nltk.stem import SnowballStemmer

# List the languages SnowballStemmer knows about, then stem a German verb.
supported_languages = SnowballStemmer.languages
print(supported_languages)

german_stemmer = SnowballStemmer('german')
trinken_stem = german_stemmer.stem('trinken')
print('\nword stemming for trinken', trinken_stem)

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')

word stemming for trinken trink


In [10]:
# Lemmatizing is similar to stemming, but it returns a real word.
# The part of speech to target (verb, noun, adjective or adverb) is given via `pos`.

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

living_as_verb = lemmatizer.lemmatize('living', pos='v')      # treat the word as a verb
believes_as_noun = lemmatizer.lemmatize('believes', pos='n')  # treat the word as a noun

print('verb of living is', living_as_verb)
print('noun of believes is ', believes_as_noun)


verb of living is live
noun of believes is  belief


### **removing stop words from a text**

In [11]:
# Remove stop words (very common words that carry little meaning) from a text.
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

data = 'a text is the main body of a book or other piece of writing, as distinct from other material such as notes, appendices, and illustrations.'
stop_words = set(stopwords.words('english'))  # a set gives fast membership tests
words = word_tokenize(data)

# NLTK's English stop-word list is lowercase, so compare case-insensitively;
# otherwise capitalised stop words (e.g. "The" at the start of a sentence)
# would slip through. The surviving tokens keep their original casing.
# Punctuation tokens are not in the stop-word list, so they are kept.
filtered_words = [w for w in words if w.lower() not in stop_words]
print('important words are: ',filtered_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
important words are:  ['text', 'main', 'body', 'book', 'piece', 'writing', ',', 'distinct', 'material', 'notes', ',', 'appendices', ',', 'illustrations', '.']


### ***POS Tagging***

In [12]:
# Fetch the averaged-perceptron POS-tagger model. Newer NLTK releases look for
# the language-specific 'averaged_perceptron_tagger_eng' package instead of
# 'averaged_perceptron_tagger', so request both to work across NLTK versions.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

from nltk.tokenize import word_tokenize
from nltk import pos_tag

# pos_tag is given the list of tokens, so the sentence is tokenized first.
text = word_tokenize('Today is a nice, sunny day')
print(pos_tag(text))


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[('Today', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('nice', 'JJ'), (',', ','), ('sunny', 'JJ'), ('day', 'NN')]


### **classifier to distinguish between male and female names**

In [29]:
# Download the 'names' corpus (read below through male.txt and female.txt).
nltk.download('names')

# NOTE(review): training later in this cell goes through
# nltk.NaiveBayesClassifier, so the two nltk.classify imports are not
# referenced by name here.
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import names

def gender_feature(word):
  """Build the feature dict used by the name-gender classifier.

  The only feature is the final character of ``word``, stored under the key
  ``'last_letter'``. The slice ``word[-1:]`` is used instead of ``word[-1]`` so
  that an empty string yields ``''`` rather than raising ``IndexError``.

  Args:
    word: The name (a string) to extract the feature from.

  Returns:
    A dict of the form ``{'last_letter': <last character, or '' if empty>}``.
  """
  return {'last_letter' : word[-1:]}

# Load the labelled names: all male names first, followed by all female names.
male_names = [(name, 'male') for name in names.words('male.txt')]
female_names = [(name, 'female') for name in names.words('female.txt')]
Names = male_names + female_names

# Turn each (name, gender) pair into a (feature dict, gender) pair.
features_sets = []
for person_name, gender in Names:
  features_sets.append((gender_feature(person_name), gender))

# Train on the full feature set; no examples are held out.
train_set = features_sets
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Predict the gender label for a single name.
predicted_gender = classifier.classify(gender_feature('Lora'))
print('Lora is ' , predicted_gender)

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!
Lora is  female
