## Install & Import


In [38]:
!pip install nltk
import nltk, string



## Data

In [39]:
# Sample English paragraph that every later cell (tokenizing, filtering,
# stemming, tagging, lemmatizing, chunking) operates on.
texte_en = """My family is a rather small one, with only three people, my father, my mother and me. My father is a doctor. My mother is a middle-school teacher. I am still a student. My mother is very gentle and soft while my father is a little bit hot-tempered. We love each other very much. Although sometimes we might become angry at one another, very soon we will forget the unpleasant things. We're a happy family, how about you?"""

In [40]:
# Echo the raw text to confirm it was stored as intended.
texte_en

"My family is a rather small one, with only three people, my father, my mother and me. My father is a doctor. My mother is a middle-school teacher. I am still a student. My mother is very gentle and soft while my father is a little bit hot-tempered. We love each other very much. Although sometimes we might become angry at one another, very soon we will forget the unpleasant things. We're a happy family, how about you?"

## Tokenization

### By word and by sentence

In [41]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [42]:
# Fetch the Punkt tokenizer models needed by the tokenizers used below.
# When the package is already installed, NLTK reports it as up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [43]:
# Split the paragraph into word-level tokens. Punctuation marks and
# contraction parts such as "'re" come out as separate tokens.
texte_en_words = word_tokenize(texte_en)

In [44]:
# Split the paragraph into sentences (one string per sentence).
texte_en_sent = sent_tokenize(texte_en)

In [45]:
# Inspect the sentence boundaries the tokenizer found.
texte_en_sent

['My family is a rather small one, with only three people, my father, my mother and me.',
 'My father is a doctor.',
 'My mother is a middle-school teacher.',
 'I am still a student.',
 'My mother is very gentle and soft while my father is a little bit hot-tempered.',
 'We love each other very much.',
 'Although sometimes we might become angry at one another, very soon we will forget the unpleasant things.',
 "We're a happy family, how about you?"]

## Filtering Stop Words

In [46]:
# Download NLTK's stop-word lists, then import the corpus reader that
# exposes them.
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
# Collect the English stop words into a set so the membership test in the
# filtering step is a hash lookup rather than a scan.
texte_en_stop_words = set(stopwords.words("english"))

In [48]:
# Keep every token whose case-folded form is not an English stop word.
# Only the comparison is case-insensitive; surviving tokens keep their
# original casing. Punctuation tokens are not stop words, so they pass through.
Filter_text = [
    token
    for token in texte_en_words
    if token.casefold() not in texte_en_stop_words
]


In [49]:
# Inspect the tokens that survived stop-word removal.
Filter_text

['family',
 'rather',
 'small',
 'one',
 ',',
 'three',
 'people',
 ',',
 'father',
 ',',
 'mother',
 '.',
 'father',
 'doctor',
 '.',
 'mother',
 'middle-school',
 'teacher',
 '.',
 'still',
 'student',
 '.',
 'mother',
 'gentle',
 'soft',
 'father',
 'little',
 'bit',
 'hot-tempered',
 '.',
 'love',
 'much',
 '.',
 'Although',
 'sometimes',
 'might',
 'become',
 'angry',
 'one',
 'another',
 ',',
 'soon',
 'forget',
 'unpleasant',
 'things',
 '.',
 "'re",
 'happy',
 'family',
 ',',
 '?']

In [50]:
# NOTE(review): `stopwords` is already imported and downloaded by the
# stop-word filtering cell earlier in this notebook; both are repeated here.
# Neither `re` nor `punctuations` is used by any cell visible in this section,
# so confirm that a later cell needs them.
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
# Extra symbols (including Arabic punctuation marks) concatenated with the
# ASCII set from `string.punctuation`; some characters therefore occur more
# than once in the resulting string.
punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation  # all punctuation characters

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Stemming English Text

In [51]:
from nltk.stem import PorterStemmer

In [52]:
# Create one Porter stemmer instance to reuse for every token.
Stm = PorterStemmer()

In [53]:
# Reduce each stop-word-filtered token to its Porter stem. Stems are not
# necessarily dictionary words ("family" -> "famili"), and the stemmer
# lower-cases its input ("Although" -> "although").
texte_en_words_stm = list(map(Stm.stem, Filter_text))


In [54]:
# Inspect the stemmed tokens.
texte_en_words_stm

['famili',
 'rather',
 'small',
 'one',
 ',',
 'three',
 'peopl',
 ',',
 'father',
 ',',
 'mother',
 '.',
 'father',
 'doctor',
 '.',
 'mother',
 'middle-school',
 'teacher',
 '.',
 'still',
 'student',
 '.',
 'mother',
 'gentl',
 'soft',
 'father',
 'littl',
 'bit',
 'hot-temp',
 '.',
 'love',
 'much',
 '.',
 'although',
 'sometim',
 'might',
 'becom',
 'angri',
 'one',
 'anoth',
 ',',
 'soon',
 'forget',
 'unpleas',
 'thing',
 '.',
 "'re",
 'happi',
 'famili',
 ',',
 '?']

### Tagging Parts of Speech

In [55]:
# Download the tagger model that `nltk.pos_tag` relies on.
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [56]:
# Tag the full token list (stop words and punctuation included, not the
# filtered one) with part-of-speech labels; the chunker below consumes this.
texte_en_pos_tag = nltk.pos_tag(texte_en_words)

In [57]:
# Download the tag-set documentation that `nltk.help` prints from.
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to /home/jovyan/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [58]:
# Print the description and examples for every Penn Treebank tag.
# The output is long; passing a tag name or pattern (e.g. 'NN.*') limits it.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

### Lemmatizing

In [59]:
from nltk.stem import WordNetLemmatizer

In [60]:
# Create one WordNet lemmatizer instance to reuse for every token.
lemmatizer = WordNetLemmatizer()

In [61]:
# Show the object's repr to confirm the lemmatizer was constructed.
lemmatizer

<WordNetLemmatizer>

In [62]:
# Download the WordNet corpus; it must be available before the first
# `lemmatizer.lemmatize` call.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [63]:
# Lemmatize every token of the unfiltered text. No part of speech is passed,
# so each token gets the lemmatizer's default treatment (noun): plural nouns
# are reduced ("things" -> "thing") while verb forms such as "is" and "am"
# are left unchanged.
lemmatized_words = list(map(lemmatizer.lemmatize, texte_en_words))

In [64]:
# Inspect the lemmatized tokens.
lemmatized_words

['My',
 'family',
 'is',
 'a',
 'rather',
 'small',
 'one',
 ',',
 'with',
 'only',
 'three',
 'people',
 ',',
 'my',
 'father',
 ',',
 'my',
 'mother',
 'and',
 'me',
 '.',
 'My',
 'father',
 'is',
 'a',
 'doctor',
 '.',
 'My',
 'mother',
 'is',
 'a',
 'middle-school',
 'teacher',
 '.',
 'I',
 'am',
 'still',
 'a',
 'student',
 '.',
 'My',
 'mother',
 'is',
 'very',
 'gentle',
 'and',
 'soft',
 'while',
 'my',
 'father',
 'is',
 'a',
 'little',
 'bit',
 'hot-tempered',
 '.',
 'We',
 'love',
 'each',
 'other',
 'very',
 'much',
 '.',
 'Although',
 'sometimes',
 'we',
 'might',
 'become',
 'angry',
 'at',
 'one',
 'another',
 ',',
 'very',
 'soon',
 'we',
 'will',
 'forget',
 'the',
 'unpleasant',
 'thing',
 '.',
 'We',
 "'re",
 'a',
 'happy',
 'family',
 ',',
 'how',
 'about',
 'you',
 '?']

### Chunking

In [65]:
# Chunk grammar with a single rule named NP: an optional DT tag, then any
# number of JJ tags, then one NN tag.
Grammar = "NP: {<DT>?<JJ>*<NN>}"

In [66]:
# Echo the grammar string.
Grammar


'NP: {<DT>?<JJ>*<NN>}'

`NP` stands for noun phrase. A chunk matching this rule must:

    Start with an optional (?) determiner (<DT>)
    Contain any number (*) of adjectives (<JJ>)
    End with a singular common noun (<NN>)

In [67]:
# Build a regular-expression chunk parser from the NP grammar.
chunk_parser = nltk.RegexpParser(Grammar)

In [68]:
!pip install ghostscript



In [83]:
# Run the NP chunker over the POS-tagged tokens; the result groups every
# matched noun phrase under an NP node.
t = chunk_parser.parse(texte_en_pos_tag)

In [84]:
# Show the chunk tree as bracketed text in the cell output.
# `t.draw()` opens a Tk window, which needs a graphical display and raises
# TclError ("no display name and no $DISPLAY environment variable") on a
# headless notebook server. `print` is used deliberately instead of a bare
# `t`, because the rich notebook repr of a tree may also depend on graphical
# tooling, depending on the installed NLTK version.
print(t)