<a href="https://colab.research.google.com/github/Mariammmmmmm/NLP/blob/main/lab2_alice_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task

1. Download Alice in Wonderland by Lewis Carroll from Project Gutenberg's website http://www.gutenberg.org/files/11/11-0.txt
2. Perform any necessary preprocessing on the text, including converting to lower case, removing stop words, numbers / non-alphabetic characters, lemmatization.
3. Find Top 10 most important (for example, in terms of TF-IDF metric) words from each chapter in the text (not "Alice"); how would you name each chapter according to the identified tokens?
4. Find the Top 10 most used verbs in sentences with Alice. What does Alice do most often?

# Simple vector example

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
simple_train = ["call you tonight", "Call me a cab", "Please call me... PLEASE!"]

In [None]:
# learn the 'vocabulary' of the training data
vect_bow = CountVectorizer()
vect_bow.fit(simple_train)

CountVectorizer()

In [None]:
# examine the fitted vocabulary
vect_bow.get_feature_names_out()

array(['cab', 'call', 'me', 'please', 'tonight', 'you'], dtype=object)

In [None]:
# transform training data into a 'document-term matrix'
simple_train_dtm = vect_bow.transform(simple_train)
simple_train_dtm

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [None]:
# examine the vocabulary and document-term matrix together
import pandas as pd
pd.DataFrame(simple_train_dtm.toarray(), columns=vect_bow.get_feature_names())



Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


**Summary:**

- `vectorizer.fit(train)` **learns the vocabulary** of the training data

- `vectorizer.transform(train)` uses the **fitted vocabulary** to build a document-term matrix from the training data

- `vectorizer.transform(test)` uses the **fitted vocabulary** to build a document-term matrix from the testing data (and **ignores tokens** it hasn't seen before)

# Alice example

In [None]:
import re
import string
import nltk
from nltk.corpus import wordnet 
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

## 1. Download Alice

In [None]:
filename = 'alice_orig.txt'
with open(filename, encoding='utf-8') as f:
    alice_text = f.read()
alice_text[:500]

'\ufeffThe Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll\n\nThis eBook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this eBook or online at\nwww.gutenberg.org. If you are not located in the United States, you\nwill have to check the laws of the country where you are locate'

## 2. Perform any necessary preprocessing on the text

In [None]:
# remove irrelevant text
alice_text = alice_text.split('CHAPTER I.')[2]
alice_text = alice_text.split('END OF THE PROJECT GUTENBERG')[0]
alice_text[:500]
alice_text



In [None]:
from nltk.tokenize import word_tokenize
# convert to lower case, removing stop words, numbers / non-alphabetic characters, lemmatization.
# CODE 1: 
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

stop_words = set(stopwords.words('english') + ['ha', 'wa', 'say', 'said','alice'])
lemmatizer = WordNetLemmatizer()
def clear(text):
  text = alice_text.lower()
  text = list(filter(str.isalpha, word_tokenize(text.lower())))
  text = list(lemmatizer.lemmatize(word) for word in text)
  text = list(word for word in text if word not in stop_words)
  new_text = ' '.join(text)
  return new_text

text = clear(alice_text)
text[:500]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


'beginning get tired sitting sister bank nothing twice peeped book sister reading picture conversation use book thought without picture conversation considering mind well could hot day made feel sleepy stupid whether pleasure making would worth trouble getting picking daisy suddenly white rabbit pink eye ran close nothing remarkable think much way hear rabbit oh dear oh dear shall late thought afterwards occurred ought wondered time seemed quite natural rabbit actually watch looked hurried starte'

## 3. Find Top 10 most important words

In [None]:
# Split text on chapters
# CODE 2: 

chapters = text.split("chapter")
print(len(chapters))

12


In [None]:
from nltk.grammar import FeatureValueType
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import re
from nltk.probability import FreqDist

# calculate tf-idf representation for chapters and word
# you can use TfidfVectorizer
# 
# CODE 3:

#response.toarray()

#for i in range(k):
 # print("Chapter ", i)
  #fdist = FreqDist(word.lower() for word in chapters[i].split(" "))
  #fdist_top10 = fdist.most_common(10)
  #print(fdist_top10)

#for i in range(k):
#  print("Chapter ", i)
#  fdist = FreqDist(word.lower() for word in chapters[i].split(" "))
#  fdist_top10 = fdist.most_common(10)
#  print(fdist_top10)
k = len(chapters)


tfidf = TfidfVectorizer();
response = tfidf.fit_transform(chapters)
tfidf.vocabulary_

response.sorted_indices()
kk = tfidf.vocabulary_
names = tfidf.get_feature_names_out()
data = response.todense().tolist()
df = pd.DataFrame(data, columns=names)

names_list = names.tolist()
chap = {}
ch = {}
for i in range(12):
  chap[i] = df.loc[i].tolist()
  ch[i] = dict(zip(names_list, chap[i]))
  ch[i] = dict(sorted(ch[i].items(), key=lambda kv:kv[1], reverse=True))


for i in range(12):
  j=0
  print("Глава", i+1)
  for k in ch[i].keys():    
    j = j + 1
    print(k)
    if j == 9: break
    



Глава 1
little
bat
door
key
eat
like
way
either
see
Глава 2
mouse
pool
little
swam
dear
cat
foot
mabel
go
Глава 3
mouse
dodo
lory
prize
dry
thimble
know
bird
dinah
Глава 4
bill
little
rabbit
puppy
window
bottle
chimney
fan
glove
Глава 5
caterpillar
pigeon
serpent
egg
youth
size
father
little
well
Глава 6
cat
footman
baby
pig
duchess
mad
wow
like
cook
Глава 7
hatter
dormouse
hare
march
time
tea
draw
clock
twinkle
Глава 8
queen
hedgehog
king
gardener
soldier
cat
five
executioner
procession
Глава 9
mock
turtle
gryphon
duchess
moral
queen
went
never
little
Глава 10
turtle
mock
gryphon
dance
lobster
join
beautiful
soup
whiting
Глава 11
king
hatter
court
dormouse
witness
queen
officer
slate
tart
Глава 12
king
jury
queen
sister
dream
slate
would
rabbit
juryman


In [None]:
# Show 10 most important words for each chapter
# CODE 4:

## 4 Find the Top 10 most used verbs in sentences with Alice. 

from nltk import sent_tokenize
import nltk
# nltk.download('punkt')

In [None]:
# tokenization - sentences case
from nltk.tokenize import sent_tokenize, word_tokenize
sent_tokenize("I. Down the Rabbit-Hole. Alice was beginning to get ... ")

['I.', 'Down the Rabbit-Hole.', 'Alice was beginning to get ...']

In [None]:
# Split text into sentences, find sentences with Alice
# CODE 5:

sents = sent_tokenize(alice_text.lower())
sents = [s for s in sents if 'alice' in s]

#for sent in sents:
#  k=0
#  for word in sent:
#    if (word == 'alice'):
#      k = k + 1
#  if k==0: sents.remove(sent)


print(sents)


['\ndown the rabbit-hole\n\n\nalice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to do: once or twice she had peeped into\nthe book her sister was reading, but it had no pictures or\nconversations in it, “and what is the use of a book,” thought alice\n“without pictures or conversations?”\n\nso she was considering in her own mind (as well as she could, for the\nhot day made her feel very sleepy and stupid), whether the pleasure of\nmaking a daisy-chain would be worth the trouble of getting up and\npicking the daisies, when suddenly a white rabbit with pink eyes ran\nclose by her.', 'there was nothing so _very_ remarkable in that; nor did alice think it\nso _very_ much out of the way to hear the rabbit say to itself, “oh\ndear!', 'i shall be late!” (when she thought it over afterwards,\nit occurred to her that she ought to have wondered at this, but at the\ntime it all seemed quite natural); but when the rabbit actually _took a\nwatch out o

In [None]:

stop_words = set(stopwords.words('english') + ['ha', 'wa', 'say', 'said'])
lemmatizer = WordNetLemmatizer()

def clear_text(text):
  text = " ".join(text)
  text = list(filter(str.isalpha, word_tokenize(text)))
  text = list(lemmatizer.lemmatize(word) for word in text)
  text = list(word for word in text if word not in stop_words)
  new_text = ' '.join(text)
  return new_text

clear_sents = clear_text(sents)
print(clear_sents)

alice beginning get tired sitting sister bank nothing twice peeped book sister reading picture conversation use book thought alice without picture conversation considering mind well could hot day made feel sleepy stupid whether pleasure making would worth trouble getting picking daisy suddenly white rabbit pink eye ran close nothing remarkable alice think much way hear rabbit oh dear shall late thought afterwards occurred ought wondered time seemed quite natural rabbit actually watch looked hurried alice started foot flashed across mind never seen rabbit either watch take burning curiosity ran across field fortunately time see pop large hedge another moment went alice never considering world get went straight like tunnel way dipped suddenly suddenly alice moment think stopping found falling deep well well thought alice fall shall think nothing tumbling stair let see would four thousand mile see alice learnt several thing sort lesson schoolroom though good opportunity showing knowledge 

### Finding a Part of Speech (Pos) using NLTK

The Parts Of Speech Tag List

Following is the complete list of such POS tags.

* CC Coordinating Conjunction
* CD Cardinal Digit
* DT Determiner
* EX Existential There. Example: “there is” … think of it like “there exists”)
* FW Foreign Word.
* IN Preposition/Subordinating Conjunction.
* JJ Adjective.
* JJR Adjective, Comparative.
* JJS Adjective, Superlative.
* LS List Marker 1.
* MD Modal.
* NN Noun, Singular.
* NNS Noun Plural.
* NNP Proper Noun, Singular.
* NNPS Proper Noun, Plural.
* PDT Predeterminer.
* POS Possessive Ending. Example: parent’s
* PRP Personal Pronoun. Examples: I, he, she
* PRP Possessive Pronoun. Examples: my, his, hers
* RB Adverb. Examples: very, silently,
* RBR Adverb, Comparative. Example: better
* RBS Adverb, Superlative. Example: best
* RP Particle. Example: give up
* TO to. Example: go ‘to’ the store.
* UH Interjection. Example: errrrrrrrm
* VB Verb, Base Form. Example: take
* VBD Verb, Past Tense. Example: took
* VBG Verb, Gerund/Present Participle. Example: taking
* VBN Verb, Past Participle. Example: taken
* VBP Verb, Sing Present, non-3d take
* VBZ Verb, 3rd person sing. present takes
* WDT wh-determiner. Example: which
* WP wh-pronoun. Example: who, what
* WP possessive wh-pronoun. Example: whose
* WRB wh-abverb. Example: where, when

In [None]:
nltk.download('averaged_perceptron_tagger')
verbs = nltk.pos_tag(clear_sents.split(" "))
#print(verbs[0][0])
#verbs = dict(verbs)
#print(verbs)
list_verbs = ''
v = set(['VB','VBD','VBG','VBN','VBP','VBZ'])
#print(type(v))
v = list(v)
for i in range(len(verbs)):
  if (verbs[i][1] in v):
    list_verbs = list_verbs + verbs[i][0] + ' '

#for i in verbs.keys():
#  if (verbs[i] in v): 
#    list_verbs = list_verbs + i + ' '

list_verbs = word_tokenize(list_verbs)
list_verbs = list(lemmatizer.lemmatize(word, pos="v") for word in list_verbs)
print(list_verbs)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
['begin', 'get', 'sit', 'peep', 'read', 'consider', 'hot', 'make', 'make', 'worth', 'get', 'pick', 'run', 'think', 'think', 'occur', 'wonder', 'seem', 'watch', 'look', 'start', 'flash', 'see', 'run', 'see', 'go', 'consider', 'go', 'dip', 'think', 'stop', 'find', 'fall', 'think', 'fall', 'think', 'tumble', 'see', 'thousand', 'learn', 'show', 'knowledge', 'wonder', 'get', 'longitude', 'begin', 'talk', 'begin', 'get', 'go', 'say', 'hurt', 'look', 'sight', 'hurry', 'lose', 'go', 'turn', 'get', 'see', 'find', 'hang', 'lock', 'try', 'walk', 'wonder', 'get', 'come', 'make', 'think', 'belong', 'open', 'find', 'lead', 'look', 'saw', 'get', 'go', 'little', 'know', 'see', 'happen', 'begin', 'think', 'seem', 'wait', 'go', 'hop', 'find', 'shut', 'find', 'drink', 'print', 'drink', 'go', 'venture', 'find', 'mix', 'b

In [None]:
# Find verbs and print most common (infinitive form)
# CODE 6:

from nltk.probability import FreqDist
fdist = FreqDist()

for word in list_verbs:
  fdist[word.lower()]+=1

fdist_top10 = fdist.most_common(10)
fdist_top10

[('go', 93),
 ('get', 57),
 ('think', 57),
 ('look', 48),
 ('come', 40),
 ('begin', 39),
 ('see', 33),
 ('make', 32),
 ('take', 30),
 ('know', 26)]