In [256]:
import pandas as pd
import re
import numpy as np

In [257]:
filename = '/Users/mac/Desktop/MLT/alice.txt'
with open(filename, encoding='utf-8') as f:
    alice_in_wonderland = f.read()
alice_in_wonderland[:100]

'\ufeffThe Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll\n\nThis eBook is fo'

In [258]:
textfile = alice_in_wonderland.split('CHAPTER I.')[2]
textfile = textfile.split('THE END')[0]
textfile[:100]

'\nDown the Rabbit-Hole\n\n\nAlice was beginning to get very tired of sitting by her sister on the\nbank, '

In [259]:
from nltk.tokenize import TreebankWordTokenizer, WhitespaceTokenizer    
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download("wordnet")
nltk.download('stopwords')
stop_words = stopwords.words("english")

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
     

[nltk_data] Downloading package wordnet to /Users/mac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [260]:
def preprocess(input_text):
    lower_input = input_text.lower()
    lower_input = re.sub(r"won\'t", "will not", lower_input)
    lower_input = re.sub(r"can\'t", "can not", lower_input)
    lower_input = re.sub(r"\'re", " are", lower_input)
    lower_input = re.sub(r"\'s", " is", lower_input)
    lower_input = re.sub(r"\'d", " would", lower_input)
    lower_input = re.sub(r"\'ll", " will", lower_input)
    lower_input = re.sub(r"\'t|n\'t", " not", lower_input)
    lower_input = re.sub(r"\'ve", " have", lower_input)
    lower_input = re.sub(r"\'m", " am", lower_input)
    lower_input = re.sub(r"[^\w\s]|_", "", lower_input)
    words = lower_input.split()
    stop_words = set(stopwords.words('english'))
    words_after_sw = [word for word in words if not word in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemm_tokens = [lemmatizer.lemmatize(word) for word in words_after_sw]
    #"v" means verb
    lemm_tokens = [lemmatizer.lemmatize(word, "v") for word in words_after_sw]    
    result = " ".join(lemm_tokens) 
    return result

In [261]:
from sklearn.feature_extraction.text import TfidfVectorizer
chapters = textfile.split('CHAPTER ')
chapter_num = 1
chapter_list = []
for chapter in chapters:
    chapter = preprocess(chapter)
    chapter = re.sub(r'alice', '', chapter) 
    chapter_list.append(chapter)
chapter_list = np.array(chapter_list)

In [262]:
chapter_list[1]

'ii pool tear curiouser curiouser cry  much surprise moment quite forget speak good english im open like largest telescope ever goodbye feet look feet seem almost sight get far oh poor little feet wonder put shoe stock dears im sure shant able shall great deal far trouble must manage best way canbut must kind think  perhaps wont walk way want go let see ill give new pair boot every christmas go plan would manage must go carrier think funny itll seem send present ones feet odd directions look s right foot esq hearthrug near fender s love oh dear nonsense im talk head strike roof hall fact nine feet high take little golden key hurry garden door poor  much could lie one side look garden one eye get hopeless ever sit begin cry ought ashamed say  great girl like might well say go cry way stop moment tell go shed gallons tear large pool round four inch deep reach half hall time hear little patter feet distance hastily dry eye see come white rabbit return splendidly dress pair white kid glove

In [263]:
#there are 12 chapters
chapter_list.shape

(12,)

In [264]:
num = list(range(1,13))
index = list(range(0,12))

In [265]:
texts = {'index': num, 'text': chapter_list}
text_pd = pd.DataFrame(data=texts, index=index)
text_pd

Unnamed: 0,index,text
0,1,rabbithole begin get tire sit sister bank not...
1,2,ii pool tear curiouser curiouser cry much sur...
2,3,iii caucusrace long tale indeed queerlooking p...
3,4,iv rabbit send little bill white rabbit trot s...
4,5,v advice caterpillar caterpillar look time si...
5,6,vi pig pepper minute two stand look house wond...
6,7,vii mad teaparty table set tree front house ma...
7,8,viii queen croquetground large rosetree stand ...
8,9,ix mock turtle story cant think glad see dear ...
9,10,x lobster quadrille mock turtle sigh deeply dr...


In [266]:
vectorizer_tfidf = TfidfVectorizer(max_features=3000,stop_words='english')

In [267]:
vectorizer_tfidf.fit(text_pd["text"])

TfidfVectorizer(max_features=3000, stop_words='english')

In [268]:
def tfidf(text):
    response = vectorizer_tfidf.transform(text)
    feature_array = np.array(vectorizer_tfidf.get_feature_names_out())
    
    return response , feature_array

In [269]:
text_tfidf , feature_array = tfidf(text_pd["text"])
#text_tfidf = vectorizer_tfidf.transform(text_pd["text"])
     
tfidf_df = pd.DataFrame(text_tfidf.toarray(), columns=feature_array)  

tfidf_df.shape

(12, 1956)

In [270]:
most_frequent_word = tfidf_df.T
most_frequent_word

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
abide,0.000000,0.000000,0.000000,0.000000,0.000000,0.022798,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
able,0.000000,0.032543,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
absence,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.020212,0.000000,0.000000,0.000000
absurd,0.000000,0.000000,0.026744,0.000000,0.000000,0.019580,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
acceptance,0.000000,0.000000,0.031141,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
youre,0.014070,0.000000,0.025310,0.011116,0.059245,0.046325,0.007645,0.000000,0.016428,0.007391,0.028252,0.010906
youth,0.000000,0.000000,0.000000,0.000000,0.145785,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
youve,0.000000,0.000000,0.000000,0.000000,0.015002,0.000000,0.000000,0.014299,0.000000,0.022458,0.014308,0.016570
zealand,0.034621,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [271]:
chapter_num = 1
index = list(range(1,11))
for i in most_frequent_word:
    most10 = most_frequent_word[:][i].nlargest(10)
    print("chapter%d"%(chapter_num))
    chapter_num +=1
    print(most10)
    print()

chapter1
think     0.229056
eat       0.213768
say       0.192889
little    0.180834
bat       0.178399
key       0.157590
fall      0.149638
like      0.132611
way       0.132611
door      0.125361
Name: 0, dtype: float64

chapter2
mouse     0.321500
say       0.215307
pool      0.197508
little    0.192643
im        0.171925
cat       0.160750
think     0.158648
swim      0.155119
dear      0.143051
fan       0.139742
Name: 1, dtype: float64

chapter3
say        0.433750
mouse      0.384560
dodo       0.320932
prize      0.186847
lory       0.160466
know       0.140969
dry        0.127230
thimble    0.124564
bird       0.115368
cause      0.106977
Name: 2, dtype: float64

chapter4
little    0.219068
window    0.218824
rabbit    0.198086
say       0.180969
grow      0.169353
fan       0.164438
puppy     0.164118
come      0.142870
bottle    0.140946
gloves    0.140946
Name: 3, dtype: float64

chapter5
say            0.482259
caterpillar    0.460823
pigeon         0.291569
serpent      

In [272]:
from nltk import sent_tokenize
sentences = sent_tokenize(textfile)
sentences_with_alice = []
verbs = ''
for sentence in sentences:
    preprocess_text = preprocess(sentence)
    
    if 'alice' in preprocess_text:
        preprocess_text = re.sub(r'king', '', preprocess_text) 
        sentences_with_alice.append(preprocess_text)
    else:
        continue
for sentence in sentences_with_alice:
    s = nltk.pos_tag(sentence.split())
    for w in s:
        if 'VB' in w[1]:
            verbs += w[0] + ' '
        else:
            continue
response , feature_array = tfidf([verbs])

tfidf_ranking = np.argsort(response.toarray()).flatten()[::-1]
print("10 verbs most related to alice : ")
for i in feature_array[tfidf_ranking][:10]:
        print("%s "%(i))

10 verbs most related to alice : 
say 
think 
know 
come 
make 
begin 
run 
look 
tell 
happen 
