In [33]:
# load all packages
from docx import *
import pandas as pd
import numpy as np
import os
import nltk
# nltk.download('popular')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ne_chunk
from textblob import TextBlob
from textblob import Word


In [34]:
# extract cells
full_data = pd.read_excel('test_code.xlsx')
df_length = len(full_data)
data = pd.DataFrame()

for i in range(0, df_length):
    data = data.append({'par_#': i + 1, 'text': (full_data.iloc[i, 19])}, ignore_index = True)
data.head(7)  

Unnamed: 0,par_#,text
0,1.0,This seemed to me too profound a joke to be in...
1,2.0,"""My nature is subdued"
2,3.0,"To what it works in, like the dyer's hand:"
3,4.0,"Pity me, then, and wish I were renewed!"""
4,5.0,But as it is wholesome that the parsimonious p...
5,6.0,There is only one other point on which I offer...
6,7.0,In Bleak House I have purposely dwelt upon the...


In [35]:
# pre-processing
# turn into lower case
data['text'] = data['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
data.head(7)

Unnamed: 0,par_#,text
0,1.0,this seemed to me too profound a joke to be in...
1,2.0,"""my nature is subdued"
2,3.0,"to what it works in, like the dyer's hand:"
3,4.0,"pity me, then, and wish i were renewed!"""
4,5.0,but as it is wholesome that the parsimonious p...
5,6.0,there is only one other point on which i offer...
6,7.0,in bleak house i have purposely dwelt upon the...


In [4]:
# pre-processing
# remove punctuation
# ^ specifies NOT to match that group or set
# \s is whitespace characters
# \w is alphanumeric character
data['text'] = data['text'].str.replace('[^\w\s]', '')
data.head(7)

Unnamed: 0,interview_#,text
0,1.0,in chancery
1,2.0,london michaelmas term lately over and the lor...
2,3.0,fog everywhere fog up the river where it flows...
3,4.0,gas looming through the fog in divers places i...
4,5.0,the raw afternoon is rawest and the dense fog ...
5,6.0,never can there come fog too thick never can t...
6,7.0,on such an afternoon if ever the lord high cha...


In [36]:
# pre-processing
# remove stop words (commonly used words: the, a, an, in)
stop = stopwords.words('english')
data['text'] = data['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
data.head(7)

Unnamed: 0,par_#,text
0,1.0,seemed profound joke inserted body book restor...
1,2.0,"""my nature subdued"
2,3.0,"works in, like dyer's hand:"
3,4.0,"pity me, then, wish renewed!"""
4,5.0,"wholesome parsimonious public know doing, stil..."
5,6.0,one point offer word remark. possibility calle...
6,7.0,bleak house purposely dwelt upon romantic side...


In [37]:
data['text'] = data['text'].apply(lambda x: " ".join(x for x in x.split() if x not in ['ms', 'mc']))
data.head(7)

Unnamed: 0,par_#,text
0,1.0,seemed profound joke inserted body book restor...
1,2.0,"""my nature subdued"
2,3.0,"works in, like dyer's hand:"
3,4.0,"pity me, then, wish renewed!"""
4,5.0,"wholesome parsimonious public know doing, stil..."
5,6.0,one point offer word remark. possibility calle...
6,7.0,bleak house purposely dwelt upon romantic side...


In [57]:
# pre-processing
# rare words removal - remove all words that only show up one time

freq = pd.Series(''.join(data['text']).split()).value_counts()[-22386:]
freq

crouches         1
donnys           1
proceeding.      1
moaned           1
connivance,      1
                ..
impartially      1
compelling       1
pleases          1
neck-kerchief    1
away."my         1
Length: 22386, dtype: int64

In [58]:
data['text'] = data['text'].apply(lambda x: " ".join(y for y in x.split() if y not in freq))
data

Unnamed: 0,par_#,text
0,1.0,seemed profound joke body book restored conver...
1,2.0,"""my nature subdued"
2,3.0,"works in, like hand:"
3,4.0,"pity me, then, wish renewed!"""
4,5.0,"wholesome parsimonious public know doing, stil..."
...,...,...
7172,7173.0,"""such were,"
7173,7174.0,"""my dear dame durden,"" said allan, drawing arm..."
7174,7175.0,"""you know do; see it."""
7175,7176.0,"""and know prettier ever were?"""


In [59]:
# pre-processing
# lemmatization, convert words into root words
## MT - This seems to do the same thing as the kernel 2 above this one
data['text'] = data['text'].apply(lambda x: " ".join([Word(y).lemmatize() for y in x.split()]))
data

Unnamed: 0,par_#,text
0,1.0,seemed profound joke body book restored conver...
1,2.0,"""my nature subdued"
2,3.0,"work in, like hand:"
3,4.0,"pity me, then, wish renewed!"""
4,5.0,"wholesome parsimonious public know doing, stil..."
...,...,...
7172,7173.0,"""such were,"
7173,7174.0,"""my dear dame durden,"" said allan, drawing arm..."
7174,7175.0,"""you know do; see it."""
7175,7176.0,"""and know prettier ever were?"""


In [60]:
## MT - With words: Tokenize, add POS tags (list), turn to trees with `ne_chunk`, create array of chunks with allChunks
allChunks = []
for i in data['text']:
    token = word_tokenize(i)
    tags = nltk.pos_tag(token)
    chunk = ne_chunk(tags)
    allChunks.append(chunk)
allChunks

[Tree('S', [('seemed', 'VBN'), ('profound', 'JJ'), ('joke', 'NN'), ('body', 'NN'), ('book', 'NN'), ('restored', 'VBD'), ('conversation', 'NN'), ('kenge', 'NN'), ('mr.', 'NN'), ('vholes', 'NNS'), (',', ','), ('one', 'CD'), ('think', 'NN'), ('must', 'MD'), ('might', 'MD'), ('coupled', 'VB'), ('apt', 'JJ'), ('one', 'CD'), ('sonnets', 'NNS'), (':', ':')]),
 Tree('S', [('``', '``'), ('my', 'PRP$'), ('nature', 'NN'), ('subdued', 'VBD')]),
 Tree('S', [('work', 'NN'), ('in', 'IN'), (',', ','), ('like', 'IN'), ('hand', 'NN'), (':', ':')]),
 Tree('S', [('pity', 'NN'), ('me', 'PRP'), (',', ','), ('then', 'RB'), (',', ','), ('wish', 'JJ'), ('renewed', 'VBN'), ('!', '.'), ("''", "''")]),
 Tree('S', [('wholesome', 'RB'), ('parsimonious', 'JJ'), ('public', 'NN'), ('know', 'VBP'), ('doing', 'VBG'), (',', ','), ('still', 'RB'), ('doing', 'VBG'), (',', ','), ('connexion', 'NN'), (',', ','), ('mention', 'NN'), ('everything', 'NN'), ('set', 'VBN'), ('forth', 'JJ'), ('page', 'NN'), ('concerning', 'VBG'), (

In [61]:
## MT - takes ~ 3 minutes to run this kernel
# create tf-idf matrix
# how important a word is to a document
textVal = data.text.values.astype('str')
vectorizer = TfidfVectorizer()
vectorizer.fit(textVal)
X = vectorizer.transform (textVal).toarray()
names = vectorizer.get_feature_names()
tfidf_dataframe = pd.DataFrame(X, columns = names)


In [62]:
## MT - Export to .xlsx file

while True:
    print('filename: ')
    name = input() + ".xlsx"
    if os.path.isfile(name):
        print("file already exists")
        continue
    else:
        writer = pd.ExcelWriter(name, engine='xlsxwriter')
        print("creating...")
        break

# To print TF-IDF
tfidf_dataframe.to_excel(writer)

writer.save()
print("complete")

filename: 
tf-idf_test-code
creating...


AttributeError: 'DataFrame' object has no attribute 'data'