# Natural Language Processing

In [46]:
import numpy as np
import pandas as pd
import os
import re
import nltk

## 1. Basic Text Processing

### 1.1. Load Folder

In [80]:
# Root directory that holds one sub-folder per topic.
path = '../data/Books/hobbies/'

# os.listdir() returns entries in arbitrary order and also lists plain files
# (e.g. hidden OS files), so keep only directories and sort them so every
# run walks the topics in the same order.
folder = sorted(
    entry for entry in os.listdir(path)
    if os.path.isdir(os.path.join(path, entry))
)

print(folder)

for folder_name in folder:
    print(path + folder_name)

['fishing', 'hiking', 'machinelearning', 'mathematics']
./topics/fishing
./topics/hiking
./topics/machinelearning
./topics/mathematics


### 1.2. Load Folder Text

In [107]:
# Collect one record per text file and build the DataFrame once at the end;
# growing a DataFrame row by row is quadratic and DataFrame.append() no longer
# exists in pandas 2.x.
records = []

# loop each sub-folder
for folder_name in folder:

    # full path of this sub-folder (built from `path`, the root listed above;
    # `folder_path` is not defined at this point in the notebook)
    folder_dir = os.path.join(path, folder_name)

    # loop each text
    for file in os.listdir(folder_dir):

        # read txt file; `with` closes the handle even if read() raises
        with open(os.path.join(folder_dir, file), 'r', encoding='ISO-8859-1') as f:
            tmp_read = f.read()

        # the sub-folder name is used as the document label
        records.append({'body': tmp_read, 'label': folder_name})

# explicit columns keep the frame well-formed even when no file was found
data = pd.DataFrame(records, columns=['body', 'label'])

print(data.head())

                                                body    label
0  DNR - Weekly Fishing Report\nDNR Home Contact ...  fishing
1  DNR: Fishing Guide & Regulations\nÃ Ã Header...  fishing
2  NH Hunting and Fishing Licenses | New Hampshir...  fishing
3  46 Bait & Tackle - CLOSED - Outdoor Gear - 22 ...  fishing
4  5 rescued from capsized sport fishing boat at ...  fishing


In [108]:
# Number of documents loaded per topic label.
data['label'].value_counts()

machinelearning    70
mathematics        70
fishing            64
hiking             61
Name: label, dtype: int64

### 1.3. Lookup Character

In [110]:
# Sample text sprinkled with special characters.
my_string = 'a cat%% jumped# #over #the !!dog&*^'

# Collect every literal '#' in the sample, then show the matches and their count.
hash_pattern = re.compile('#')
result = hash_pattern.findall(my_string)
print(result)
print(len(result))

['#', '#', '#']
3


### 1.5. Word Tokenization

In [112]:
from nltk.tokenize import word_tokenize

# `my_new_string` is otherwise first assigned in the later
# "Remove Special Character" cell, so this cell would raise NameError on a
# fresh kernel. Derive it here with the same substitution (every run of
# non-alphanumeric characters becomes one space).
my_new_string = re.sub('[^a-zA-Z0-9]+', ' ', my_string)

# Split the cleaned string into word tokens.
string_token = word_tokenize(my_new_string)
print(string_token)

['a', 'cat', 'jumped', 'over', 'the', 'dog']


### 1.6. Remove Special Character

In [115]:
# From string: collapse every run of non-alphanumeric characters into one space
non_alnum_run = re.compile('[^a-zA-Z0-9]+')
my_new_string = non_alnum_run.sub(' ', my_string)
print('Cleaned string: ', my_new_string)

# From list: keep only tokens that are purely alphabetic or purely numeric
string_token_clean = list(
    filter(lambda token: token.isalpha() or token.isdigit(), string_token)
)
print('Cleaned list: ', string_token_clean)

Cleaned string:  a cat jumped over the dog 
Cleaned list:  ['a', 'cat', 'jumped', 'over', 'the', 'dog']


### 1.7. Clean Stopwords

In [98]:
from nltk.corpus import stopwords

# English stopword list as a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Tokenize the cleaned string and drop every token found in the stopword set.
clean_string = []
for token in word_tokenize(my_new_string):
    if token in stop_words:
        continue
    clean_string.append(token)

print('Remove stopwords: ', clean_string)

Remove stopwords:  ['cat', 'jumped', 'dog']


### 1.8. Porter Stemmer

The Porter stemming algorithm (or ‘Porter stemmer’) is a process for removing the commoner morphological and inflexional endings from words in English. Its main use is as part of a term normalisation process that is usually done when setting up Information Retrieval systems.

In [116]:
from nltk.stem import PorterStemmer

sentence = 'running downing laughing the hill ran run'
my_stemmer = PorterStemmer()

# Stem each whitespace-separated word of the sample sentence.
list(map(my_stemmer.stem, sentence.split()))

['run', 'down', 'laugh', 'the', 'hill', 'ran', 'run']

### 1.9. Term Frequency

In [117]:
from nltk.probability import FreqDist

sentence = 'abc abc B b C c c d F f g z d f g z a e d c'

# Lower-case before counting so that e.g. 'B' and 'b' fall into the same bucket.
lowered_terms = sentence.lower().split()
term_counts = FreqDist(lowered_terms)

dict_sentence = dict(term_counts)
print(dict_sentence)

{'abc': 2, 'b': 2, 'c': 4, 'd': 3, 'f': 3, 'g': 2, 'z': 2, 'a': 1, 'e': 1}


In [118]:
# Distinct lower-cased terms of the sample sentence; the counts are computed
# alongside but only the terms are displayed.
unique_terms, _term_counts = np.unique(sentence.lower().split(), return_counts=True)
unique_terms

array(['a', 'abc', 'b', 'c', 'd', 'e', 'f', 'g', 'z'], dtype='<U3')

In [119]:
from sklearn.feature_extraction.text import CountVectorizer

doc1 = 'the brown fox jumped over the fence'
doc2 = 'the brown fox sat on the grass GRASS'

# Build the document-term count matrix for the two sample documents.
vectorizer = CountVectorizer()
vector = vectorizer.fit_transform([doc1, doc2])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on old installations that do not provide it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per document, one column per vocabulary term.
my_pd = pd.DataFrame(vector.toarray(), columns=feature_names)

my_pd.head()

Unnamed: 0,brown,fence,fox,grass,jumped,on,over,sat,the
0,1,1,1,0,1,0,1,0,2
1,1,0,1,2,0,1,0,1,2


### 1.10. Term Frequency Inverse Document Frequency (TF-IDF)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# TF-IDF weights for the same two sample documents used in the count example.
vector = vectorizer.fit_transform([doc1, doc2])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on old installations that do not provide it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per document, one column per vocabulary term.
my_pd = pd.DataFrame(vector.toarray(), columns=feature_names)

my_pd.head()

Unnamed: 0,brown,fence,fox,grass,jumped,on,over,sat,the
0,0.289569,0.40698,0.289569,0.0,0.40698,0.0,0.40698,0.0,0.579139
1,0.236677,0.0,0.236677,0.665283,0.0,0.332642,0.0,0.332642,0.473355


### 1.11. Part-of-Speech Tagging (grammatical)

In [19]:
sentence = 'patrickran was running from the bear in the woods'

# Tag the whole sentence at once so each word is tagged with its context.
sentence_tag = nltk.pos_tag(sentence.split())

print('taged sentence: ', sentence_tag)

# Tags to keep when filtering the sentence.
valid_pos = ['NN', 'JJ', 'VBG', 'NNS']

# Reuse the tags computed above. Re-tagging every word on its own with
# nltk.pos_tag([word]) discards the sentence context, repeats the work and
# can disagree with the tags printed above.
sentence_clean_pos = [word for word, tag in sentence_tag if tag in valid_pos]

print('selected pos: ', sentence_clean_pos)

taged sentence:  [('patrickran', 'NN'), ('was', 'VBD'), ('running', 'VBG'), ('from', 'IN'), ('the', 'DT'), ('bear', 'NN'), ('in', 'IN'), ('the', 'DT'), ('woods', 'NNS')]
selected pos:  ['patrickran', 'running', 'bear', 'woods']


### 1.12. Topic Modeling

In [120]:
# English stopwords plus a few corpus-specific filler words.
stop_words = stopwords.words('english')
stop_words.extend(['vs', 'year', 'info', 'new'])
# Set copy for O(1) membership tests; `stop_words` itself stays a list.
stop_word_lookup = set(stop_words)

# NOTE(review): the topics printed further down look like mathematics
# documents rather than machine learning ones - confirm the intended folder.
folder_path = '../data/Books/hobbies/machinelearning/'
directory = os.listdir(folder_path)

# One token list per document; the DataFrame is built once after the loop
# because DataFrame.append() no longer exists in pandas 2.x.
documents = []

for file in directory:
    # `with` closes the handle even if reading fails
    with open(os.path.join(folder_path, file), 'r', encoding='ISO-8859-1') as f:
        temp = f.read()

    # keep letters only, lower-case, then drop stopwords
    temp = re.sub('[^a-zA-Z]+', ' ', temp)
    temp = [word for word in temp.lower().split() if word not in stop_word_lookup]

    documents.append(temp)

# Column 0 holds the token list of each document (read later as df[0]).
df = pd.DataFrame({0: documents})

In [121]:
import gensim
import gensim.corpora as corpora

# Map each token to an integer id, then encode every document as a bag of words.
id2word = corpora.Dictionary(df[0])
corpus = [id2word.doc2bow(text) for text in df[0]]

n_topics = 5
# LDA training is stochastic: without a fixed random_state the topics (and
# their numbering) change on every run, so the output is not reproducible.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=n_topics,
    id2word=id2word,
    passes=45,
    random_state=42,
)

# Top 10 weighted words for each topic.
topics = ldamodel.print_topics(num_words=10)
topics

[(0,
  '0.025*"maa" + 0.022*"mathematics" + 0.012*"math" + 0.008*"brook" + 0.008*"stony" + 0.006*"mathematical" + 0.005*"university" + 0.005*"students" + 0.004*"gallery" + 0.004*"program"'),
 (1,
  '0.102*"university" + 0.010*"technology" + 0.010*"mathematicians" + 0.008*"employment" + 0.006*"data" + 0.006*"state" + 0.005*"science" + 0.005*"institute" + 0.005*"degree" + 0.004*"mathematics"'),
 (2,
  '0.017*"mathematics" + 0.010*"overview" + 0.010*"f" + 0.007*"matematika" + 0.006*"cambridge" + 0.005*"ap" + 0.005*"students" + 0.005*"published" + 0.004*"matem" + 0.004*"history"'),
 (3,
  '0.030*"list" + 0.020*"mathematics" + 0.017*"ago" + 0.013*"theory" + 0.011*"topics" + 0.009*"mathematical" + 0.009*"mins" + 0.008*"named" + 0.007*"things" + 0.006*"article"'),
 (4,
  '0.019*"mathematics" + 0.016*"math" + 0.012*"research" + 0.011*"faculty" + 0.009*"graduate" + 0.009*"department" + 0.009*"students" + 0.008*"news" + 0.007*"mathematical" + 0.007*"university"')]

### 1.13. NLTK Books

In [122]:
import nltk.book

# File ids of the texts available in the Gutenberg corpus.
gutenberg_corpus = nltk.corpus.gutenberg
books = gutenberg_corpus.fileids()

print(books)

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [123]:
book = 'austen-emma.txt'

# Three views of the same book: raw string, word tokens, tokenized sentences.
corpus_reader = nltk.corpus.gutenberg
text, words, sentenses = (
    corpus_reader.raw(book),
    corpus_reader.words(book),
    corpus_reader.sents(book),
)

# Show only the first 200 characters / words / sentences of each view.
preview = slice(200)

print('Raw Text')
print(text[preview])
print('Raw Words: ', words[preview])
print('Raw Sentense: ', sentenses[preview])

Raw Text
[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; an
Raw Words:  ['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]
Raw Sentense:  [['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'], ['VOLUME', 'I'], ...]


## 2. Text Mining

In [127]:
import numpy as np
import pandas as pd
import os
import re
import nltk
import nltk.book
from nltk.probability import FreqDist

# One summary record per book; the DataFrame is built once after the loop
# because DataFrame.append() no longer exists in pandas 2.x and growing a
# frame row by row is quadratic.
summary_rows = []

books = nltk.corpus.gutenberg.fileids()
for book in books:
    sentenses = nltk.corpus.gutenberg.sents(book)
    words = nltk.corpus.gutenberg.words(book)
    # keep purely alphabetic or purely numeric tokens (drops punctuation)
    clean_words = [word for word in words if word.isdigit() or word.isalpha()]

    # number of tokens in every sentence of the book
    num_word_sent = np.array([len(sent) for sent in sentenses])

    # term frequencies, ranked once from most to least frequent and sliced
    # at both ends instead of sorting the vocabulary twice
    tf = dict(FreqDist(clean_words))
    ranked_words = sorted(tf, key=tf.get, reverse=True)
    top_five = ','.join(ranked_words[:5])
    bot_five = ','.join(ranked_words[-5:])

    summary_rows.append({'Books': book,
                         'AveWord/Sent': round(np.mean(num_word_sent), 2),
                         'MedWord/Sent': round(np.median(num_word_sent), 2),
                         'Longest_Length': np.max(num_word_sent),
                         'Shortest_Length': np.min(num_word_sent),
                         'TopWords': top_five,
                         'BottomWords': bot_five})

summary_book = pd.DataFrame(summary_rows)

summary_book.head()

Unnamed: 0,AveWord/Sent,Books,BottomWords,Longest_Length,MedWord/Sent,Shortest_Length,TopWords
0,24.83,austen-emma.txt,"stare,deficiencies,predictions,band,FINIS",274.0,17.0,1.0,"to,the,and,of,I"
1,26.2,austen-persuasion.txt,"defiance,accessions,sunshine,national,Finis",217.0,19.0,1.0,"the,to,and,of,a"
2,28.33,austen-sense.txt,"ranked,disagreement,producing,THE,END",303.0,22.0,1.0,"to,the,of,and,her"
3,33.57,bible-kjv.txt,"sardonyx,chrysolyte,chrysoprasus,transparent,p...",644.0,28.0,1.0,"the,and,of,to,And"
4,19.08,blake-poems.txt,"Virgin,started,seat,Fled,unhinderd",93.0,17.0,1.0,"the,And,and,of,I"


In [137]:
# `raw_text` is not assigned anywhere in the visible notebook, so this cell
# fails on a fresh kernel.
# NOTE(review): the counts displayed for this cell end with 'FINIS', like the
# Emma row of the summary table, so the Emma text is presumably what was
# loaded - confirm.
raw_text = nltk.corpus.gutenberg.raw('austen-emma.txt')

# Frequency of every whitespace-separated token that is purely alphabetic or
# purely numeric.
a = dict(FreqDist([word for word in raw_text.split() if word.isdigit() or word.isalpha()]))

pd.Series(a)

by              543
Jane            199
Austen            1
VOLUME            3
I              2602
               ... 
detailed          1
stare             1
predictions       1
band              1
FINIS             1
Length: 6224, dtype: int64

## 3. NLP Machine Learning

In [146]:
import re
import os
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support

In [151]:
def clean_up(x, stemmer=None, stopword_set=None):
    """Normalize raw text into a space-separated string of stemmed tokens.

    Steps: replace every run of non-letters with one space, lower-case,
    split on whitespace, drop stopwords, stem what is left and join the
    stems with single spaces.

    Parameters
    ----------
    x : str
        Raw document text.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to the notebook-level ``my_stemmer``.
    stopword_set : iterable of str, optional
        Words to drop (compared after lower-casing). Defaults to the
        notebook-level ``stop_words``.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives filtering.
    """
    # Fall back to the notebook globals so existing clean_up(x) calls behave
    # as before; passing the arguments removes the hidden-state dependency.
    if stemmer is None:
        stemmer = my_stemmer
    if stopword_set is None:
        stopword_set = stop_words

    # A set gives O(1) membership tests (the global may be a plain list).
    stopword_lookup = set(stopword_set)

    tokens = re.sub('[^a-zA-Z]+', ' ', x).lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stopword_lookup)

def load_txt(path, max_files=100):
    """Load and clean the text files stored in one sub-folder per label.

    Parameters
    ----------
    path : str
        Root directory; each sub-directory name is used as the label of the
        files it contains.
    max_files : int, optional
        Maximum number of files read from each sub-directory (default 100,
        the limit that was previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        Columns ``body`` (text passed through ``clean_up``) and ``label``
        (sub-directory name), one row per file.
    """
    records = []

    # Sorted listings make the row order, and which files fall inside
    # `max_files`, reproducible; os.listdir() order is arbitrary.
    for folder_name in sorted(os.listdir(path)):
        folder_dir = os.path.join(path, folder_name)

        # Skip stray files in the root; listing them would raise.
        if not os.path.isdir(folder_dir):
            continue

        for txt in sorted(os.listdir(folder_dir))[:max_files]:
            # `with` closes the handle even if reading fails
            with open(os.path.join(folder_dir, txt), "r", encoding='ISO-8859-1') as f:
                tmp_read = f.read()

            records.append({'body': clean_up(tmp_read), 'label': folder_name})

    # Build the frame once; explicit columns keep an empty result well-formed.
    return pd.DataFrame(records, columns=['body', 'label'])

In [143]:
# NOTE(review): this root ('topics') differs from the 'hobbies' root used in
# the earlier sections - confirm which directory is intended.
topics_root = '../data/Books/topics/'

# Cleaned text and label for every file under the root.
data = load_txt(topics_root)
data.head()

In [148]:
# tf-idf feature matrix creation: one row per document, one column per term
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(data.body).toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on old installations that do not provide it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    col_names = vectorizer.get_feature_names_out()
else:
    col_names = vectorizer.get_feature_names()

data_tfidf = pd.DataFrame(tfidf, columns=col_names)
data_tfidf

Unnamed: 0,aa,aac,aaron,ab,abandon,abayen,abbey,abc,abdomen,abe,...,zimmer,zimmett,zip,zipcar,ziplin,zmlxpcsgfr,zone,zoom,zucman,zumba
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
262,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [150]:
# (number of documents, number of vocabulary terms) of the TF-IDF matrix
data_tfidf.shape

(264, 7475)

In [162]:
# Project the TF-IDF matrix onto its first 200 principal components.
# random_state is fixed because, with svd_solver='auto', scikit-learn may pick
# the randomized solver for a matrix of this shape, which would make the
# components vary between runs.
pca = PCA(n_components=200, random_state=42)
pca.fit(data_tfidf)

# Share of the total variance retained by the kept components.
print('Total Variance: ', sum(pca.explained_variance_ratio_))

data_pc = pca.transform(data_tfidf)
pd.DataFrame(data_pc)

Total Variance:  0.9030647825708786


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.181387,-0.100143,0.200938,-0.081673,0.039254,-0.013669,0.000624,-0.023974,0.020803,0.051596,...,-0.038421,-0.039302,0.031983,-0.038370,0.015136,0.007553,-0.013736,0.089119,0.066295,0.019585
1,-0.063698,-0.032335,0.033231,0.033399,0.035712,0.025147,0.018052,0.002998,0.043760,0.157014,...,-0.022957,0.111837,-0.044980,0.034011,0.027529,-0.061929,0.022731,-0.067267,0.001640,0.015159
2,-0.114146,-0.058994,0.115344,-0.025361,-0.008160,0.023572,0.008868,-0.038497,-0.011977,0.009178,...,-0.028164,0.003159,0.074912,-0.028311,-0.042473,-0.032774,0.014404,0.007798,-0.038052,-0.084498
3,-0.121585,-0.076656,0.131494,-0.045342,-0.034949,0.027078,0.028620,-0.026378,-0.021362,0.040661,...,-0.029123,-0.035085,0.045120,0.008247,0.011360,0.005176,-0.012376,-0.001237,-0.019782,0.012156
4,-0.081335,0.005851,0.052194,0.073333,0.081594,-0.075969,-0.302962,0.578678,0.149967,-0.020976,...,0.033049,0.020162,0.090364,0.053192,-0.112766,0.041583,-0.092422,-0.012924,-0.042952,-0.002383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,0.037523,0.057915,-0.026442,0.056231,0.002768,-0.010611,-0.001080,0.017048,0.057915,-0.039723,...,0.006112,0.025752,-0.016650,-0.027616,-0.015633,-0.004495,0.022181,0.018584,-0.008666,-0.010230
260,-0.141941,0.963268,0.098784,-0.045705,-0.018188,-0.014100,0.010137,-0.015987,-0.022487,-0.015067,...,-0.003068,-0.001106,-0.002759,-0.001834,-0.003469,-0.000341,0.000780,-0.002460,-0.000623,0.000753
261,0.434892,-0.000189,0.051518,-0.137243,0.017153,-0.026002,-0.030151,-0.014079,-0.084019,-0.272144,...,-0.019421,-0.005590,-0.026385,0.082111,0.092897,0.040259,-0.017303,-0.012520,0.007869,-0.016746
262,0.426131,0.010226,0.048964,-0.097321,-0.049223,0.063168,-0.030065,0.049593,-0.123007,0.000517,...,0.010875,0.058438,-0.020102,-0.008643,-0.049832,-0.076775,0.031613,-0.005486,-0.047601,-0.005989


In [161]:
# (number of documents, number of principal components) after the PCA projection
data_pc.shape

(264, 200)