## Natural Language Processing Studying

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy

import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk

nltk.download('gutenberg')

import en_core_web_sm

nlp = en_core_web_sm.load()


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\genta\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
# Import the data we just downloaded and installed.
from nltk.corpus import gutenberg, stopwords

# Grab and process the raw data.
print(gutenberg.fileids())

persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# Print the first 100 characters of Alice in Wonderland.
print('\nRaw:\n', alice[0:100])

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

Raw:
 [Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was


In [71]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [72]:
# Parse the cleaned novels. This can take a bit.

nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [73]:
# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.head()


Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


In [74]:
sentences.tail()

Unnamed: 0,0,1
6008,"(Her, spring, of, felicity, was, in, the, glow...",Austen
6009,"(Anne, was, tenderness, itself, ,, and, she, h...",Austen
6010,"(His, profession, was, all, that, could, ever,...",Austen
6011,"(She, gloried, in, being, a, sailor, 's, wife,...",Austen
6012,(Finis),Austen


In [77]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    #text_sentence is actual sentences
    df['text_source'] = sentences[1]
    #text_source is a
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [78]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000
Processing row 5500
Processing row 6000


Unnamed: 0,decide,sincere,soon,hippopotamus,gratification,space,crown,tiny,tremulous,sage,...,Hall,cardboard,boon,comfit,dear,elbow,curacy,master,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,"(Oh, dear, !)",Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,"(Oh, dear, !)",Carroll


In [79]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))



Training set score: 0.9714444136401441

Test set score: 0.8553615960099751


In [80]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2') # No need to specify l2 as it's the default. But we put it for demonstration.
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))



(3607, 3088) (3607,)
Training set score: 0.9331854726919878

Test set score: 0.8682460515378221


In [81]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.8616578874410867

Test set score: 0.8383208645054031


In [85]:
# Clean the Emma data.
emma = gutenberg.raw('austen-emma.txt')
emma = re.sub(r'VOLUME \w+', '', emma)
emma = re.sub(r'CHAPTER \w+', '', emma)
emma = text_cleaner(emma)
print(emma[:100])

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to


In [86]:
# Parse our cleaned data.
emma_doc = nlp(emma)

In [87]:
# Group into sentences.
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
emma_sents = [[sent, "Austen"] for sent in emma_doc.sents]

# Emma is quite long, let's cut it down to the same length as Alice.
emma_sents = emma_sents[0:len(alice_sents)]

In [88]:
# Build a new Bag of Words data frame for Emma word counts.
# We'll use the same common words from Alice and Persuasion.
emma_sentences = pd.DataFrame(emma_sents)
emma_bow = bow_features(emma_sentences, common_words)

print('done')

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
done


In [89]:
# Now we can model it!
# Let's use logistic regression again.

# Combine the Emma sentence data with the Alice data from the test set.
X_Emma_test = np.concatenate((
    X_train[y_train[y_train=='Carroll'].index],
    emma_bow.drop(['text_sentence','text_source'], 1)
), axis=0)
y_Emma_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Austen'] * emma_bow.shape[0])])

# Model.
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test set score: 0.6805807622504537


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1631,85
Carroll,795,244


Well look at that!  NLP approaches are generally effective on the same type of material as they were trained on. It looks like this model is actually able to differentiate multiple works by Austen from Alice in Wonderland.  Now the question is whether the model is very good at identifying Austen, or very good at identifying Alice in Wonderland, or both...

# Challenge 0:

Recall that the logistic regression model's best performance on the test set was 93%.  See what you can do to improve performance.  Suggested avenues of investigation include: Other modeling techniques (SVM?), making more features that take advantage of the spaCy information (include grammar, phrases, POS, etc), making sentence-level features (number of words, amount of punctuation), or including contextual information (length of previous and next sentences, words repeated from one sentence to the next, etc), and anything else your heart desires.  Make sure to design your models on the test set, or use cross_validation with multiple folds, and see if you can get accuracy above 90%.  

# Challenge 1:
Find out whether your new model is good at identifying Alice in Wonderland vs any other work, Persuasion vs any other work, or Austen vs any other work.  This will involve pulling a new book from the Project Gutenberg corpus (print(gutenberg.fileids()) for a list) and processing it.

Record your work for each challenge in a notebook and submit it below.

# Challenge 0:

In [90]:
#SVM

from sklearn import svm
svm = svm.SVC()
train = svm.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', svm.score(X_train, y_train))
print('\nTest set score:', svm.score(X_test, y_test))



(3607, 3088) (3607,)
Training set score: 0.7119489880787357

Test set score: 0.7186201163757273


 SVM helped with overfitting but the score was worse. 

In [91]:
def parts_of_speech(df):
    counts = [] # initialize list
    for sentence in df.text_sentence: # loop through rows in df.text_sentence column, intialize vars to zero. 
        verbs = 0
        adverbs = 0
        adjectives = 0
        nouns = 0
        pronouns = 0
        numbers = 0
   
        
        
        length_sentence = len(sentence)
        for token in sentence: #loop through words/tokens in sentence and add one if matched
            if token.pos_ == 'VERB':
                verbs +=1
            elif token.pos_ == 'ADV':
                adverbs +=1
            elif token.pos_ == 'ADJ':
                adjectives +=1
            elif token.pos_ == 'NOUN':
                nouns +=1
            elif token.pos_ == 'PRON':
                pronouns +=1
            elif token.pos_ == 'NUM':
                numbers +=1
                
        counts.append([length_sentence, verbs, adverbs, adjectives, nouns, pronouns, numbers]) # append counts to each variable
    df_pos = pd.DataFrame.from_records(counts, columns=['verbs', 'adverbs', 'adjectives', 
                                                        'nouns', 'pronouns',  
                                                        'numbers', 
                                                        'length_sentence']) # turn lists into columns
    df_concat = pd.concat([df, df_pos], axis =1)
    
    return df_concat
wc_features = parts_of_speech(word_counts)

wc_features.head()

Unnamed: 0,decide,sincere,soon,hippopotamus,gratification,space,crown,tiny,tremulous,sage,...,master,text_sentence,text_source,verbs,adverbs,adjectives,nouns,pronouns,numbers,length_sentence
0,0,0,0,0,0,0,0,0,0,0,...,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll,67,13,3,1,11,4,0
1,0,0,0,0,0,0,0,0,0,0,...,0,"(So, she, was, considering, in, her, own, mind...",Carroll,63,11,7,6,8,4,0
2,0,0,0,0,0,0,0,0,0,0,...,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll,30,5,6,1,2,2,0
3,0,0,0,0,0,0,0,0,0,0,...,0,"(Oh, dear, !)",Carroll,3,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,"(Oh, dear, !)",Carroll,3,0,0,1,0,0,0


In [92]:
Y1 = wc_features['text_source']
X1 = np.array(wc_features.drop(['text_sentence','text_source'], 1))

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, 
                                                    Y1,
                                                    test_size=0.4,
                                                    random_state=42)

lr1 = LogisticRegression()
train1 = lr1.fit(X_train1, y_train1)
print(X_train1.shape, y_train1.shape)
print('Training set score:', lr1.score(X_train1, y_train1))
print('\nTest set score:', lr1.score(X_test1, y_test1))



(3607, 3095) (3607,)
Training set score: 0.9301358469642362

Test set score: 0.8757273482959268


In [93]:
#SVM
from sklearn import svm
svm1 = svm.SVC()
train1 = svm1.fit(X_train1, y_train1)
print(X_train1.shape, y_train1.shape)
print('Training set score:', svm1.score(X_train1, y_train1))
print('\nTest set score:', svm1.score(X_test1, y_test1))



(3607, 3095) (3607,)
Training set score: 0.724701968394788

Test set score: 0.699501246882793


In [94]:
from sklearn.model_selection import GridSearchCV
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [95]:

log_reg_params = {"penalty": ['l1', 'l2'], 
                  'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 
                  'random_state':[42]}

grid_log_reg = GridSearchCV(LogisticRegression(), log_reg_params,  cv=3, verbose=0)
grid_log_reg.fit(X_test1, y_test1)
# logistic regression with the best parameters.
log_reg = grid_log_reg.best_estimator_

In [96]:
#SVC Tuning
%timeit pass

svm_params =  {'C':[1,10,100,1000],
               'gamma':['scale',1,0.1,0.001,0.0001], 
               'kernel':['linear','rbf'], 
               'random_state':[42]}

grid_svm = GridSearchCV(svm.SVC(), svc_params)
grid_svm.fit(X_train1, y_train1)

## SVC best parameters
svm_best = grid_svm.best_estimator_

10 ns ± 1.37 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [98]:
print('Best Parameters:')
print('\n')
print("Logistic Regression:\n",grid_log_reg.best_params_)
print('\n')
print("Support Vector:\n",grid_svm.best_params_)

Best Parameters:


Logistic Regression:
 {'C': 10, 'penalty': 'l2', 'random_state': 42}


Support Vector:
 {'C': 1, 'gamma': 'scale', 'kernel': 'linear', 'random_state': 42}


In [99]:
lr1 = LogisticRegression(C=10, penalty='l2')
train1 = lr1.fit(X_train1, y_train1)
print(X_train1.shape, y_train1.shape)
print('Training set score:', lr1.score(X_train1, y_train1))
print('\nTest set score:', lr1.score(X_test1, y_test1))

(3607, 3095) (3607,)
Training set score: 0.9667313556972553

Test set score: 0.8865336658354115
