In [222]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords,gutenberg
from nltk.stem import WordNetLemmatizer

In [223]:
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

In [224]:
# remove chapter title from text
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)

In [225]:
def text_cleaner(text):
    text = re.sub('[^a-zA-Z]',' ',text)
    text = text.lower()
    text = text.split()
    lmz = WordNetLemmatizer()
    text = [lmz.lemmatize(word) for word in text if not word in set(stopwords.words('english'))]
    text = ' '.join(text)
    return text

In [226]:
# divide texts by sentences
persuasion_sent = nltk.sent_tokenize(persuasion)
alice_sent = nltk.sent_tokenize(alice)

In [227]:
def create_df(text_sentences):    
    sentence = pd.DataFrame()
    sentence["text_sentence"] = text_sentences
    return sentence

In [228]:
# create and combine df for persuasion and alice texts
sent_p = create_df(persuasion_sent)
sent_p["text_source"] = "Austen"
sent_a = create_df(alice_sent)
sent_a["text_source"] = "Carroll"
sent_df = pd.concat([sent_p,sent_a])

In [229]:
sent_df.head(3)

Unnamed: 0,text_sentence,text_source
0,[Persuasion by Jane Austen 1818]\n\n\n\n\n\nSi...,Austen
1,This was the page at which\nthe favourite volu...,Austen
2,"""Walter Elliot, born March 1, 1760, married, J...",Austen


In [230]:
# bag of words
corpus = []
for sent in sent_df["text_sentence"].tolist():
    cleaned_sentence = text_cleaner(sent)
    corpus.append(cleaned_sentence)

In [231]:
# create sparse matrix with corpus (2000 most common words)
from sklearn.feature_extraction.text import CountVectorizer
count_v = CountVectorizer(max_features=2000)

In [232]:
# independent variables (bag of words)
X = count_v.fit_transform(corpus).toarray()

In [233]:
# dependent variable (source of text, author names)
y = sent_df.iloc[:,1].values

In [234]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)

## Random Forest Classifier

In [235]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
train_rfc = rfc.fit(X_train,y_train)
print("Train score: ",rfc.score(X_train,y_train))
print("\nTest score: ",rfc.score(X_test,y_test))

Train score:  0.9803037494067395

Test score:  0.8500948766603416


## Logistic Regression

In [236]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train_lr = lr.fit(X_train,y_train)
print("Train score: ",lr.score(X_train,y_train))
print("\nTest score: ",lr.score(X_test,y_test))

Train score:  0.9437588989084006

Test score:  0.888045540796964


## XGBoost

In [237]:
from xgboost import XGBClassifier as xgc

xg_class = xgc()
xg_class.fit(X_train,y_train)
print("Train score: ",xg_class.score(X_train,y_train))
print("Test score: ",xg_class.score(X_test,y_test))

Train score:  0.8464641670621738
Test score:  0.8320683111954459


  if diff:
  if diff:


## Challenge 0 (SVM & Cross-Validation)

In [238]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train,y_train)
print("Train score ",svc.score(X_train,y_train))
print("Test score ",svc.score(X_test,y_test))

Train score  0.6976744186046512
Test score  0.6821631878557874


In [239]:
# cross val using SVC
#from sklearn.model_selection import cross_val_score
#cvs = cross_val_score(estimator=svc,X=X,y=y,cv=10)

In [240]:
#cvs

## Challenge 1 (Compare model to new book)

In [241]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [242]:
# shakespeare
shake = gutenberg.raw('shakespeare-macbeth.txt')

In [243]:
shake = re.sub(r'CHAPTER .*', '', shake)

In [244]:
shake_sent = nltk.sent_tokenize(shake)

In [245]:
sent_s = create_df(shake_sent)
sent_s["text_source"] = "Shakespeare"

In [246]:
sent_df2 = pd.concat([sent_s,sent_a])

In [247]:
# bag of words
corpus2 = []
for sent in sent_df2["text_sentence"].tolist():
    cleaned_sentence2 = text_cleaner(sent)
    corpus2.append(cleaned_sentence2)

In [248]:
sent_df2.head(3)

Unnamed: 0,text_sentence,text_source
0,[The Tragedie of Macbeth by William Shakespear...,Shakespeare
1,Scoena Prima.,Shakespeare
2,Thunder and Lightning.,Shakespeare


In [249]:
sent_df2["text_source"].unique()

array(['Shakespeare', 'Carroll'], dtype=object)

In [250]:
count_v2 = CountVectorizer(max_features=2000)

In [251]:
X2 = count_v2.fit_transform(corpus2).toarray()

In [252]:
y2 = sent_df2.iloc[:,1].values

In [253]:
X_shake_train,X_shake_test,y_shake_train,y_shake_test = train_test_split(X2,y2,test_size=0.20,random_state=42)

In [254]:
# Random Forest
print("\nTest score: ",rfc.score(X_shake_test,y_shake_test))


Test score:  0.2418831168831169


In [255]:
# Logistic Regression
print("\nTest score: ",lr.score(X_shake_test,y_shake_test))


Test score:  0.05357142857142857


How to test model performance using words from another text?