In [1]:
!pip install spacy
!python -m spacy download 'en'

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk

nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [0]:
#Loading political theory text 
from urllib import request

plato_url = "http://www.gutenberg.org/cache/epub/150/pg150.txt"
response = request.urlopen(plato_url)
republic = response.read().decode('utf8')

marx_url = "http://www.gutenberg.org/cache/epub/61/pg61.txt"
response2 = request.urlopen(marx_url)
marx = response2.read().decode('utf8')

lev_url = "http://www.gutenberg.org/cache/epub/3207/pg3207.txt"
response4 = request.urlopen(lev_url)
lev = response4.read().decode('utf8')

rou_url = "http://www.gutenberg.org/files/46333/46333-0.txt"
response5 = request.urlopen(rou_url)
jrouss = response5.read().decode('utf8') 

paine_url = "http://www.gutenberg.org/cache/epub/147/pg147.txt"
response7 = request.urlopen(paine_url)
paine = response7.read().decode('utf8')

In [0]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--''
    text = re.sub(r'--',' ',text)
    text = re.sub(r'--',' ',text)
    text = re.sub("[@#+%*:()'-]", ' ', text)
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    
    text = ' '.join(text.split())
    return text
  
     
# Load and clean the data
republic = re.sub(r'Chapter \d+', '', republic)
marx = re.sub(r'Chapter \d+', '', marx)
lev = re.sub(r'Chapter \d+', '', lev)
jrouss = re.sub(r'Chapter \d+', '', jrouss)
paine = re.sub(r'Chapter \d+', '', paine)



# drop the first few words here 

republic = text_cleaner(republic[:9000])
marx = text_cleaner(marx[:9000])
lev = text_cleaner(lev[:9000])
jrouss = text_cleaner(jrouss[:9000])
paine = text_cleaner(paine[:9000])






In [0]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')

republic_doc = nlp(republic)
marx_doc = nlp(marx)
lev_doc = nlp(lev)
jrouss_doc = nlp(jrouss)
paine_doc = nlp(paine)


In [0]:
plato_sents = [[sent, "plato"] for sent in republic_doc.sents]
del plato_sents[0:18]




# Group into sentences.
marx_sents = [[sent, "marx"] for sent in marx_doc.sents]
del marx_sents[0:6]


hob_sents = [[sent, "hobb"] for sent in lev_doc.sents]
del hob_sents[0:18]



jrouss_sents = [[sent, "rouss"] for sent in jrouss_doc.sents]
del jrouss_sents[0:18]


paine_sents = [[sent, 'paine'] for sent in paine_doc.sents]
del paine_sents[0:18]

In [0]:
sentences = pd.DataFrame(plato_sents + jrouss_sents + hob_sents  + marx_sents  + paine_sents )

In [0]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            print("Processing row {}".format(i))
            
    return df

 # Set up the bags.
platowords = bag_of_words(republic_doc)
marxwords = bag_of_words(marx_doc)
jrwords = bag_of_words(jrouss_doc)

hobwords = bag_of_words(lev_doc)
painewords = bag_of_words(paine_doc) 




# Combine bags to create a set of unique words.
common_words = set(platowords + marxwords + jrwords + hobwords)

In [9]:
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250


Unnamed: 0,longer,Jean,problem,LIBERTY,Sons,largeness,confusion,surpass,clearly,spy,exception,USE,//www.gutenberg.org,Polemarchus,Solon,Bunyan,nobility,shine,Enemy,Contract,meet,fervour,largely,conversation,Classics,fragment,Character,find,group,EXTENUATIONS,Brother,condition,openly,heaven,probably,pave,environment,Indian,remark,spirit,...,imaginary,arrangement,contemporary,meaningless,compel,America,basis,italian,servant,21,moderate,downward,morality,reappear,mankind,roman,birth,biblical,certain,G.D.H.,proportion,29,project,Civill,January,B.C.,testimony,submission,Dante,frozen,punctuation,curious,child,manufacturing,Republica,Berkeley,proof,reign,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"(THE, INTRODUCTION, THE, Republic, of, Plato, ...",plato
1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"(There, are, nearer, approaches, to, modern, m...",plato
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"(But, no, other, Dialogue, of, Plato, has, the...",plato
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"(Nowhere, in, Plato, is, there, a, deeper, iro...",plato
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"(Nor, in, any, other, of, his, writings, is, t...",plato


In [0]:
y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))


In [0]:
# Model 3: Random Forest gridsearchcv  
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
rfc = ensemble.RandomForestClassifier()

y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

param_grid = { 
    'n_estimators': [15, 200, 500, 750, 1000, 1500],
    'max_features': [1,2,4,6,7,8],
    'max_depth': [4,5,6,7,8]
}


grid = GridSearchCV(rfc, param_grid, cv=5, verbose=3, n_jobs=-1)

grid.fit(X,y)

# Show the best parameter and best score 
print(grid.best_params_)
print( grid.best_score_)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:   56.1s
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 508 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  6.2min


{'max_depth': 8, 'max_features': 7, 'n_estimators': 15}
0.4115384615384615


[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  7.1min finished


In [0]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split


rfc = ensemble.RandomForestClassifier(n_estimators=15, max_features= 7, max_depth=8)
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))



from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(X, Y ,test_size=0.2, random_state=100)
print('With 20% Holdout: ' + str(rfc.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(rfc.fit(X, Y).score(X, Y)))


# Cross validating using 10 folds  
from sklearn.model_selection import cross_val_score
print(cross_val_score(rfc, X, Y, cv=5))


from sklearn.metrics import classification_report
print('Random Forest report :')
print(classification_report(y_test, rfc.predict(X_test)))

With 20% Holdout: 0.4423076923076923
Testing on Sample: 0.4653846153846154
[0.31481481 0.36538462 0.40384615 0.34615385 0.38      ]
Random Forest report :
              precision    recall  f1-score   support

        hobb       0.45      1.00      0.62        19
        marx       1.00      0.50      0.67        10
       paine       1.00      0.08      0.15        12
       plato       1.00      0.33      0.50         6
       rouss       1.00      0.40      0.57         5

    accuracy                           0.56        52
   macro avg       0.89      0.46      0.50        52
weighted avg       0.80      0.56      0.50        52



## Logistic Regression


In [0]:
# Logistic Regression Gridsearch cv 
from sklearn.linear_model import LogisticRegression


y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

logr = LogisticRegression()

# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = (0.0001,0.001, 0.01, 0.1, 1, 10, 100,1000)

# Create hyperparameter options
parameters = dict(C=C, penalty=penalty)

# Use GS-CV to see which alpha level is best.

logr_grid = GridSearchCV(logr, parameters, cv=5, verbose=1)

#Fit the logistic regression 
logr_grid.fit(X, y)

#return best parameters and best score

print(logr_grid.best_params_)
print(logr_grid.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


{'C': 10, 'penalty': 'l2'}
0.6538461538461539


[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    2.9s finished


In [0]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=10,penalty='l2') # No need to specify l2 as it's the default. But we put it for demonstration.

Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))


from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(X, Y ,test_size=0.4, random_state=100)
print('With 20% Holdout: ' + str(lr.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(lr.fit(X, Y).score(X, Y)))





# Cross validating using 5 folds  
from sklearn.model_selection import cross_val_score
print(cross_val_score(lr, X, Y, cv=5))


from sklearn.metrics import classification_report
print('Logistic Regression Report :')
print(classification_report(y_test, lr.predict(X_test)))



With 20% Holdout: 0.6826923076923077
Testing on Sample: 0.9769230769230769
[0.53703704 0.61538462 0.63461538 0.78846154 0.7       ]
Logistic Regression Report :
              precision    recall  f1-score   support

        hobb       0.90      1.00      0.95        36
        marx       1.00      0.95      0.98        22
       paine       1.00      0.89      0.94        18
       plato       1.00      1.00      1.00        14
       rouss       1.00      0.93      0.96        14

    accuracy                           0.96       104
   macro avg       0.98      0.95      0.97       104
weighted avg       0.97      0.96      0.96       104





## Decision Tree 

In [0]:
# Decision Tree Gridsearch 
from sklearn.tree import DecisionTreeClassifier

Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))


# Initialize the model
decision_tree = DecisionTreeClassifier()

# Set parameters for dtc
params = [{'max_features': [2, 4, 6, 8],
             'max_depth': [2, 4, 6, 8]}]

# Search for the best paramters. 
decision_tree_grid = GridSearchCV(decision_tree, params, cv=5, verbose=1, n_jobs=-1)

# Fit the grid and obtain results
decision_tree_grid.fit(X, Y)

# Return best parameters and best score
print(decision_tree_grid.best_params_)
print(decision_tree_grid.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    4.4s


{'max_depth': 8, 'max_features': 6}
0.36538461538461536


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    6.1s finished


In [0]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))


decision_tree = DecisionTreeClassifier( 
    criterion='entropy',
    max_features=6,
    max_depth=8)


from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(decision_tree.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(decision_tree.fit(X, Y).score(X, Y)))


# Cross validating using 10 folds  
from sklearn.model_selection import cross_val_score
print(cross_val_score(decision_tree, X, Y, cv=10))


from sklearn.metrics import classification_report
print('Decision Tree report :')
print(classification_report(y_test, decision_tree.predict(X_test)))

With 20% Holdout: 0.36538461538461536
Testing on Sample: 0.4269230769230769
[0.32142857 0.32142857 0.35714286 0.32142857 0.33333333 0.34615385
 0.33333333 0.33333333 0.375      0.43478261]
Decision Tree report :
              precision    recall  f1-score   support

        hobb       0.39      1.00      0.56        17
        marx       1.00      0.23      0.38        13
       paine       1.00      0.33      0.50         6
       plato       1.00      0.14      0.25         7
       rouss       1.00      0.22      0.36         9

    accuracy                           0.48        52
   macro avg       0.88      0.39      0.41        52
weighted avg       0.80      0.48      0.43        52



## Naive Bayes


In [0]:
# Native Bayes 

from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split


y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))



# Instantiate our model and Fit our model to the data.
bnb = BernoulliNB()
bnb.fit(X, y)

# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(X, y).score(X, y)))

# Cross validating using 10 folds  
from sklearn.model_selection import cross_val_score
print(cross_val_score(bnb,X , y, cv=10))

#Classification report 
from sklearn.metrics import classification_report
print('Native Bayes Classification report :')
print(classification_report(y_test, bnb.predict(X_test)))

With 20% Holdout: 0.34615384615384615
Testing on Sample: 0.5769230769230769
[0.32142857 0.32142857 0.46428571 0.35714286 0.37037037 0.42307692
 0.45833333 0.375      0.45833333 0.39130435]
Native Bayes Classification report :
              precision    recall  f1-score   support

        hobb       0.42      1.00      0.60        17
        marx       1.00      0.46      0.63        13
       paine       0.00      0.00      0.00         6
       plato       1.00      0.29      0.44         7
       rouss       1.00      0.44      0.62         9

    accuracy                           0.56        52
   macro avg       0.68      0.44      0.46        52
weighted avg       0.70      0.56      0.52        52



  'precision', 'predicted', average, warn_for)


## KNN

In [0]:
# Model 2: KNN gridsearch
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV 

# Initialize the model
knn = KNeighborsClassifier()


y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

# Set parameters for KNN
# List of values to try 
knn_params = [{'n_neighbors': [2,3,5,7,10,15,25,100]}]

#GridSearchCV(estimator, param_grid, scoring=None, fit_params=None, 
    #n_jobs=None, iid=’warn’, refit=True, cv=’warn’, verbose=0, pre_dispatch=‘2*n_jobs’,
    #error_score=’raise-deprecating’, return_train_score=’warn’)

# Search for the best paramters. 
knn_grid = GridSearchCV(knn, knn_params, cv=5, verbose=1, n_jobs=-1)

# Fit the grid and obtain results
knn_grid.fit(X, y)

# Return best parameters and best score
print(knn_grid.best_params_)
print(knn_grid.best_score_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


{'n_neighbors': 10}
0.3576923076923077


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    2.4s finished


In [0]:
# Nearest neighbors model 
knn = KNeighborsClassifier(n_neighbors=10)

y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

knn.fit(X,y)

# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(knn.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(knn.fit(X, y).score(X, y)))

# Cross validating using 10 folds  
from sklearn.model_selection import cross_val_score
print(cross_val_score(knn, X, y, cv=10))

from sklearn.metrics import classification_report
print('KNN report :')
print(classification_report(y_test, knn.predict(X_test)))

With 20% Holdout: 0.15384615384615385
Testing on Sample: 0.6576923076923077
[0.32142857 0.35714286 0.28571429 0.39285714 0.37037037 0.34615385
 0.375      0.375      0.41666667 0.47826087]
KNN report :
              precision    recall  f1-score   support

        hobb       0.61      1.00      0.76        17
        marx       1.00      0.38      0.56        13
       paine       0.38      0.83      0.53         6
       plato       1.00      0.43      0.60         7
       rouss       1.00      0.33      0.50         9

    accuracy                           0.63        52
   macro avg       0.80      0.60      0.59        52
weighted avg       0.80      0.63      0.61        52



## SVM

In [0]:
# SVM model using features 
from sklearn.svm import SVC
svm = SVC(C=10, gamma=.1)


y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

# AttributeError: predict_proba is not available when  probability=False -> ERROR FOR SVM AUC 


svm.fit(X,y)

# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.4, random_state=20)
print('With 20% Holdout: ' + str(svm.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(svm.fit(X, y).score(X, y)))

# Cross validating using 10 folds  
from sklearn.model_selection import cross_val_score
print(cross_val_score(svm,X,y, cv=5))


from sklearn.metrics import classification_report
print('SVM report :')
print(classification_report(y_test, svm.predict(X_test)))



one or two that work 

With 20% Holdout: 0.5192307692307693
Testing on Sample: 0.9769230769230769
[0.37037037 0.53846154 0.57692308 0.61538462 0.66      ]
SVM report :
              precision    recall  f1-score   support

        hobb       0.97      1.00      0.99        35
        marx       1.00      1.00      1.00        23
       paine       1.00      1.00      1.00        15
       plato       1.00      1.00      1.00        13
       rouss       1.00      0.94      0.97        18

    accuracy                           0.99       104
   macro avg       0.99      0.99      0.99       104
weighted avg       0.99      0.99      0.99       104



## Gradient Boosted 

In [0]:
# Parameters to test in gridsearch cv 

params = {'n_estimators': [50, 100, 150, 200, 300, 500,750],
                           
          'max_depth': [4,5,6,7,8],
         }


y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))



# Initialize and fit the model.
gb = ensemble.GradientBoostingClassifier()

# Use the grid
gb_grid = GridSearchCV(gb, params, cv=3, verbose=1, n_jobs=-1)

# Fit the grid
gb_grid.fit(X, y)

# Return best parameters and best score
print(gb_grid.best_params_)
print(gb_grid.best_score_)

Fitting 3 folds for each of 35 candidates, totalling 105 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed: 12.7min finished


{'max_depth': 4, 'n_estimators': 50}
0.5384615384615384


In [0]:
# Gradient Boosted model 
from sklearn import ensemble
gb = ensemble.GradientBoostingClassifier(
              max_features=6,
              max_depth=8)

from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(X, Y ,test_size=0.2, random_state=100)
print('With 20% Holdout: ' + str(gb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(gb.fit(X, Y).score(X, Y)))


# Cross validating using 10 folds  
from sklearn.model_selection import cross_val_score
print(cross_val_score(gb,X, Y, cv=10))


from sklearn.metrics import classification_report
print('Gradient Boosted report :')
print(classification_report(y_test, gb.predict(X_test)))




With 20% Holdout: 0.6538461538461539
Testing on Sample: 0.9692307692307692
[0.39285714 0.42857143 0.42857143 0.53571429 0.59259259 0.65384615
 0.70833333 0.625      0.66666667 0.69565217]
Gradient Boosted report :
              precision    recall  f1-score   support

        hobb       0.83      1.00      0.90        19
        marx       1.00      0.90      0.95        10
       paine       1.00      0.83      0.91        12
       plato       1.00      0.83      0.91         6
       rouss       1.00      1.00      1.00         5

    accuracy                           0.92        52
   macro avg       0.97      0.91      0.93        52
weighted avg       0.94      0.92      0.92        52



## Using tf-idf

In [11]:
#Create vectorizer model in order to get tf-idf for each sentence
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=False, #convert everything to lower casefor
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.Prevents divide-by-zero errors
                            )

#convert from spacy object to string
sentence_list = word_counts['text_sentence'].astype(str)
print(type(sentence_list))

#vectorizer model
text_tfidf = vectorizer.fit_transform(sentence_list)
print(type(text_tfidf))

<class 'pandas.core.series.Series'>
<class 'scipy.sparse.csr.csr_matrix'>


In [12]:
#List of features
features = vectorizer.get_feature_names()

#Shape
n = text_tfidf.shape[0]


tfidf_bysent = [{} for _ in range(0,n)]

#for each sentence, lists the feature words and their tf-idf scores
for i, j in zip(*text_tfidf.nonzero()):
    tfidf_bysent[i][features[j]] = text_tfidf[i, j]

#Show first dictionary
display(tfidf_bysent[3])
print(type(tfidf_bysent))

#Create dataframe for this feature set
tfidf_df = pd.DataFrame(columns=features)
tfidf_df['text_sentence'] = word_counts['text_sentence']
tfidf_df['text_source'] = word_counts['text_source']
tfidf_df.loc[:, features] = 0

counter = 0
for i in tfidf_bysent:
    for k, v in i.items():
        tfidf_df.loc[counter, k] = v
    counter = counter + 1
print('done!')

{'Plato': 0.34897702456055063,
 'deeper': 0.4859239060486708,
 'greater': 0.46034872378959635,
 'power': 0.4405110712512908,
 'wealth': 0.4859239060486708}

<class 'list'>
done!


In [13]:
# SVM gridsearch for best parameters 
from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC
y = tfidf_df['text_source']
X = tfidf_df.drop(['text_sentence','text_source'], 1)

svm = SVC()

# new parameters for this model
svc_params = [{'C': [.000001,.00001,.001,.01,.1,1,10,100], 'gamma': [.0001,.001,.01,.1]}]

# setting up the grid
svc_grid = GridSearchCV(svm, svc_params, cv=3, verbose=1, n_jobs=-1)

#Fit the grid
svc_grid.fit(X,y)

#return best parameters and best score

print(svc_grid.best_params_)
print(svc_grid.best_score_)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    4.0s


{'C': 100, 'gamma': 0.1}
0.6730769230769231


[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:    5.1s finished


In [15]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
svm = SVC(C=100, gamma=.1)

y = tfidf_df['text_source']
X = tfidf_df.drop(['text_sentence','text_source'], 1)

# AttributeError: predict_proba is not available when  probability=False -> ERROR FOR SVM AUC 


svm.fit(X,y)

# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.4, random_state=20)
print('With 20% Holdout: ' + str(svm.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(svm.fit(X, y).score(X, y)))

# Cross validating using 10 folds  
from sklearn.model_selection import cross_val_score
print(cross_val_score(svm,X,y, cv=5))


from sklearn.metrics import classification_report
print('SVM report :')
print(classification_report(y_test, svm.predict(X_test)))

With 20% Holdout: 0.75
Testing on Sample: 0.9769230769230769
[0.59259259 0.78846154 0.69230769 0.84615385 0.74      ]
SVM report :
              precision    recall  f1-score   support

        hobb       0.95      1.00      0.97        35
        marx       1.00      1.00      1.00        23
       paine       1.00      1.00      1.00        15
       plato       1.00      0.92      0.96        13
       rouss       1.00      0.94      0.97        18

    accuracy                           0.98       104
   macro avg       0.99      0.97      0.98       104
weighted avg       0.98      0.98      0.98       104



In [16]:
from sklearn import ensemble
gb = ensemble.GradientBoostingClassifier()


y = tfidf_df['text_source']
X = tfidf_df.drop(['text_sentence','text_source'], 1)

from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.4, random_state=20)
print('With 20% Holdout: ' + str(gb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(gb.fit(X, y).score(X, y)))

# Cross validating using 10 folds  
from sklearn.model_selection import cross_val_score
print(cross_val_score(gb,X, y, cv=5))


from sklearn.metrics import classification_report
print('Gradient Boosted report :')
print(classification_report(y_test, gb.predict(X_test)))

With 20% Holdout: 0.6826923076923077
Testing on Sample: 0.9730769230769231
[0.5        0.55769231 0.63461538 0.80769231 0.72      ]
Gradient Boosted report :
              precision    recall  f1-score   support

        hobb       1.00      0.97      0.99        35
        marx       0.88      1.00      0.94        23
       paine       1.00      1.00      1.00        15
       plato       1.00      0.92      0.96        13
       rouss       1.00      0.94      0.97        18

    accuracy                           0.97       104
   macro avg       0.98      0.97      0.97       104
weighted avg       0.97      0.97      0.97       104



In [0]:
# Gradient Boosted model 
from sklearn import ensemble
gb = ensemble.GradientBoostingClassifier(
              max_features=6,
              max_depth=8)

from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(X, Y ,test_size=0.2, random_state=100)
print('With 20% Holdout: ' + str(gb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(gb.fit(X, Y).score(X, Y)))


# Cross validating using 10 folds  
from sklearn.model_selection import cross_val_score
print(cross_val_score(gb,X, Y, cv=10))


from sklearn.metrics import classification_report
print('Gradient Boosted report :')
print(classification_report(y_test, gb.predict(X_test)))

After BoW to generate features and applying them to supervised learning methods, I found the the ensemble models, Gradient boosted and SVM, were the best performing models and the ones to improve upon. To boost the performance I used TF-IDF for feature generation and then applied them to my choosen SVM and graident boosted models. The tidf method boosted the performance of both models in different ways, with the largest gain in the performance of the holdout group for the SVM model. 