In [44]:
# import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
import scipy
import helpers
import types
from sklearn import metrics

In [5]:
X = pd.read_pickle('X_train.pkl')

In [6]:
y = pd.read_pickle('y_train.pkl')

In [9]:
X

35     Chief Justice Roberts, President Carter, Pres...
13     Fellow-Citizens:     We have assembled to rep...
26     My friends, before I begin the expression of ...
30     Senator Hatfield, Mr. Chief Justice, Mr. Pres...
16     My Fellow-Citizens:     When we assembled her...
31     Mr. Chief Justice, Mr. President, Vice Presid...
21     My Countrymen:     This occasion is not alone...
12     Fellow-Citizens:     Under Providence I have ...
8      Called from a retirement which I had supposed...
17     My fellow-citizens, no people on earth have m...
9      Fellow-Citizens:    Without solicitation on m...
34     My fellow citizens:  I stand here today humbl...
0      Fellow Citizens:   I am again called upon by ...
4      I should be destitute of feeling if I was not...
29     For myself and for our Nation, I want to than...
15     Fellow-Citizens:     In the presence of this ...
19      There has been a change of government. It be...
5      In compliance with an usage coeval with t

In [10]:
y

35     R
13     R
26     R
30     R
16     R
31     R
21     R
12     R
8      W
17     R
9      D
34     D
0      F
4     DR
29     D
15     D
19     D
5     DR
11     R
1      F
24     D
2     DR
33     R
3     DR
32     D
23     D
27     D
10     D
22     D
18     R
25     D
6      D
20     R
7      D
14     R
28     R
Name: 1, dtype: object

In [11]:
le = LabelEncoder()

y = le.fit_transform(y)

In [13]:
y

array([3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 0, 0, 2, 1, 0, 0, 0, 1, 3, 2, 0, 1,
       3, 1, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 3, 3])

In [14]:
# Generate a list of stopwords
stopwords_list = stopwords.words('english') + list(string.punctuation)
# Generate tf-idf vectorization (use sklearn's TfidfVectorizer) for our data
def tfidf(X, y,  stopwords_list, random_state=42):
    '''
    Generate train and test TF-IDF vectorization for our data set
    Parameters
    ----------
    X: pandas.Series object
        Pandas series of text documents to classify
    y : pandas.Series object
        Pandas series containing label for each document
    stopwords_list: list ojbect
        List containing words and punctuation to remove.
    Returns
    --------
    tf_idf_train :  sparse matrix, [n_train_samples, n_features]
        Vector representation of train data
    tf_idf_test :  sparse matrix, [n_test_samples, n_features]
        Vector representation of test data
    y_train : array-like object
        labels for training data
    y_test : array-like object
        labels for testing data
    vectorizer : vectorizer object
        fit TF-IDF vecotrizer object
    '''
    # Generate a train test split of X and y
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
    #This tool removes case punctuation, numbers, and stopwords
    tfidf = TfidfVectorizer(token_pattern=r"([a-zA-Z]+(?:[a-z]+)?)", stop_words=stopwords_list)
    #Fitting and transforming on training set
    tf_idf_train = tfidf.fit_transform(X_train)
    #transforming on testing set
    tf_idf_test = tfidf.transform(X_test)
    #Return
    return tf_idf_train, tf_idf_test, y_train, y_test, tfidf

In [15]:
tf_idf_train, tf_idf_test, y_train, y_test, vectorizer = tfidf(X, y, stopwords_list)

In [16]:
# create a function that takes in a classifier, trains it on our tf-idf vectors,
# and generates train and test predictiions
def classify_text(classifier, tf_idf_train, tf_idf_test, y_train):
    '''
    Train a classifier to identify whether a message is spam or ham
    Parameters
    ----------
    classifier: sklearn classifier
       initialized sklearn classifier (MultinomialNB, RandomForestClassifier, etc.)
    tf_idf_train : sparse matrix, [n_train_samples, n_features]
        TF-IDF vectorization of train data
    tf_idf_test : sparse matrix, [n_test_samples, n_features]
        TF-IDF vectorization of test data
    y_train : pandas.Series object
        Pandas series containing label for each document in the train set
    Returns
    --------
    train_preds :  list object
        Predictions for train data
    test_preds :  list object
        Predictions for test data
    '''
    # a) fit the classifier with our training data
    classifier.fit(tf_idf_train, y_train)
    # b) predict the labels of our train data and store them in train_preds
    train_preds = classifier.predict(tf_idf_train)
    # c) predict the labels of our test data and store them in test_preds
    test_preds = classifier.predict(tf_idf_test)
    # d) return train_preds and test_preds
    return train_preds, test_preds

In [17]:
nb = MultinomialNB()

In [18]:
nb_train_preds, nb_test_preds = classify_text(nb, tf_idf_train,
                                              tf_idf_test, y_train)

In [21]:
helpers.metrics_printout(y_test, nb_test_preds, rf_test_preds)

AttributeError: module 'helpers' has no attribute 'metrics_printout'

In [22]:
accuracy_score(y_train, nb_train_preds)

0.5925925925925926

In [24]:
accuracy_score(y_test, nb_test_preds)

0.1111111111111111

In [38]:
rfc =RandomForestClassifier(n_estimators=200)

In [39]:
rfc_train_preds, rfc_test_preds = classify_text(rfc, tf_idf_train,
                                              tf_idf_test, y_train)

In [40]:
accuracy_score(y_train, rfc_train_preds)

1.0

In [41]:
accuracy_score(y_test, rfc_test_preds)

0.2222222222222222

In [None]:
from sklearn.model_selection import GridSearchCV

cv = GridSearchCV()

In [45]:
# Create the list of alphas: alphas
alphas = np.arange(0,1,0.1)
# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(tf_idf_train, y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(tf_idf_test)
    # Compute accuracy: score
    score = metrics.accuracy_score(y_test, pred)
    return score
# Iterate over the alphas and print the corresponding score
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()

Alpha:  0.0
Score:  0.1111111111111111

Alpha:  0.1
Score:  0.1111111111111111

Alpha:  0.2
Score:  0.1111111111111111

Alpha:  0.30000000000000004
Score:  0.1111111111111111

Alpha:  0.4
Score:  0.1111111111111111

Alpha:  0.5
Score:  0.1111111111111111

Alpha:  0.6000000000000001
Score:  0.1111111111111111

Alpha:  0.7000000000000001
Score:  0.1111111111111111

Alpha:  0.8
Score:  0.1111111111111111

Alpha:  0.9
Score:  0.1111111111111111



  'setting alpha = %.1e' % _ALPHA_MIN)


In [46]:
# Create a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()
# Fit the classifier to the training data
nb_classifier.fit(tf_idf_train, y_train)
# Create the predicted tags: pred
pred = nb_classifier.predict(tf_idf_test)
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)
# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred)
print(cm)

0.1111111111111111
[[0 0 0 4 0]
 [0 0 0 2 0]
 [0 0 0 1 0]
 [0 0 0 1 0]
 [0 0 0 1 0]]


In [49]:
print(tf_idf_train)

  (0, 9)	0.01582304685787793
  (0, 17)	0.026451448533411803
  (0, 24)	0.015111275922459927
  (0, 37)	0.022272790010323062
  (0, 52)	0.01582304685787793
  (0, 60)	0.048196300184983706
  (0, 71)	0.052902897066823606
  (0, 76)	0.029768236397967787
  (0, 81)	0.044545580020646125
  (0, 90)	0.03827393244461992
  (0, 113)	0.026451448533411803
  (0, 128)	0.020781362227935866
  (0, 131)	0.036786166869385226
  (0, 133)	0.024098150092491853
  (0, 136)	0.024098150092491853
  (0, 138)	0.048196300184983706
  (0, 139)	0.017464574363379875
  (0, 141)	0.026451448533411803
  (0, 145)	0.01660270370484712
  (0, 146)	0.01582304685787793
  (0, 150)	0.024098150092491853
  (0, 157)	0.026451448533411803
  (0, 165)	0.015111275922459927
  (0, 180)	0.029768236397967787
  (0, 184)	0.029768236397967787
  :	:
  (26, 6649)	0.025130424248545478
  (26, 6652)	0.033952053710905
  (26, 6661)	0.02241685048267132
  (26, 6678)	0.02606389298978702
  (26, 6679)	0.02858848965367639
  (26, 6686)	0.019396247850532695
  (26, 6703)

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=53)

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Initialize a TfidfVectorizer object: tfidf_vectorizer
tf_idf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)
# Transform the training data: tfidf_train 
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)
# Transform the test data: tfidf_test 
tf_idf_test = tf_idf_vectorizer.transform(X_test)
# Print the first 10 features
print(tf_idf_vectorizer.get_feature_names()[:10])
# Print the first 5 vectors of the tfidf training data
print(tf_idf_train.A[:5])

['000', '000people', '100', '15th', '16', '1774', '1776', '1778', '1780', '1787']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [54]:
# Create a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()
# Fit the classifier to the training data
nb_classifier.fit(tf_idf_train, y_train)
# Create the predicted tags: pred
pred = nb_classifier.predict(tf_idf_test)
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)
# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred)
print(cm)

0.5
[[1 0 1 0]
 [2 0 0 0]
 [0 0 3 0]
 [1 0 0 0]]


In [55]:
# Create the list of alphas: alphas
alphas = np.arange(0,1,0.1)
# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(tf_idf_train, y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(tf_idf_test)
    # Compute accuracy: score
    score = metrics.accuracy_score(y_test, pred)
    return score
# Iterate over the alphas and print the corresponding score
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()

Alpha:  0.0
Score:  0.375

Alpha:  0.1
Score:  0.375

Alpha:  0.2
Score:  0.5

Alpha:  0.30000000000000004
Score:  0.5

Alpha:  0.4
Score:  0.5

Alpha:  0.5
Score:  0.5

Alpha:  0.6000000000000001
Score:  0.5

Alpha:  0.7000000000000001
Score:  0.5

Alpha:  0.8
Score:  0.5

Alpha:  0.9
Score:  0.5



  'setting alpha = %.1e' % _ALPHA_MIN)


In [56]:
# Create a Multinomial Naive Bayes classifier: nb_classifier
rfc_classifier = RandomForestClassifier()
# Fit the classifier to the training data
rfc_classifier.fit(tf_idf_train, y_train)
# Create the predicted tags: pred
pred = rfc_classifier.predict(tf_idf_test)
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)
# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred)
print(cm)

0.5
[[2 0 0 0]
 [2 0 0 0]
 [1 0 2 0]
 [0 0 1 0]]


In [57]:
from sklearn.model_selection import GridSearchCV
# # random forest
g4 = {
    'max_depth': [ 2, 5, 10 ],
    'n_estimators': [ 100, 1000, 2000]
}
gs4 = GridSearchCV(RandomForestClassifier(), g4)
gs4.fit(tf_idf_train, y_train)



GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 5, 10],
                         'n_estimators': [100, 1000, 2000]})

In [59]:
best_rfc = gs4.best_estimator_
best_rfc

RandomForestClassifier(max_depth=2, n_estimators=1000)

In [60]:
# Create the predicted tags: pred
pred = best_rfc.predict(tf_idf_test)
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)
# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred)
print(cm)

0.375
[[2 0 0 0]
 [2 0 0 0]
 [2 0 1 0]
 [0 0 1 0]]


In [61]:
# Create a Multinomial Naive Bayes classifier: nb_classifier
rfc_classifier = RandomForestClassifier(n_estimators=1000, max_depth=2)
# Fit the classifier to the training data
rfc_classifier.fit(tf_idf_train, y_train)
# Create the predicted tags: pred
pred = rfc_classifier.predict(tf_idf_test)
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)
# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred)
print(cm)

0.5
[[2 0 0 0]
 [2 0 0 0]
 [1 0 2 0]
 [0 0 1 0]]


In [64]:
# Import the necessary modules
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Initialize a CountVectorizer object: count_vectorizer
count_vectorizer = CountVectorizer(stop_words="english")
# Transform the training data using only the 'text' column values: count_train 
count_train = count_vectorizer.fit_transform(X_train)
# Transform the test data using only the 'text' column values: count_test 
count_test = count_vectorizer.transform(X_test)
# Print the first 10 features of the count_vectorizer
print(count_vectorizer.get_feature_names()[:10])

['000', '000people', '100', '15th', '16', '1774', '1776', '1778', '1780', '1787']


In [65]:
# Import the necessary modules
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()
# Fit the classifier to the training data
nb_classifier.fit(count_train, y_train)
# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)
# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred)
print(cm)

0.5
[[1 0 1 0]
 [2 0 0 0]
 [0 0 3 0]
 [1 0 0 0]]


In [67]:
# Import the necessary modules
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
rfc_classifier = RandomForestClassifier(n_estimators=1000)
# Fit the classifier to the training data
rfc_classifier.fit(count_train, y_train)
# Create the predicted tags: pred
pred = rfc_classifier.predict(count_test)
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)
# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred)
print(cm)

0.375
[[2 0 0 0]
 [2 0 0 0]
 [2 0 1 0]
 [0 0 1 0]]


In [70]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/jonhickey/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [92]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
# Initialize a CountVectorizer object: count_vectorizer
count_vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(),
                                strip_accents = 'unicode', # works 
                                stop_words = 'english', # works
                                lowercase = True, # works
                                max_df = 0.5, # works
                                min_df = 10)
# Transform the training data using only the 'text' column values: count_train 
count_train = count_vectorizer.fit_transform(X_train)
# Transform the test data using only the 'text' column values: count_test 
count_test = count_vectorizer.transform(X_test)
# Print the first 10 features of the count_vectorizer
print(count_vectorizer.get_feature_names()[:10])

  'stop_words.' % sorted(inconsistent))


['?', 'abroad', 'advance', 'affair', 'age', 'ago', 'almighty', 'ancient', 'ask', 'assume']


In [93]:
# Import the necessary modules
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()
# Fit the classifier to the training data
nb_classifier.fit(count_train, y_train)
# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)
# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred)
print(cm)

0.625
[[2 0 0 0 0]
 [1 0 1 0 0]
 [0 0 0 0 0]
 [0 0 0 3 0]
 [0 0 1 0 0]]


In [83]:
from sklearn.preprocessing  import LabelEncoder

decoded_array = le.inverse_transform(pred)
print(decoded_array)

['D' 'R' 'R' 'D' 'R' 'R' 'R' 'D']


In [84]:
final_test = pd.read_pickle('X_test.pkl')

In [87]:
final_test

0       Fellow-Citizens of the Senate and of the Hou...
13      My Countrymen:     When one surveys the worl...
8      Citizens of the United States:     Your suffr...
1       Proceeding, fellow citizens, to that qualifi...
15     THE PRICE OF PEACE     Mr. Chairman, Mr. Vice...
5      Elected by the American people to the highest...
20     President Clinton, distinguished guests and m...
11     Fellow-Citizens:     In obedience to the will...
3      I shall not attempt to describe the grateful ...
4       Fellow-Citizens:   About to undertake the ar...
17     Mr. Vice President, Mr. Speaker, Mr. Chief Ju...
12      My Fellow Citizens:     The four years which...
18     Senator Mathias, Chief Justice Burger, Vice P...
16     Vice President Johnson, Mr. Speaker, Mr. Chie...
2       March 4, 1809   Unwilling to depart from exa...
9      Fellow-Citizens:     There is no constitution...
21     Vice President Biden, Mr. Chief Justice,  mem...
7       At this second appearing to take the  oa

In [101]:
# Transform the test data using only the 'text' column values: count_test 
final_test = count_vectorizer.transform(final_test)

In [102]:
preds = nb_classifier.predict(final_test)

In [104]:
from sklearn.preprocessing  import LabelEncoder

decoded_array = le.inverse_transform(preds)
print(decoded_array)

['DR' 'R' 'R' 'DR' 'D' 'F' 'D' 'R' 'F' 'F' 'R' 'R' 'D' 'R' 'F' 'F' 'D' 'D'
 'D' 'R' 'R' 'R']


In [129]:
with open('sr_jh_predictions.pkl', 'wb') as f:
    pkl.dump(decoded_array, f)

In [115]:
df = pd.DataFrame(decoded_array).astype(str)
df[0] = df[0].astype(str)

In [119]:
df.to_pickle('sr_jh_predictions.pkl')

In [124]:
type(df[0][0])

str

In [126]:
import pickle as pkl
import numpy as np

arrayInput = decoded_array #Trial input
save = True
load = True

fileName = 'sr_jh_predictions.pkl'
fileObject = open(fileName, 'wb')

if save:
    pkl.dump(arrayInput, fileObject)
    fileObject.close()

if load:
    fileObject2 = open(fileName, 'wb')
    modelInput = pkl.load(fileObject2)
    fileObject2.close()

if arrayInput == modelInput:
    Print(True)

UnsupportedOperation: read

In [None]:
import pickle as pkl

pkl.dump(decoded_array)