In [2]:
#Use the spaCy library: tools for performing analysis on natural language
#Create a pipeline to extract word features and classify tweets using Naive Bayes
#Set up a transformer class that just extracts the words in each document; its fit method is an empty function that returns self (to conform to the scikit-learn API)
#Extract each word from the document and record True for every word discovered (words that do not occur are simply absent from the dictionary)
import spacy
from sklearn.base import TransformerMixin

# Create a spaCy parser (loaded once here; BagOfWords.transform reads this module-level name)
# NOTE(review): the 'en' shortcut name was removed in spaCy 3, where the model must be
# loaded by its package name (e.g. 'en_core_web_sm') — confirm the installed spaCy version.
nlp = spacy.load('en')


class BagOfWords(TransformerMixin):
    """Turn raw text documents into binary bag-of-words dictionaries.

    Each document becomes a ``{token_text: True}`` dictionary containing one
    key per distinct non-whitespace token. The output is meant to be fed to
    ``DictVectorizer``, which maps the dictionary keys to matrix columns.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no state to learn.

        Present only so the class satisfies the scikit-learn transformer API
        (``TransformerMixin`` builds ``fit_transform`` from ``fit`` + ``transform``).
        """
        return self

    def transform(self, X):
        """Tokenise every document in ``X``.

        Parameters
        ----------
        X : iterable of str
            The documents (tweets) to tokenise.

        Returns
        -------
        list of dict
            One dictionary per document, in input order, mapping each token's
            text to ``True``.
        """
        results = []
        for document in X:
            # Only tokenisation is needed, so call the tokenizer directly.
            # The spaCy 1.x form `nlp(document, tag=False, parse=False, entity=False)`
            # raises TypeError on spaCy 2 and later, which dropped those keywords.
            tokens = nlp.tokenizer(document)
            # Skip whitespace-only tokens; duplicates collapse into a single key.
            results.append({token.text: True for token in tokens if token.text.strip()})
        return results

In [3]:
#Use the DictVectorizer class to convert the dictionaries into a matrix
#Features in the matrix = keys of the dictionaries, values = occurrence of the feature in the sample
#Words are the keys; a key is only present if the word occurs in the tweet
from sklearn.feature_extraction import DictVectorizer

In [4]:
#Set up classifier and use Naive Bayes
#Use BernoulliNB, contains binary features, adds to pipeline
from sklearn.naive_bayes import BernoulliNB

In [5]:
#Set filenames and loads dataset and classes
#Set filenames for tweets themselves and labels
import os

# Both files live in the same twitter dataset folder under the user's home
# directory; only the final file name differs.
input_filename, labels_filename = (
    os.path.join(
        os.path.expanduser("~"),
        "OneDrive", "Desktop", "Data Mining", "Chapter6HW",
        "Data/Datasets", "twitter", json_name,
    )
    for json_name in ("python_tweets.json", "python_classes.json")
)

In [6]:
#Loads the tweets , only extract the text values
import json

# The tweets file holds one JSON-encoded tweet per line; only the 'text' value is kept.
# JSON text is UTF-8, so the encoding is given explicitly: without it open() falls back
# to the platform default (cp1252 on Windows), which can fail on non-ASCII characters.
tweets = []
with open(input_filename, encoding="utf-8") as inf:
    for line in inf:
        if not line.strip():
            continue  # skip blank lines between records
        tweets.append(json.loads(line)['text'])

# The labels file is a single JSON document loaded in one call.
with open(labels_filename, encoding="utf-8") as inf:
    labels = json.load(inf)

# Ensure only classified tweets are loaded
tweets = tweets[:len(labels)]
# Slicing cannot lengthen the list, so this fails when there are more labels than tweets.
assert len(tweets) == len(labels), "more labels ({}) than tweets ({})".format(len(labels), len(tweets))

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\JRuiz\\OneDrive\\Desktop\\Data Mining\\Chapter6HW\\Data/Datasets\\twitter\\python_classes.json'

In [None]:
#Create a pipeline by putting together the previous components
#BagOfWords transformer, DictVectorizer transformer, BernoulliNB classifier
from sklearn.pipeline import Pipeline

# Steps run in order: tokenise -> vectorise dictionaries -> classify.
# The step names are the keys used later with named_steps.
pipeline = Pipeline([
    ('bag-of-words', BagOfWords()),
    ('vectorizer', DictVectorizer()),
    ('naive-bayes', BernoulliNB()),
])

In [None]:
#F1 score (F-measure): the harmonic mean of precision and recall
#Precision: percentage of the samples predicted to belong to a specific class that were actually in that class
#Recall: percentage of the samples in the dataset that are in a class and were actually predicted to be in that class
#Set the scoring parameter to f1, which scores label 1 as the positive class
import numpy as np
from sklearn.model_selection import cross_val_score

# Cross-validate the whole pipeline on the raw tweets, scoring each fold with F1.
scores = cross_val_score(pipeline, tweets, labels, scoring='f1')
# Report the mean F1 across folds, to three decimal places.
print("Score: {:.3f}".format(scores.mean()))

In [None]:
#Fit the pipeline on all labelled tweets to create a new model
#Pipeline.fit returns the pipeline itself, so `model` refers to the same fitted object as `pipeline`
model = pipeline.fit(tweets, labels)

In [None]:
#Access an individual pipeline step through the named_steps attribute and the name of the step: here the Naive Bayes model
nb = model.named_steps['naive-bayes']
#Keep a reference to the fitted per-class log probabilities of the features (indexed by class, then by feature, in later cells)
feature_probabilities = nb.feature_log_prob_

In [None]:
#Stored as log probabilities log(P(f|A)): the probability of feature f given class A
#argsort sorts ascending, so negate the values to get descending order, then keep the first 50 feature indices
#NOTE(review): row 1 is assumed to be the positive class (label 1) — confirm against nb.classes_
top_features = np.argsort(-nb.feature_log_prob_[1])[:50]

In [None]:
#Map the features' indices to actual values
#Use the DictVectorizer step, which builds the matrices and also records the mapping
#Find the feature names that correspond to the columns
#Extract the vectorizer from the fitted pipeline by its step name
dv = model.named_steps['vectorizer']

In [None]:
#Print rank, feature name and probability for each of the top features
for rank, column in enumerate(top_features):
    # feature_names_ maps a matrix column back to the token it represents
    token = dv.feature_names_[column]
    # Values are stored as logs, so exponentiate to get the probability back
    probability = np.exp(feature_probabilities[1][column])
    print(rank, token, probability)

In [8]:
#joblib was formerly imported via "from sklearn.externals import joblib"; it is now imported as a standalone package
import joblib
# Destination of the pickled model, under the user's home directory.
output_filename = os.path.join(
    os.path.expanduser("~"),
    "OneDrive", "Desktop", "Data Mining", "Chapter7HW",
    "Models", "twitter", "python_context.pkl",
)

In [9]:
# Requires `model` from the pipeline.fit cell; run the notebook top to bottom first.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing the pickle.
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
joblib.dump(model, output_filename)

NameError: name 'model' is not defined