## Course: DSC550
## Assignment: 9.3 Exercise
## Name: Laura Hoffmann
## Date: 5/22/2021

#### Import the Libraries

In [1]:
# pip install -U scikit-learn

In [2]:
import jsonlines

# Word Processing
import re
import sys
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer

import numpy as np
import pandas as pd

import joblib
from sklearn.model_selection import cross_val_score

In [3]:
# Collect every valid JSON object from the JSON Lines file into a list;
# records that are invalid or not dicts are skipped by the reader.
data = []

with jsonlines.open('categorized-comments.jsonl') as reader:
    data.extend(reader.iter(type=dict, skip_invalid=True))

In [4]:
# Number of records read from the JSON Lines file
len(data)

606475

In [5]:
# Build a DataFrame from the list of record dicts
df = pd.DataFrame(data)

In [6]:
# Work on a copy so `df` keeps the raw records; this rebinds `data`
# from the original list to a DataFrame
data = df.copy()

In [7]:
# Translation table for str.translate: every code point whose Unicode
# general category starts with 'P' (punctuation) maps to None (deleted)
punctuation_dict = {
    codepoint: None
    for codepoint in range(sys.maxunicode)
    if unicodedata.category(chr(codepoint)).startswith('P')
}

# English stopwords stored in a hash-based container for fast membership tests
stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)

def cleanText(string):
    '''Normalize a raw text string into a list of non-stopword tokens.

    Steps, in order: lowercase the text, strip URL-like runs beginning
    with "http", delete punctuation using ``punctuation_dict``, turn
    newlines into spaces, split on whitespace, and drop every token
    present in ``stopwords_dict``.
    '''
    lowered = string.lower()
    without_urls = re.sub(r'http\S+', '', lowered)
    without_punctuation = without_urls.translate(punctuation_dict)
    single_line = without_punctuation.replace("\n", " ")

    tokens = []
    for token in single_line.split():
        if token in stopwords_dict:
            continue
        tokens.append(token)

    return tokens

In [8]:
# Replace each raw comment in the txt column with its cleaned token list
data.txt = data.txt.apply(cleanText)

In [None]:
# Stem every token of each cleaned comment with the Porter stemmer and
# store the resulting list in a new txt_stems column
porter = PorterStemmer()
data['txt_stems'] = data.txt.apply(lambda words: [porter.stem(word) for word in words])

In [None]:
# Re-assemble each comment's stemmed tokens into one space-separated string
data['txt_str'] = data.txt_stems.apply(lambda stems: ' '.join(str(stem) for stem in stems))

In [None]:
# Preview the first rows, including the new txt_stems and txt_str columns
data.head()

In [None]:
# Group the comments by their 'cat' label so each category can be sampled separately
cat_group = data.groupby('cat', as_index=False, group_keys=False)

In [None]:
# Draw 25,000 rows without replacement from every category to balance the classes.
# NOTE(review): no random_state is passed, so the sample changes between runs, and
# sample() raises ValueError for any category with fewer than 25,000 rows.
balancedDF = cat_group.apply(lambda s: s.sample(25000, replace=False))

## 1. Neural Network Classifier with Scikit

Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using scikit-learn. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

In [None]:
def documents(corpus):
    """Return everything yielded by ``corpus.reviews()`` as a list."""
    return [review for review in corpus.reviews()]

def continuous(corpus):
    """Return everything yielded by ``corpus.scores()`` as a list."""
    return [score for score in corpus.scores()]

def make_categorical(corpus):
    """
    Bin the corpus scores into integer class labels with ``np.digitize``.

    With the default ``right=False`` the bins are half-open on the right:
        1 (terrible) :  0.0 <= y <  3.0
        2 (okay)     :  3.0 <= y <  5.0
        3 (great)    :  5.0 <= y <  7.0
        4 (amazing)  :  7.0 <= y < 10.1
    Scores below 0.0 map to 0 and scores of 10.1 or more map to 5.
    """
    bin_edges = [0.0, 3.0, 5.0, 7.0, 10.1]
    scores = continuous(corpus)
    return np.digitize(scores, bin_edges)

In [None]:
def train_model(path, model, continuous=True, saveto=None, cv=12):
    """
    Cross-validate ``model`` on the corpus at ``path``, then fit it on all data.

    Parameters
    ----------
    path : location passed to ``PickledReviewsReader``.
    model : estimator or pipeline accepted by ``cross_val_score``.
    continuous : when True, the raw corpus scores are the target and folds
        are scored with R^2; when False, the binned labels from
        ``make_categorical`` are the target and folds are scored with
        weighted F1.
    saveto : optional destination; when given, the fitted model is written
        there with ``joblib.dump``.
    cv : number of cross-validation folds.

    Returns
    -------
    The per-fold scores returned by ``cross_val_score``.
    """
    # load the corpus data and labels
    corpus = PickledReviewsReader(path)
    X = documents(corpus)
    if continuous:
        # The boolean ``continuous`` parameter shadows the module-level
        # continuous() helper inside this function, so calling it here
        # would raise TypeError; read the scores from the corpus directly.
        y = list(corpus.scores())
        scoring = 'r2'
    else:
        y = make_categorical(corpus)
        # The target has more than two classes, so an averaged F1 scorer
        # is required; 'f1_score' is not a registered scorer name.
        scoring = 'f1_weighted'

    # compute cross-validation scores
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)

    # fit the model on the entire dataset
    model.fit(X, y)

    # write to disk only after fitting, so the saved model is usable
    if saveto:
        joblib.dump(model, saveto)

    return scores

In [None]:
if __name__ == '__main__':
    # NOTE(review): transformer and reader are not imported anywhere else in
    # this notebook; presumably local modules from the book's sample code —
    # confirm they are importable from the working directory.
    from transformer import TextNormalizer
    from reader import PickledReviewsReader

    from sklearn.pipeline import Pipeline
    from sklearn.neural_network import MLPRegressor, MLPClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer

    # path to postpreprocessed, part-of-speech tagged review corpus
    cpath = '../review_corpus_proc'

    # Regression pipeline: normalize text, TF-IDF vectorize, then a
    # multi-layer perceptron with hidden layers of 500 and 150 units
    regressor = Pipeline([
        ('norm', TextNormalizer()),
        ('tfidf', TfidfVectorizer()),
        ('ann', MLPRegressor(hidden_layer_sizes=[500,150], verbose=True))
    ])
    # continuous=True selects the raw scores as the regression target
    regression_scores = train_model(cpath, regressor, continuous=True)

    # Classification pipeline with the same preprocessing and layer sizes
    classifier = Pipeline([
        ('norm', TextNormalizer()),
        ('tfidf', TfidfVectorizer()),
        ('ann', MLPClassifier(hidden_layer_sizes=[500,150], verbose=True))
    ])
    # continuous=False selects the binned labels from make_categorical
    classifer_scores = train_model(cpath, classifier, continuous=False)

## 2. Neural Network Classifier with Keras

Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using Keras. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

In [None]:
from keras.models import Sequential
from keras import layers

In [None]:
# Width of the input feature vector expected by the first layer
N_FEATURES = 5000

# Feed-forward network: two fully connected ReLU layers (500 and 150 units)
# followed by a 3-unit softmax output layer
nn = Sequential([
    layers.Dense(units=500, activation="relu", input_shape=(N_FEATURES,)),
    layers.Dense(units=150, activation="relu"),
    layers.Dense(units=3, activation="softmax"),
])

# Compile with cross-entropy loss, the RMSprop optimizer and an accuracy metric
nn.compile(
    loss="categorical_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

In [None]:
# Train neural network
# NOTE(review): within the visible notebook, features_train, target_train,
# features_test and target_test are first assigned in the MNIST section
# further down, so this cell cannot run in order — confirm which data
# (and which preprocessing) was intended here.
history = nn.fit(features_train, # features
                      target_train, # target
                      epochs=3, # three epochs
                      verbose=1, 
                      batch_size=50, # number of observations per batch
                      validation_data=(features_test, target_test)) # test data

In [None]:
from keras.layers import Dense
from keras.models import Sequential

# Input width of the network; also passed as max_features to the TfidfVectorizer
N_FEATURES = 5000
# Number of units in the softmax output layer
N_CLASSES = 4

def build_network():
    """
    Build and compile the feed-forward classifier.

    The model takes N_FEATURES inputs, has ReLU hidden layers of 500 and
    150 units, and ends in a softmax layer with N_CLASSES units. It is
    compiled with categorical cross-entropy loss, the Adam optimizer and
    an accuracy metric, then returned.
    """
    stacked_layers = [
        Dense(500, activation='relu', input_shape=(N_FEATURES,)),
        Dense(150, activation='relu'),
        Dense(N_CLASSES, activation='softmax'),
    ]
    network = Sequential(stacked_layers)
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

In [None]:
if __name__ == '__main__':
    from sklearn.pipeline import Pipeline
    # NOTE(review): transformer is presumably a local module from the book's
    # sample code — confirm it is importable.
    from transformer import TextNormalizer
    from keras.wrappers.scikit_learn import KerasClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Normalize text, build TF-IDF vectors capped at N_FEATURES terms (the
    # network's input width), then classify with the Keras model wrapped as
    # a scikit-learn estimator. The pipeline is only constructed here.
    pipeline = Pipeline([
        ('norm', TextNormalizer()),
        ('vect', TfidfVectorizer(max_features=N_FEATURES)),
        ('nn', KerasClassifier(build_fn=build_network,
                               epochs=200,
                               batch_size=128))
    ])

In [None]:
def train_model(path, model, saveto=None, cv=12):
    """
    Cross-validate a Keras-terminated pipeline, then fit it on the full corpus.

    Reads the corpus at ``path``, computes accuracy over ``cv`` folds, and
    fits ``model`` on all of the data. When ``saveto`` is given it must be
    a dict with the keys 'keras_model' and 'sklearn_pipe': the Keras model
    of the final step is saved to the first location, that step is removed
    from the pipeline, and the remaining pipeline is dumped to the second.
    Returns the cross-validation scores.
    """
    corpus = PickledReviewsReader(path)
    X, y = documents(corpus), make_categorical(corpus)

    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
    model.fit(X, y)

    if not saveto:
        return scores

    # Save the Keras part on its own, then drop it from the pipeline so
    # only the scikit-learn steps are written by joblib.
    _, keras_step = model.steps[-1]
    keras_step.model.save(saveto['keras_model'])
    model.steps.pop()
    joblib.dump(model, saveto['sklearn_pipe'])

    return scores

In [None]:
# Corpus location and the output files for the two saved artifacts
cpath = '../review_corpus_proc'
mpath = {
    'keras_model'  : 'keras_nn.h5',
    'sklearn_pipe' : 'pipeline.pkl'
}
# Cross-validate with 12 folds, fit on all data, and save both artifacts
scores = train_model(cpath, pipeline, saveto=mpath, cv=12)

## 3. Classifying Images

In chapter 20 of the Machine Learning with Python Cookbook, implement the code found in section 20.15 to classify MNIST images using a convolutional neural network. Report the accuracy of your results.

In [None]:
# pip install tensorflow

In [None]:
# pip install keras

In [None]:
# pip install intel-tensorflow

In [None]:
# Load libraries
from keras import backend as K
import numpy as np
from keras.datasets import mnist
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers import Dense, Dropout, Flatten

In [None]:
# use channels-first ordering: the color channel axis comes before height and width
K.set_image_data_format("channels_first")

In [None]:
# seed NumPy's global random number generator
# NOTE(review): this seeds NumPy only; confirm whether the Keras backend
# also needs seeding for reproducible training.
np.random.seed(11)

In [None]:
# image geometry: one color channel, 28 pixels high, 28 pixels wide
channels, height, width = 1, 28, 28

In [None]:
# load the MNIST images and labels, already split into training and test sets
(data_train, target_train), (data_test, target_test) = mnist.load_data()

In [None]:
# reshape training images to (samples, channels, height, width) to match the channels-first format
data_train = data_train.reshape(data_train.shape[0], channels, height, width)

In [None]:
# reshape test images to (samples, channels, height, width) to match the channels-first format
data_test = data_test.reshape(data_test.shape[0], channels, height, width)

In [None]:
# rescale pixel intensity to between 0 and 1
# (assumes raw pixel values range from 0 to 255 — TODO confirm)
features_train = data_train/255
features_test = data_test/255

In [None]:
# one-hot encode target
target_train = np_utils.to_categorical(target_train)
target_test = np_utils.to_categorical(target_test)
# number of classes = number of columns in the one-hot encoded test target
numberofclasses = target_test.shape[1]

In [None]:
# start neural network
network = Sequential()

In [None]:
# add convolutional layer with 64 filters, a 5x5 window and ReLU activation function
# input_shape follows the channels-first (channels, height, width) order used
# when the image arrays were reshaped; the original listed width before height,
# which only worked because both are 28
network.add(Conv2D(filters=64,
                   kernel_size=(5, 5),
                   input_shape=(channels, height, width),
                   activation='relu'))

In [None]:
# add max pooling layer with a 2x2 window to downsample the convolution output
network.add(MaxPooling2D(pool_size=(2,2)))

In [None]:
# add dropout layer with a drop rate of 0.5
network.add(Dropout(0.5))

In [None]:
# add layer to flatten the feature maps so the fully connected layers can use them
network.add(Flatten())

In [None]:
# add a fully connected hidden layer of 128 units with a ReLU activation function
network.add(Dense(128, activation="relu"))

In [None]:
# add a second dropout layer with a drop rate of 0.5
network.add(Dropout(0.5))

In [None]:
# add a fully connected output layer with a softmax activation function,
# one unit per class
network.add(Dense(numberofclasses, activation="softmax"))

In [None]:
# compile the neural network: set the loss, optimizer and reported metric
network.compile(loss="categorical_crossentropy", #cross-entropy
               optimizer="rmsprop", #root mean square propagation
               metrics=["accuracy"]) #accuracy performance metric

In [None]:
# train neural network
# NOTE(review): verbose=0 suppresses the per-epoch log and the returned
# History object is not kept, so the validation accuracy computed here is
# not shown anywhere.
network.fit(features_train, # Features
            target_train, # Target
            epochs=2, # Number of epochs
            verbose=0, # Don't print description after each epoch
            batch_size=1000, # Number of observations per batch
            validation_data=(features_test, target_test)) # Data for evaluation


In [None]:
# report accuracy
from sklearn.metrics import classification_report

In [None]:
# predicted class = index of the largest output value for each test image
predictions = np.argmax(network.predict(features_test), axis=1)
# recover integer labels from the one-hot encoded test targets
y_test = target_test.argmax(axis=1)

In [None]:
# print per-class precision, recall and F1-score, plus overall accuracy
print(classification_report(y_test,predictions))