In [None]:
### IMPORT ALL NECESSARY MODULES AND PACKAGES

# TO PLOT DISTRIBUTIONS
import matplotlib.pyplot as plt

# NUMPY
import numpy as np

# ML MODELS
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

# ML TEXT HELPERS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# ML METRICS
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score


In [None]:
# THIS FUNCTION FILTERS AND RETURNS ALL THE REVIEWS' DOCS AND LABELS
def read_documents(fileName):
    """Read a labelled review corpus and return its documents and labels.

    Every non-blank line is stripped and split on single spaces into at most
    four fields. Field 1 is kept as the label and field 3 (the rest of the
    line) as the document text; fields 0 and 2 are ignored.
    NOTE(review): the line layout is presumably
    ``<topic> <sentiment> <id> <text>`` - confirm against the data file.

    Blank or whitespace-only lines (for example a trailing newline at the end
    of the file) are skipped instead of aborting the whole load.

    Args:
        fileName: Path of the UTF-8 encoded corpus file.

    Returns:
        A tuple ``(all_docs, all_labels)`` of two parallel lists of strings.

    Raises:
        IndexError: If a non-blank line has fewer than four space-separated
            fields.
    """
    all_docs = []
    all_labels = []

    with open(fileName, encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Nothing to parse on an empty line; indexing words[3] below
                # would otherwise raise IndexError.
                continue
            words = stripped.split(" ", 3)
            all_docs.append(words[3])
            all_labels.append(words[1])

    return all_docs, all_labels

In [None]:
# LOAD EVERY REVIEW (TEXT + LABEL) FROM THE CORPUS FILE
textFileData = 'all_sentiment_shuffled.txt'
all_docs, all_labels = read_documents(textFileData)

# THE FIRST 80% OF THE REVIEWS FORM THE TRAINING SET, THE REMAINING 20% THE EVALUATION SET
split_point = int(0.80*len(all_docs))
train_docs, eval_docs = all_docs[:split_point], all_docs[split_point:]
train_labels, eval_labels = all_labels[:split_point], all_labels[split_point:]

# DISTINCT LABELS, IN SORTED ORDER
targetTypes = sorted(set(all_labels))

# MAP EACH LABEL TO ITS POSITION IN targetTypes
labelToIndexDict = {label: position for position, label in enumerate(targetTypes)}

# CONVERT LABELS INTO INDICES
def getTargets(labels):
    """Translate labels into their integer class indices.

    Args:
        labels: Iterable of label values that are keys of the module-level
            ``labelToIndexDict``.

    Returns:
        A list with one index per input label, in the same order.

    Raises:
        KeyError: If a label is not present in ``labelToIndexDict``.
    """
    return [labelToIndexDict[sentiment] for sentiment in labels]

# ENCODE THE LABELS OF BOTH SPLITS AS CLASS INDICES (POSITIONS IN targetTypes)
trainTargets, evalTargets = getTargets(train_labels), getTargets(eval_labels)

In [None]:
# BAR CHART OF HOW MANY INSTANCES CARRY EACH LABEL
numberData = [all_labels.count(label) for label in targetTypes]
barGraph = plt.bar(targetTypes, numberData)

# COLOUR EVERY OTHER BAR (STARTING WITH THE FIRST ONE) RED
for position, bar in enumerate(barGraph):
    if position % 2 == 0:
        bar.set_color('r')

plt.title('Number of instances in each class')

# WRITE EACH CLASS COUNT SLIGHTLY ABOVE (+50) AND LEFT (-0.10) OF ITS BAR'S TICK
for position, count in enumerate(numberData):
    tickX = plt.xticks()[0][position]
    plt.text(tickX - 0.10, count + 50, str(count))

plt.show()

In [None]:
# TRAINING FOR NAIVE BAYES MODEL 
def trainNaiveBayes(trainDocs, trainTargets):
    """Fit a multinomial Naive Bayes classifier with default settings.

    Args:
        trainDocs: Feature matrix of the training documents.
        trainTargets: Class index of each training document.

    Returns:
        The fitted ``MultinomialNB`` estimator.
    """
    model = MultinomialNB()
    # fit() returns the estimator itself, so this matches returning the fit() result.
    return model.fit(trainDocs, trainTargets)

# TRAINING FOR BASE DT MODEL 
def trainBaseDT(trainDocs, trainTargets):
    """Fit the baseline decision tree (entropy criterion, other settings default).

    Args:
        trainDocs: Feature matrix of the training documents.
        trainTargets: Class index of each training document.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    tree = DecisionTreeClassifier(criterion="entropy")
    # fit() returns the estimator itself, so this matches returning the fit() result.
    return tree.fit(trainDocs, trainTargets)

# TRAINING FOR BEST DT MODEL 
def trainBestDT(trainDocs, trainTargets):
    """Fit the tuned decision tree.

    NOTE(review): the hyper-parameters below are presumably the ones selected
    by the grid search cell further down in this notebook - confirm.

    Args:
        trainDocs: Feature matrix of the training documents.
        trainTargets: Class index of each training document.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    tunedParams = {
        "criterion": "gini",
        "splitter": "best",
        "max_depth": None,
        "min_samples_split": 4,
        "min_samples_leaf": 1,
    }
    tree = DecisionTreeClassifier(**tunedParams)
    return tree.fit(trainDocs, trainTargets)

# GIVEN THE ML MODEL AND THE EVALUATION SET, MAKE THE PREDICTIONS
def classify(clf,evalDocs):
    """Return the predictions of a fitted model for the given documents.

    Args:
        clf: Any object exposing a ``predict`` method.
        evalDocs: Input passed unchanged to ``clf.predict``.

    Returns:
        Whatever ``clf.predict(evalDocs)`` returns.
    """
    predictions = clf.predict(evalDocs)
    return predictions

# PLOTS THE CONFUSION MATRIX      
def plotConfusionMatrix(clf,prediction):
    """Draw the confusion matrix of a model and return it as an array.

    The plot is produced by ``plot_confusion_matrix`` from ``clf`` evaluated on
    the module-level ``eval_docs_tfidf`` / ``evalTargets``. The returned matrix
    is computed by ``confusion_matrix`` from ``evalTargets`` and ``prediction``.
    Both use the class indices ``0 .. len(targetTypes) - 1`` as label order.

    Args:
        clf: Fitted classifier to plot.
        prediction: Predicted class indices for the evaluation set.

    Returns:
        The confusion matrix returned by ``confusion_matrix``.
    """
    classIndices = list(range(len(targetTypes)))
    plot_confusion_matrix(clf, eval_docs_tfidf, evalTargets, labels=classIndices)
    return confusion_matrix(evalTargets, prediction, labels=classIndices)
        
# RETRIEVE OUTPUT CLASSIFICATION METRICS
def metrics(predicted):
    """Compute per-class precision, recall and F1 against ``evalTargets``.

    Scores are reported for the class indices ``0 .. len(targetTypes) - 1``.
    ``zero_division=0`` is passed, so undefined scores are reported as 0.

    Args:
        predicted: Predicted class indices for the evaluation set.

    Returns:
        A tuple ``(precision, recall, f1)``; the support values are dropped.
    """
    classIndices = list(range(len(targetTypes)))
    precision, recall, f1, _support = precision_recall_fscore_support(
        evalTargets,
        predicted,
        labels=classIndices,
        zero_division=0,
    )
    return precision, recall, f1

# MEASURES THE PREDICTION ACCURACY OF THE MODEL
def accuracy(predicted):
    """Return the accuracy of ``predicted`` against the module-level ``evalTargets``.

    Args:
        predicted: Predicted class indices for the evaluation set.

    Returns:
        The value returned by ``accuracy_score``.
    """
    score = accuracy_score(evalTargets, predicted)
    return score

In [None]:
# TURN THE RAW REVIEWS INTO TF-IDF FEATURE MATRICES.
#
# CountVectorizer LEARNS THE VOCABULARY OF THE TRAINING SET AND COUNTS THE WORDS OF EACH REVIEW:
#   - print(countVect.get_feature_names()) SHOWS THE LEARNED VOCABULARY
#     (get_feature_names_out() ON NEWER SCIKIT-LEARN RELEASES)
#   - countVect.vocabulary_.get('WORD') GIVES THE INDEX OF 'WORD' IN THAT VOCABULARY
#
# HOW TO READ ONE ENTRY OF THE COUNT MATRIX, E.G. |(0, 23)     1|
#   0  -> INDEX OF THE REVIEW (0 IS THE FIRST REVIEW)
#   23 -> INDEX OF THE WORD IN THE VOCABULARY
#   1  -> NUMBER OF TIMES WORD 23 APPEARS IN REVIEW 0
countVect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

### TRAINING SET: VOCABULARY AND TF-IDF WEIGHTS ARE FITTED HERE
train_docs_counts = countVect.fit_transform(train_docs)
train_docs_tfidf = tfidf_transformer.fit_transform(train_docs_counts)

### EVALUATION SET: ONLY TRANSFORMED WITH WHAT WAS FITTED ON THE TRAINING SET
eval_docs_counts = countVect.transform(eval_docs)
eval_docs_tfidf = tfidf_transformer.transform(eval_docs_counts)


In [None]:
# NAIVE BAYES CLASSIFIER: TRAIN, THEN PREDICT THE EVALUATION SET
nb = trainNaiveBayes(train_docs_tfidf, trainTargets)
nbPrediction = classify(nb, eval_docs_tfidf)

# nbMetrics LAYOUT: [CONFUSION MATRIX, PRECISION, RECALL, F1, ACCURACY]
nbMetrics = [plotConfusionMatrix(nb, nbPrediction)]
nbPrecision, nbRecall, nbF1_measure = metrics(nbPrediction)
nbMetrics += [nbPrecision, nbRecall, nbF1_measure, accuracy(nbPrediction)]

In [None]:
# BASE DECISION TREE: TRAIN, THEN PREDICT THE EVALUATION SET
baseDT = trainBaseDT(train_docs_tfidf, trainTargets)
baseDTPrediction = classify(baseDT, eval_docs_tfidf)

# baseDTMetrics LAYOUT: [CONFUSION MATRIX, PRECISION, RECALL, F1, ACCURACY]
baseDTMetrics = [plotConfusionMatrix(baseDT, baseDTPrediction)]
baseDTPrecision, baseDTRecall, baseDTF1_measure = metrics(baseDTPrediction)
baseDTMetrics += [baseDTPrecision, baseDTRecall, baseDTF1_measure, accuracy(baseDTPrediction)]

In [None]:
# BEST DECISION TREE: TRAIN, THEN PREDICT THE EVALUATION SET
bestDT = trainBestDT(train_docs_tfidf, trainTargets)
bestDTPrediction = classify(bestDT, eval_docs_tfidf)

# bestDTMetrics LAYOUT: [CONFUSION MATRIX, PRECISION, RECALL, F1, ACCURACY]
bestDTMetrics = [plotConfusionMatrix(bestDT, bestDTPrediction)]
bestDTPrecision, bestDTRecall, bestDTF1_measure = metrics(bestDTPrediction)
bestDTMetrics += [bestDTPrecision, bestDTRecall, bestDTF1_measure, accuracy(bestDTPrediction)]

In [None]:
def modelToFile(modelType,modelMetrics,prediction):
    """Write the evaluation report of one model to ``<modelType>-<textFileData>``.

    The report contains, in order: a legend mapping class indices to labels,
    the confusion matrix, per-class precision / recall / F1, the overall
    accuracy, and one line per evaluation instance with its predicted class
    (flagged ``[Misclassified]`` when it differs from ``evalTargets``).

    Relies on the module-level names ``textFileData``, ``targetTypes``,
    ``split_point`` and ``evalTargets``.

    Args:
        modelType: Prefix for the output file name.
        modelMetrics: Sequence laid out as
            ``[confusion_matrix, precision, recall, f1, accuracy]`` where the
            three middle entries are indexable per class.
        prediction: Predicted class index for each evaluation instance, in
            evaluation-set order.

    Returns:
        None. The report file is created or overwritten.
    """
    rule = "-----------------------\n"
    perClassSections = (
        ("PRECISION VALUES", modelMetrics[1]),
        ("RECALL VALUES", modelMetrics[2]),
        ("F1-MEASURE VALUES", modelMetrics[3]),
    )

    # The context manager closes the file even if a write below raises.
    # UTF-8 matches the encoding the labels were read with.
    with open(modelType+"-"+textFileData, "w", encoding="utf-8") as file:
        file.write("LEGEND\n" + rule)
        for index, label in enumerate(targetTypes):
            file.write("%s: %s\n" % (index, label))
        file.write("\n\n")

        # WRITE CONFUSION MATRIX
        file.write("CONFUSION MATRIX\n")
        file.write("(row is true label, column is predicted label)\n" + rule)
        file.write(np.array2string(modelMetrics[0]))
        file.write("\n\n\n")

        # WRITE PRECISION, RECALL AND F1-MEASURE VALUES (ONE LINE PER CLASS)
        for heading, values in perClassSections:
            file.write(heading + "\n" + rule)
            for index, label in enumerate(targetTypes):
                # float() accepts numpy scalars and plain Python floats alike
                # and prints the same text as the former .item() call.
                file.write(label + ": " + str(float(values[index])) + "\n")
            file.write("\n\n")

        # WRITE ACCURACY
        file.write("PREDICTION ACCURACY\n" + rule)
        file.write("The model's accuracy is: " + str(float(modelMetrics[4])) + "\n")
        file.write("\n\n")

        # WRITE REVIEW INDEX AND THE PREDICTION OF THE MODEL
        file.write("BELOW ARE ALL THE PREDICTIONS MADE FOR EACH INSTANCES IN THE EVALUATION SET\n(0-indexed)\n--------------\n")
        for offset, pred in enumerate(prediction):
            # The evaluation set starts at split_point in the full corpus, so
            # this is the instance's 0-indexed position among all reviews.
            reviewIndex = split_point + offset
            flag = ' [Misclassified]' if evalTargets[offset] != pred else ''
            file.write('%i, %s%s\n' % (reviewIndex, pred, flag))
    
# WRITE ONE REPORT FILE PER TRAINED MODEL
for reportName, reportMetrics, reportPrediction in (
    ("NaiveBayesClassifier", nbMetrics, nbPrediction),
    ("BaseDT", baseDTMetrics, baseDTPrediction),
    ("BestDT", bestDTMetrics, bestDTPrediction),
):
    modelToFile(reportName, reportMetrics, reportPrediction)

In [None]:
# FIND BEST PARAMETERS USING GRIDSEARCH FOR BEST_DT

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# SINGLE-STEP PIPELINE, SO THE GRID KEYS BELOW ADDRESS THE TREE AS 'dec_tree__<parameter>'
dec_tree = DecisionTreeClassifier()
pipe = Pipeline(steps=[('dec_tree', dec_tree)])

# THE SEARCH RUNS ON THE TF-IDF TRAINING FEATURES AND THEIR CLASS INDICES
X, y = train_docs_tfidf, trainTargets

# CANDIDATE VALUES FOR EACH DECISION TREE HYPER-PARAMETER
parameters = dict(
    dec_tree__criterion=['gini', 'entropy'],
    dec_tree__max_depth=[2, 8, 16, None],
    dec_tree__splitter=['random', 'best'],
    dec_tree__min_samples_split=[2, 4, 8, 16],
    dec_tree__min_samples_leaf=[1, 2, 4, 8],
)

# n_jobs=-1 RUNS THE SEARCH ON ALL AVAILABLE PROCESSORS
clf_GS = GridSearchCV(pipe, parameters, n_jobs=-1)
clf_GS.fit(X, y)

In [None]:
# PRINT THE HYPER-PARAMETERS OF THE BEST ESTIMATOR FOUND BY THE GRID SEARCH
bestEstimator = clf_GS.best_estimator_
bestParams = bestEstimator.get_params()

for caption, paramKey in (
    ('Best criterion:', 'dec_tree__criterion'),
    ('Best max_depth:', 'dec_tree__max_depth'),
    ('Best splitter:', 'dec_tree__splitter'),
    ('Best min_samples_split:', 'dec_tree__min_samples_split'),
    ('Best min_samples_leaf:', 'dec_tree__min_samples_leaf'),
):
    print(caption, bestParams[paramKey])

print()
print(bestParams['dec_tree'])

# ACCURACY OF THE BEST ESTIMATOR ON THE HELD-OUT EVALUATION SET
print('Best estimator accuracy:', np.mean(accuracy(bestEstimator.predict(eval_docs_tfidf))))