In [67]:
import csv
import nltk
import numpy as np
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer

In [68]:
# These are required downloads for nltk stopwords, tokenizer and lemmatizer.
# If you are running this code more than once, you can make this "if False" so that you do not download this data again.
if True:
    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     E:\Users\laoko\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     E:\Users\laoko\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     E:\Users\laoko\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [69]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path, encoding="utf8") as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "DOC_ID":  # skip the header
                continue
            # Here and in splitData function, I have added the three other categories of features
            # to be parsed in for Q5.
            (Id, Text, Label, Verified_Purchase, Product_Category, Review_Title) = parseReview(line)
            rawData.append((Id, Text, Label, Verified_Purchase, Product_Category, Review_Title))
# Here I have fed in more features for review by unpacking more variables in the tuple that parseReview returns.
# I have also made sure to implement these changes below in the splitData function.

def splitData(percentage):
    # A method to split the data between trainData and testData 
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (Id, Text, Label, Verified_Purchase, Rating, Product_title) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text, Verified_Purchase, Rating, Product_title)),Label))
    for (Id, Text, Label, Verified_Purchase, Rating, Product_title) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text, Verified_Purchase, Rating, Product_title)),Label))

# Additional features added variations and results
I tried all of the other features for each document in the reviews text file including rating,
verified purchase, product category, product ID, product title, review title. I tested these individually alongside the initial 3 features of the review that we were testing. The results I obtained were as follows:
1. (ID, Review Text, Label) + Verified Purchase resulted in an average accuracy score of 81.2% 
2. (ID, Review Text, Label) + Rating resulted in an average accuracy score of 66.6% 
3. (ID, Review Text, Label) + Product Category resulted in an average accuracy score of 65.7% 
4. (ID, Review Text, Label) + Product ID resulted in an average accuracy score of 65.3% 
5. (ID, Review Text, Label) + Product Title resulted in an average accuracy score of 65.9% 
6. (ID, Review Text, Label) + Review Title resulted in an average accuracy score of 65.6% 

When combining three additional features together, I found that the score with Verified Purchase, Rating, and Product Title gave the best average accuracy score of 82.5%. 

However, some other combinations I tried and recorded the results for included:
1. (ID, Review Text, Label) + Verified Purchase + Rating + Review Title resulted in an average accuracy score of 81.4%.
2. (ID, Review Text, Label) + Verified Purchase + Rating + Product ID resulted in an average accuracy score of 81.2%.
3. (ID, Review Text, Label) + Verified Purchase + Rating + Product Category resulted in an average accuracy score of 81.3%.
3. (ID, Review Text, Label) + Verified Purchase + Product Title + Review Title resulted in an average accuracy score of 82.5%. (This is the same as the score I obtained when using Verified Purchase, Rating, and Product Title however, I chose to keep the latter in the final code submission as I found I had less fluctuation in the results I was achieving when running Cross Validate multiple times.)

In [70]:
# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Returns a tuple of 6 including: an integer (DOC_ID), a string containing the review (REVIEW_TEXT),
    # a string indicating the label (LABEL), a string indicating whether the purchase was verified for the review (VERIFIED_PURCHASE),
    # a string of a number between 1-5 indicating the rating (RATING), and a string indicating the product title (PRODUCT_TITLE).   
    
    # Here I am using an if condition to change __label1__ to fake and __label2__ to real,
    # for the label in the triple.
    if reviewLine[1] == "__label1__":
        reviewLine[1] = fakeLabel
    else:
        reviewLine[1] = realLabel
    return (int(reviewLine[0]), reviewLine[8], reviewLine[1], reviewLine[3], str(reviewLine[2]), reviewLine[6])

# PreProcessing variations and results
All tests were carried out over 5 runs of the cross validate function and recorded average accuracy. N.B. I carried these tests out before adding the extra features like rating etc so that I could see the effect of these changes in pre processing on the actual review text itself.
1. word tokenize alone yielded accuracy score of 65.11%.
2. word tokenize + lemmatization + removing all punctuation yielded 64.71% accuracy.
3. word tokenize + punctuation removed gave 64.5% accuracy.
4. word tokenize + lemmatization gave 64.9% accuracy.
5. word tokenize + stop word removal yielded 64.8% accuracy.
6. word tokenize + porter stemmer gave the best accuracy of 65.2% on average and therefore, I stuck with this as my final preprocessing function.

My methods of implementing lemmatization, stop word removal and punctuation removal will be left in a comment block under preProcess function.

In [71]:
# Input: a string of one review
def preProcess(text, Verified_Purchase, Rating, Product_title):
    # The code here is tokenizing a string to split off punctuation other than periods.
    # I have also used WordNetLemmatizer from NLTK to lemmatize for nouns and verbs. 
    # Also some simple normalisation by lowercasing all words in the list as well as removing all punctuation
    # of tokens by using list comprehension.
    
    # New text is the review text from parse review with verified purchase, product category, and 
    # review title added to it so that these can also become features of the text and will also have a
    # weight when they become feature vectors in toFeatureVector.
    new_text = text + " " + Verified_Purchase + " " + Rating +  " " + Product_title + " "
    tokens = word_tokenize(new_text)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(t) for t in tokens]
    tokens = [token.lower() for token in tokens] 
    return tokens

In [72]:
#     lemma = WordNetLemmatizer()
#     tokens = [lemma.lemmatize(t, pos = "v") for t in tokens] # Lemmatizing for verbs.
#     tokens = [lemma.lemmatize(t, pos = "n") for t in tokens] # Lemmatizing for nouns.
#     stop_words = set(stopwords.words('english')) 
#     tokens = [token for token in tokens if not token in stop_words]
#     tokens = [token for token in tokens if token.isalpha()]

In [73]:
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    featureVec = {}

    for w in tokens:
        try:
            featureVec[w] += 1.0/len(tokens)
        except KeyError:
            featureVec[w] = 1.0/len(tokens)
        try:
            featureDict[w] += 1.0/len(tokens)
        except KeyError:
            featureDict[w] = 1.0/len(tokens)
    
    # Using bigrams did not improve the results in any combonation of features.
    if False:
        # just get bigram binary presence or not
        for i in range(1, len(tokens)):
            bigram = tokens[i-1] + " " + tokens[i]
            try:
                featureVec[bigram] = 1 #+= 1.0/len(tokens)
            except KeyError:
                featureVec[bigram] = 1 #= 1.0/len(tokens)
            try:
                featureDict[bigram] += 1.0
            except KeyError:
                featureDict[bigram] = 1.0
                
    return featureVec

# SVM parameter tuning variations and results 
Here I tried changing the regularization parameter C of the svm from 1.0 (default) to 0.9, 0.8 and so on. In theory a lower C should produce a larger margin-seperating hyperplane however, I did not find this to be useful for this particular datasetand didn't get any better results than when it was left at default. My results were as follows:
1. C = 0.9 yielded accuracy score of 64.9%
2. C = 0.8 yielded accuracy score of 64.8%
3. C = 0.7 yielded accuracy score of 64.8%
4. C = 0.6 yielded accuracy score of 64.5%
5. C = 0.5 yielded accuracy score of 64.3%
6. C = 0.4 yielded accuracy score of 64.1%
7. C = 0.3 yielded accuracy score of 63.8%
8. C = 0.2 yielded accuracy score of 63.3%
9. C = 0.1 yielded accuracy score of 62.5%

I have also made the tolerance smaller to 1e-5 from 1e-4 (the default) which gave an average accuracy score of 65.2% too however, there was less fluctuation in the results I obtained when using a smaller tolerance compared to the default. This was the opposite for when I made the tolerance larger (1e-3) as the results were also around the 65.2% mark on average however, there was more fluctuation in between runs of the crossValidate function.

In [74]:
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC(tol = 1e-5))]) 
    return SklearnClassifier(pipeline).train(trainData)

In [75]:
def crossValidate(dataset, folds):
    shuffle(dataset)
    results = []
    foldSize = int(len(dataset)/folds)
    
    for i in range(0,len(dataset),int(foldSize)):
        print("Fold start on items %d - %d" % (i, i+foldSize))
        myTestData = dataset[i:i+foldSize]
        myTrainData = dataset[:i] + dataset[i+foldSize:]
        classifier = trainClassifier(myTrainData)
        y_true = [x[1] for x in myTestData]
        y_pred = predictLabels(myTestData, classifier)
        print(len(myTestData))
        results.append(precision_recall_fscore_support(y_true, y_pred, average='weighted'))
        
    print(zip(*results))
    avgResults = [np.mean([x[0] for x in results]),
                   np.mean([x[1] for x in results]),
                   np.mean([x[2] for x in results])
                ]
    return avgResults

In [76]:
def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

In [77]:
# Initialize global lists that will be appended to by the methods below
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
trainData = []        # the pre-processed training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the pre-processed test data as a percentage of the total dataset (currently 20%, or 4200 samples)

fakeLabel = 'fake'
realLabel = 'real'

reviewPath = 'amazon_reviews.txt'

# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# We do the cross validation on the 80% (training data)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')

splitData(0.8)

# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

crossValidate(trainData, 10)

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
53255
Fold start on items 0 - 1680
Training Classifier...
1680
Fold start on items 1680 - 3360
Training Classifier...
1680
Fold start on items 3360 - 5040
Training Classifier...
1680
Fold start on items 5040 - 6720
Training Classifier...
1680
Fold start on items 6720 - 8400
Training Classifier...
1680
Fold start on items 8400 - 10080
Training Classifier...
1680
Fold start on items 10080 - 11760
Training Classifier...
1680
Fold start on items 11760 - 13440
Training Classifier...
1680
Fold start on items 13440 - 15120
Training Classifier...
1680
Fold start on items 15120 - 16800
Training Classifier...
1680
<zip object at 0x0000017DF0D94180>


[0.8273395937613269, 0.8239285714285713, 0.8234925000491996]

# Evaluate on test set

In [78]:
functions_complete = True
if functions_complete:
    classifier = trainClassifier(trainData)  # train the classifier
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData, classifier)  # classify the test data to get predicted labels
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])

Training Classifier...
Done training!
Precision: 0.821190
Recall: 0.816190
F Score:0.815472
