In [1]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support
import re
import nltk

## Updating the loadData and splitData functions
The loadData and splitData functions were updated so the verified purchase, rating and category of each review were included as features within the dataset.

In [2]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    """Read the tab-separated review file at `path` and append each review to rawData.

    Every row except the header (first column equal to "DOC_ID") is passed
    through parseReview and stored in the module-level rawData list as an
    (Id, Text, Label, Verified, Rating, Category) tuple.

    path -- location of the tab-delimited review file.
    Text -- unused; kept only so existing calls keep working.
    """
    # newline='' is how the csv module asks for files to be opened, so that
    # line endings inside fields are left for the reader to interpret.
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used -- confirm the file's encoding before pinning one.
    with open(path, newline='') as f:
        for line in csv.reader(f, delimiter='\t'):
            if not line:             # a blank line is read as [], which has no line[0]
                continue
            if line[0] == "DOC_ID":  # skip the header
                continue
            (docId, reviewText, label, verified, rating, category) = parseReview(line)
            rawData.append((docId, reviewText, label, verified, rating, category))

def splitData(percentage):
    """Split rawData into trainData and testData, taking samples from each half.

    The first `percentage` share of each half of rawData goes to trainData
    and the remainder of each half goes to testData. Every sample is stored
    as a (feature vector, label) pair.
    NOTE(review): taking the same share from both halves presumably relies on
    rawData holding one class per half -- confirm against the data file.
    """
    midpoint = int(len(rawData)/2)
    perHalf = int((percentage*len(rawData))/2)

    def appendSamples(rows, target):
        # Turn each raw review into a (feature vector, label) pair; the id is not needed.
        for (_, Text, Label, Verified, Rating, Category) in rows:
            features = toFeatureVector(preProcess(Text), Verified, Rating, Category)
            target.append((features, Label))

    # Training rows are processed first, so they are the first to claim indices in featureDict.
    appendSamples(rawData[:perHalf] + rawData[midpoint:midpoint+perHalf], trainData)
    appendSamples(rawData[perHalf:midpoint] + rawData[midpoint+perHalf:], testData)
        

# Question 1

The parseReview function was also updated to extract the required information from the data.

In [3]:
# Convert a line from the input file into an (id, text, label, verified, rating, category) tuple
def parseReview(reviewLine):
    """Pick the fields used by the classifier out of one row of the review file.

    reviewLine is the list of column values for a single review. The id is
    column 0, the raw label column 1, the rating column 2, the verified flag
    column 3, the category column 4 and the review text the last column.
    The raw label "__label1__" is mapped to "fake"; anything else to "real".

    Returns (doc_id, review_text, label, verified, rating, category), with
    every field other than the label returned exactly as read.
    """
    label = "fake" if reviewLine[1] == "__label1__" else "real"
    return (
        reviewLine[0],    # doc_id
        reviewLine[-1],   # review_text
        label,
        reviewLine[3],    # verified
        reviewLine[2],    # rating
        reviewLine[4],    # category
    )


In [4]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.util import ngrams
from nltk.stem.wordnet import WordNetLemmatizer

# Input: a string of one review
def preProcess(text):
    """Turn one review string into a list of lower-cased tokens.

    The text is split with NLTK's WordPunctTokenizer and each token is
    lower-cased. Punctuation is kept as tokens of its own.

    Earlier experiments that were left commented out in this function
    (nltk.word_tokenize, stop-word removal, alphanumeric filtering, bigrams
    and WordNet lemmatisation) have been removed: none of them contributed to
    the returned value, and word_tokenize was still being run on every review
    only for its result to be thrown away.
    """
    return [token.lower() for token in WordPunctTokenizer().tokenize(text)]



# Question 2
The new features did not require preprocessing and underwent feature vectorisation.

In [5]:
featureDict = {} # A global dictionary of features

# Value stored for the verified-purchase feature, keyed by the raw flag.
# NOTE(review): a verified purchase ('Y') is encoded as 0 and a non-verified
# one ('N') as 1.1; these are the values the reported results were produced
# with, so they are kept as they were.
_VERIFIED_VALUES = {'Y': 0, 'N': 1.1}


def _featureIndex(name):
    """Return the index of feature `name`, assigning the next free index on first use."""
    if name not in featureDict:
        featureDict[name] = len(featureDict) + 1
    return featureDict[name]


def toFeatureVector(tokens, verified, rating, category):
    """Build the sparse feature vector for one review.

    tokens   -- list of tokens from preProcess; each contributes its relative
                frequency (count / number of tokens) in this review.
    verified -- 'Y' or 'N'; stored under the 'VP' feature (see _VERIFIED_VALUES).
                Any other value adds no 'VP' entry.
    rating   -- converted with int() and stored under the 'RATING' feature.
    category -- used as a feature name of its own, with value 1.

    Returns a dict mapping feature index -> value. Indices come from the
    module-level featureDict, which gains an entry for every feature name
    seen for the first time.
    """
    localDict = {}

    # Each occurrence of a token adds the same share, so the token entries
    # sum to the token's relative frequency within the review.
    share = 1.0/len(tokens) if tokens else 0.0
    for token in tokens:
        index = _featureIndex(token)
        localDict[index] = localDict.get(index, 0) + share

    # Both flags share a single 'VP' index. Previously a first-seen 'Y'
    # recorded index 0 in featureDict while writing the value elsewhere, so
    # later reviews put the feature on a different index.
    if verified in _VERIFIED_VALUES:
        localDict[_featureIndex('VP')] = _VERIFIED_VALUES[verified]

    # The rating was then included in the features
    localDict[_featureIndex('RATING')] = int(rating)

    # The last feature added was the category of the item.
    localDict[_featureIndex(category)] = 1

    return localDict
# toFeatureVector(preProcess(dog[1]), dog[3], dog[4], dog[5])
# featureDict

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    """Fit a linear SVM on (feature dict, label) pairs and return the trained classifier.

    The LinearSVC is placed in a one-step scikit-learn Pipeline and wrapped
    in NLTK's SklearnClassifier, whose train() result is returned.
    """
    print("Training Classifier...")
    svm = LinearSVC(dual=True, max_iter=20000)
    wrapped = SklearnClassifier(Pipeline([('svc', svm)]))
    return wrapped.train(trainData)

# Question 3

In [7]:
from sklearn import metrics

def crossValidate(dataset, folds):
    """Run k-fold cross-validation and return the averaged scores.

    A shuffled copy of `dataset` is cut into exactly `folds` contiguous test
    folds. For each fold a classifier is trained on the remaining samples and
    evaluated on the fold. The caller's list is left in its original order.

    dataset -- list of (feature dict, label) pairs.
    folds   -- number of folds; must be at least 2 and no larger than the
               number of samples.

    Returns [precision, recall, f_score, accuracy], each the mean over the
    folds (precision/recall/f-score use the 'weighted' average).
    Raises ValueError when the dataset cannot be cut into `folds` folds.
    """
    total = len(dataset)
    if folds < 2:
        raise ValueError("crossValidate needs at least 2 folds, got %r" % (folds,))
    if total < folds:
        raise ValueError("cannot split %d samples into %d folds" % (total, folds))

    # Shuffle a copy so the caller's data is not reordered as a side effect.
    samples = list(dataset)
    shuffle(samples)

    # Running totals of each score across the folds
    precision = 0
    recall = 0
    f_score = 0
    accuracy = 0

    # Fold k covers samples[bounds[k-1]:bounds[k]]. Computing the boundaries
    # this way always gives exactly `folds` folds, even when the dataset size
    # is not a multiple of `folds`, so dividing by `folds` below is correct.
    bounds = [(k * total) // folds for k in range(folds + 1)]

    for k_num in range(1, folds + 1):
        start, stop = bounds[k_num - 1], bounds[k_num]
        print(f'K-fold number: {k_num}')

        # Split the data into this fold's testing and training sets
        testing_data = samples[start:stop]
        print(f'Testing data length: {len(testing_data)}')
        training_data = samples[:start] + samples[stop:]
        print(f'Training data length: {len(training_data)}')

        # Train on everything outside the fold, then predict the fold itself
        classifier = trainClassifier(training_data)
        true_labels = [t[1] for t in testing_data]
        test_pred = predictLabels(testing_data, classifier)

        # The precision, recall and f-score of the results (support is not used)
        (p, r, f, s) = metrics.precision_recall_fscore_support(true_labels, test_pred, average='weighted')
        precision += p
        recall += r
        f_score += f

        # The accuracy of the classifier is found and summed
        accuracy += metrics.accuracy_score(true_labels, test_pred)

    # The mean of each score over the folds is returned.
    cv_results = [precision / folds, recall / folds, f_score / folds, accuracy / folds]

    return cv_results


In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER
# Use predict labels
def predictLabels(reviewSamples, classifier):
    """Predict a label for every (feature dict, label) pair in reviewSamples.

    Only the feature dict of each pair is handed to the classifier's
    classify_many method, whose result is returned unchanged.
    """
    featureDicts = (sample[0] for sample in reviewSamples)
    return classifier.classify_many(featureDicts)

def predictLabel(reviewSample, classifier, verified, rating, category):
    """Predict the label of a single raw review.

    reviewSample -- the review text, which is tokenised with preProcess.
    classifier   -- a trained classifier exposing classify().
    verified, rating, category -- the review's metadata, in the same raw
        form that toFeatureVector receives in splitData.

    toFeatureVector needs all four inputs. The earlier two-argument version
    passed only the tokens and therefore always failed with a TypeError.
    Note that toFeatureVector adds any previously unseen feature name to
    featureDict.
    """
    features = toFeatureVector(preProcess(reviewSample), verified, rating, category)
    return classifier.classify(features)


In [9]:
# MAIN

# Module-level lists that loadData and splitData append to
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
trainData = []        # the pre-processed training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the pre-processed test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'


def _datasetSizes():
    """Return the current sizes of rawData, trainData and testData."""
    return (len(rawData), len(trainData), len(testData))


# Report the (empty) starting state, then parse the dataset into rawData
print("Now %d rawData, %d trainData, %d testData" % _datasetSizes())
print("Preparing the dataset...")
loadData(reviewPath)

# Split the raw dataset into training and test data (80/20);
# cross-validation is later run on the 80% training share.
print("Now %d rawData, %d trainData, %d testData" % _datasetSizes())
print("Preparing training and test data...")
splitData(0.8)

# Report the sizes after the split, plus the number of distinct features seen
print("After split, %d rawData, %d trainData, %d testData" % _datasetSizes())
print("Training Samples: ")
print(len(trainData))
print("Features: ")
print(len(featureDict))

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
35670


In [10]:
# Display the feature-name -> index mapping that toFeatureVector has built up.
# It holds one entry per feature, so the output is long.
featureDict

{'when': 1,
 'least': 2,
 'you': 3,
 'think': 4,
 'so': 5,
 ',': 6,
 'this': 7,
 'product': 8,
 'will': 9,
 'save': 10,
 'the': 11,
 'day': 12,
 '.': 13,
 'just': 14,
 'keep': 15,
 'it': 16,
 'around': 17,
 'in': 18,
 'case': 19,
 'need': 20,
 'for': 21,
 'something': 22,
 'VP': 23,
 'RATING': 24,
 'PC': 25,
 'lithium': 26,
 'batteries': 27,
 'are': 28,
 'new': 29,
 'introduced': 30,
 'market': 31,
 'there': 32,
 'average': 33,
 'developing': 34,
 'cost': 35,
 'is': 36,
 'relatively': 37,
 'high': 38,
 'but': 39,
 'stallion': 40,
 'doesn': 41,
 "'": 42,
 't': 43,
 'compromise': 44,
 'on': 45,
 'quality': 46,
 'and': 47,
 'provides': 48,
 'us': 49,
 'with': 50,
 'best': 51,
 'at': 52,
 'a': 53,
 'low': 54,
 '.<': 55,
 'br': 56,
 '/>': 57,
 'many': 58,
 'built': 59,
 'technical': 60,
 'assistants': 61,
 'that': 62,
 'act': 63,
 'like': 64,
 'sensor': 65,
 'their': 66,
 'particular': 67,
 'forté': 68,
 'battery': 69,
 'keeps': 70,
 'my': 71,
 'phone': 72,
 'charged': 73,
 'up': 74,
 'work

In [11]:
# QUESTION 3 - run 10-fold cross-validation on the training set.
# The value displayed below the fold log is the list returned by crossValidate:
# [precision, recall, f_score, accuracy], each averaged over the folds.
crossValidate(trainData, 10)

K-fold number: 1
Testing data length: 1680
Training data length: 15120
Training Classifier...
K-fold number: 2
Testing data length: 1680
Training data length: 15120
Training Classifier...
K-fold number: 3
Testing data length: 1680
Training data length: 15120
Training Classifier...
K-fold number: 4
Testing data length: 1680
Training data length: 15120
Training Classifier...
K-fold number: 5
Testing data length: 1680
Training data length: 15120
Training Classifier...
K-fold number: 6
Testing data length: 1680
Training data length: 15120
Training Classifier...
K-fold number: 7
Testing data length: 1680
Training data length: 15120
Training Classifier...
K-fold number: 8
Testing data length: 1680
Training data length: 15120
Training Classifier...
K-fold number: 9
Testing data length: 1680
Training data length: 15120
Training Classifier...
K-fold number: 10
Testing data length: 1680
Training data length: 15120
Training Classifier...


[0.7901154039425918,
 0.7886904761904762,
 0.7884678175004826,
 0.7886904761904762]

# Evaluate on test set

In [12]:
# Finally, check the accuracy of the classifier by training on all of the training data
# and testing on the held-out test set.
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(testData[0])   # have a look at the first test data instance: a (feature dict, label) pair
    classifier = trainClassifier(trainData)  # train the classifier
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData, classifier)  # classify the test data to get predicted labels
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
    accuracy  = metrics.accuracy_score(testTrue, testPred)
    print("\nDone training!")
    # finalScores is (precision, recall, f-score, support); only the first three are printed
    print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])
    print("Accuracy: %0.6f" % accuracy)


({7: 0.08, 4068: 0.04, 36: 0.04, 216: 0.04, 18655: 0.04, 42: 0.04, 318: 0.04, 52: 0.04, 66: 0.04, 51: 0.04, 13: 0.08, 11: 0.08, 128: 0.04, 666: 0.04, 28: 0.04, 1510: 0.04, 1213: 0.04, 779: 0.04, 1704: 0.04, 1693: 0.04, 152: 0.04, 102: 0.04, 23: 1.1, 24: 5, 1582: 1}, 'fake')
Training Classifier...

Done training!
Precision: 0.802117
Recall: 0.798333
F Score:0.797700
Accuracy: 0.798333


# Effects of adding new features
Adding the new features made a substantial difference to the success of the algorithm. The results of the different combinations of the features are given below. The inclusion of the verified purchase feature makes the most significant difference, increasing the accuracy from 64% to roughly 80%. The best results come from the verified purchase feature on its own (accuracy 0.8095), marginally ahead of verified purchase combined with rating (0.8093). The category feature lowers the algorithm's effectiveness in every combination.

There was an increase of roughly 15% in the recall, precision, F score and accuracy of the results compared with the improved classifier from question 4. These fe


VERIFIED PURCHASE, RATING & CATEGORY:

    Precision: 0.802117
    Recall: 0.798333
    F Score:0.797700
    Accuracy: 0.798333

VERIFIED PURCHASE & RATING:

    Precision: 0.813008
    Recall: 0.809286
    F Score:0.808717
    Accuracy: 0.809286
    
VERIFIED PURCHASE & CATEGORY:

    Precision: 0.806133
    Recall: 0.802619
    F Score:0.802051
    Accuracy: 0.802619
    
RATING & CATEGORY:

    Precision: 0.483024
    Recall: 0.483571
    F Score:0.479377
    Accuracy: 0.483571
    
VERIFIED PURCHASE ONLY:

    Precision: 0.813151
    Recall: 0.809524
    F Score:0.808971
    Accuracy: 0.809524

RATING ONLY:

    Precision: 0.591861
    Recall: 0.591429
    F Score:0.590948
    Accuracy: 0.591429
    
CATEGORY ONLY:

    Precision: 0.522724
    Recall: 0.522381
    F Score:0.520572
    Accuracy: 0.522381


In [13]:
# Look at the first test sample. Each entry of testData is a
# (feature dict, label) pair, so vp[0].values() shows the feature values
# without their indices.
labels = [t[1] for t in testData]   # ground-truth labels of the whole test set
vp = testData[0]
vp[0].values()

dict_values([0.08, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.08, 0.08, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 1.1, 5, 1])

In [14]:
# Ground-truth label of the first test sample (vp = testData[0])
vp[1]

'fake'

In [15]:
# Count how many test samples carry the "fake" label
fake_vps = 0
fake = sum(1 for sample in testData if sample[1] == "fake")
fake

2100