In [1]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support
import re

In [2]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    """Read a tab-separated review file and append its rows to ``rawData``.

    Every row except the header (first field equal to ``"DOC_ID"``) is handed
    to ``parseReview`` and the resulting ``(id, text, label)`` triple is
    appended to the module-level ``rawData`` list. Blank rows are skipped.
    Nothing is returned.

    Args:
        path: Location of the tab-delimited file to read.
        Text: Unused; kept only so existing calls keep working.
    """
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are interpreted by the reader rather than the file object.
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if not line:  # blank row: line[0] would raise IndexError
                continue
            if line[0] == "DOC_ID":  # skip the header
                continue
            # Unpack into fresh names so the unused ``Text`` parameter is not
            # shadowed; the unpacking also enforces a three-element result.
            (doc_id, review_text, label) = parseReview(line)
            rawData.append((doc_id, review_text, label))

def splitData(percentage):
    """Fill ``trainData`` and ``testData`` from ``rawData``.

    ``rawData`` is treated as two halves. The leading ``percentage`` share of
    each half is pre-processed, vectorised and appended to ``trainData``; the
    remainder of each half is appended to ``testData`` in the same way.
    All training samples are vectorised before any test sample.
    """
    total = len(rawData)
    midpoint = int(total / 2)
    per_half = int((percentage * total) / 2)

    train_part = rawData[:per_half] + rawData[midpoint:midpoint + per_half]
    test_part = rawData[per_half:midpoint] + rawData[midpoint + per_half:]

    # Same treatment for both partitions; only the destination list differs.
    for destination, samples in ((trainData, train_part), (testData, test_part)):
        for _, review, tag in samples:
            destination.append((toFeatureVector(preProcess(review)), tag))

# Question 1

In [3]:
# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    """Turn one parsed row of the review file into an (id, text, label) triple.

    The document id is read from the first field, the raw label from the
    second field and the review text from the last field.

    Args:
        reviewLine: Sequence of string fields for a single review.

    Returns:
        A tuple ``(doc_id, review_text, label)`` where ``doc_id`` is an int,
        ``review_text`` is the review string, and ``label`` is ``"fake"`` when
        the raw label is ``"__label1__"`` and ``"real"`` for anything else.

    Raises:
        ValueError: If the first field cannot be converted to an integer.
        IndexError: If the row has fewer than two fields.
    """
    # Fields arrive as text; the documented contract is an integer id.
    doc_id = int(reviewLine[0])
    review_text = reviewLine[-1]
    label = "fake" if reviewLine[1] == "__label1__" else "real"

    return (doc_id, review_text, label)


In [4]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION

# Input: a string of one review
def preProcess(text):
    """Tokenise one review into lowercase word tokens.

    The text is cut at every single non-word character, the empty strings
    left behind by consecutive separators are discarded, and the surviving
    pieces are lowercased. The tokens are returned as a list in their
    original order.
    """
    pieces = re.split(r'[\W]', text)
    return [piece.lower() for piece in pieces if piece != '']


# Question 2

In [5]:
featureDict = {} # Global registry: token -> feature index (indices start at 1)

def toFeatureVector(tokens):
    """Count the tokens of one review, keyed by global feature index.

    Each token is looked up in the module-level ``featureDict``. A token seen
    for the first time is registered there under the next free index, i.e.
    the current size of ``featureDict`` plus one. The returned dictionary maps
    every index to the number of times its token occurs in ``tokens``.
    """
    counts = {}
    for word in tokens:
        # setdefault returns the stored index, registering the word first when
        # it is new; the default value is computed before the insertion.
        index = featureDict.setdefault(word, len(featureDict) + 1)
        counts[index] = counts.get(index, 0) + 1
    return counts


In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    """Train a linear SVM on ``trainData`` and return the trained classifier.

    A ``LinearSVC`` (dual formulation, up to 20000 iterations) is placed in a
    one-step ``Pipeline`` named ``'svc'``, wrapped in an NLTK
    ``SklearnClassifier`` and trained on ``trainData``. Elsewhere in this
    notebook ``trainData`` is a list of ``(feature dict, label)`` pairs.
    """
    print("Training Classifier...")
    svm = LinearSVC(dual=True, max_iter=20000)
    steps = [('svc', svm)]
    wrapper = SklearnClassifier(Pipeline(steps))
    return wrapper.train(trainData)

# Question 3

In [7]:
from sklearn import metrics

def crossValidate(dataset, folds):
    """Estimate classifier quality with k-fold cross-validation.

    ``dataset`` is shuffled, then cut into ``folds`` contiguous slices. Each
    slice serves once as the test set while a fresh classifier is trained on
    everything else. Slice boundaries are spread over the whole dataset, so
    every sample is tested exactly once and exactly ``folds`` rounds are run
    even when ``len(dataset)`` is not a multiple of ``folds``.

    Args:
        dataset: List of ``(features, label)`` pairs. NOTE: the list is
            shuffled in place, so the caller's ordering changes.
        folds: Number of folds; must be at least 2 and at most
            ``len(dataset)``.

    Returns:
        ``[precision, recall, f_score, accuracy]``, each the mean over all
        folds. Precision, recall and f-score are requested from scikit-learn
        with ``average='weighted'``.

    Raises:
        ValueError: If ``folds`` is smaller than 2 or larger than the dataset.
    """
    fold_count = int(folds)
    sample_count = len(dataset)
    if fold_count < 2 or fold_count > sample_count:
        raise ValueError(
            f'folds must be between 2 and the dataset size ({sample_count}), got {folds}')

    # The data is first shuffled (in place)
    shuffle(dataset)

    # Running totals of each score over the folds
    precision = 0
    recall = 0
    f_score = 0
    accuracy = 0

    for fold_index in range(fold_count):
        # Integer boundaries that tile the whole dataset: the last fold ends at
        # sample_count, so no leftover samples form an extra, uncounted fold.
        start = (fold_index * sample_count) // fold_count
        stop = ((fold_index + 1) * sample_count) // fold_count

        # Split the data into testing and training sets
        testing_data = dataset[start:stop]
        print(f'Testing data length: {len(testing_data)}')
        training_data = dataset[:start] + dataset[stop:]
        print(f'Training data length: {len(training_data)}')

        print(f'K-fold number: {fold_index + 1}')

        # Train on this round's training set and collect the true labels of
        # the held-out slice
        classifier = trainClassifier(training_data)
        true_labels = [sample[1] for sample in testing_data]

        # Predict on the unseen test data using the classifier
        test_pred = predictLabels(testing_data, classifier)

        # The precision, recall and f-score of the results (support is unused)
        (p, r, f, _) = metrics.precision_recall_fscore_support(
            true_labels, test_pred, average='weighted')
        precision += p
        recall += r
        f_score += f

        # The accuracy of the classifier is found and summed
        accuracy += metrics.accuracy_score(true_labels, test_pred)

    # The loop ran exactly fold_count rounds, so this is a true mean.
    precision /= fold_count
    recall /= fold_count
    f_score /= fold_count
    accuracy /= fold_count
    cv_results = [precision, recall, f_score, accuracy]

    return cv_results


In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER
# Use predict labels
def predictLabels(reviewSamples, classifier):
    """Classify a batch of samples and return the classifier's predictions.

    Only the first element of each sample is passed on to
    ``classifier.classify_many``; anything else in the sample is ignored.
    The elements are supplied lazily, one at a time.
    """
    feature_sets = (sample[0] for sample in reviewSamples)
    return classifier.classify_many(feature_sets)

def predictLabel(reviewSample, classifier):
    """Classify one raw review string and return the classifier's prediction.

    The review is tokenised with ``preProcess``, turned into a feature
    dictionary with ``toFeatureVector`` and passed to ``classifier.classify``.
    """
    tokens = preProcess(reviewSample)
    features = toFeatureVector(tokens)
    return classifier.classify(features)


In [9]:
# MAIN

# Load the reviews and build the train/test sets.
# The three lists below are module-level globals: loadData appends to rawData,
# splitData appends to trainData and testData. Re-running this cell resets them.
rawData = []          # (id, text, label) triples read from the dataset file (21000 samples for this file)
trainData = []        # pre-processed (feature dict, label) pairs used for training (80% here, i.e. 16800 samples)
testData = []         # pre-processed (feature dict, label) pairs held out for testing (20% here, i.e. 4200 samples)

# the output classes
# (parseReview writes the same two strings as literals; these names are not
# referenced by the other cells shown in this notebook)
fakeLabel = 'fake'
realLabel = 'real'

# path to the data file, relative to the notebook's working directory
reviewPath = 'amazon_reviews.txt'

# Do the actual stuff (i.e. call the functions we've made)
# Print the current sizes of the three lists (all empty at this point), then
# parse the dataset file into rawData
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# Cross-validation is later run on the 80% (training data) only
# Print the sizes of the three lists before the split (only rawData is filled)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# Print the sizes after the split, plus the number of training samples and the
# number of features. featureDict is filled while splitData vectorises both the
# training and the test reviews, so the feature count covers tokens from both.
print("After split, %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
34913


In [10]:
# QUESTION 3 - run 10-fold cross-validation on the training set only
# (trainData); the held-out testData is not used here.
# crossValidate returns [precision, recall, f-score, accuracy] averaged over
# the folds; as the last expression of the cell, that list is displayed.
# Note: crossValidate shuffles trainData in place.
crossValidate(trainData, 10)

Testing data length: 1680
Training data length: 15120
K-fold number: 1
Training Classifier...
Testing data length: 1680
Training data length: 15120
K-fold number: 2
Training Classifier...
Testing data length: 1680
Training data length: 15120
K-fold number: 3
Training Classifier...
Testing data length: 1680
Training data length: 15120
K-fold number: 4
Training Classifier...
Testing data length: 1680
Training data length: 15120
K-fold number: 5
Training Classifier...
Testing data length: 1680
Training data length: 15120
K-fold number: 6
Training Classifier...
Testing data length: 1680
Training data length: 15120
K-fold number: 7
Training Classifier...
Testing data length: 1680
Training data length: 15120
K-fold number: 8
Training Classifier...
Testing data length: 1680
Training data length: 15120
K-fold number: 9
Training Classifier...
Testing data length: 1680
Training data length: 15120
K-fold number: 10
Training Classifier...


[0.6132936793808013,
 0.6127976190476191,
 0.6127133664427085,
 0.6127976190476191]

# Evaluate on test set

In [11]:
# Finally, check the accuracy of your classifier by training on all the training data
# and testing on the held-out test set
# Will only work once all functions are complete
# Relies on `metrics`, which is imported in the cross-validation cell (In [7]),
# and on trainData/testData built by the MAIN cell.
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(testData[0])   # have a look at the first test data instance: a (feature dict, label) pair
    classifier = trainClassifier(trainData)  # train the classifier on the full training set
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData, classifier)  # classify the test data to get predicted labels
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate: (precision, recall, f-score, support)
    accuracy  = metrics.accuracy_score(testTrue, testPred)
    print("\nDone training!")
    # Only the first three entries are printed; the fourth (support) is dropped.
    print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])
    print("Accuracy: %0.6f" % accuracy)

({6: 2, 3952: 1, 31: 1, 201: 1, 18276: 1, 300: 1, 46: 1, 58: 1, 45: 1, 10: 2, 118: 1, 628: 1, 23: 1, 1447: 1, 1157: 1, 739: 1, 1634: 1, 1623: 1, 141: 1, 93: 1}, 'fake')
Training Classifier...

Done training!
Precision: 0.592891
Recall: 0.592857
F Score:0.592820
Accuracy: 0.592857


# Questions 4 and 5
Once you're happy with your functions for Questions 1 to 3, it is advisable to copy this notebook into a new one and, within that copy, adapt and improve all three functions in the ways asked for in Questions 4 and 5.