In [19]:
import re, nltk, csv, codecs, pickle, tweetProcessor
import pandas as pd
m = codecs.open('data/master.csv', "r",encoding='utf-8', errors='ignore')
master = pd.read_csv(m)

train = master.sample(frac=0.8)
test = master.drop(train.index)

In [2]:
def processLine(line):
    
    #covert to lowercase
    line = line.lower()
    #convert www.* or https?://* to URL
    line = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', line)
    #convert @username to AT_USER
    line = re.sub('@[^\s]+', 'AT_USER', line)
    #remove addional white spaces
    line = re.sub('[\s]+', ' ', line)
    #replace #word with word
    line = re.sub(r'#([^\s]+)', r'\1', line)
    #trim
    line = line.strip('\'"')
    return line

In [3]:
trainTuples = [tuple(x) for x in train.values]
testTuples = [tuple(x) for x in train.values]

In [4]:
#initialize stopWords
stopWords = []

#replace repetitions
def replaceTwoOrMore(s):
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)

def getStopWordList(stopWordListFileName):
    stopwords = []
    stopWords.append('AT_USER')
    stopWords.append('URL')
    
    fp = open(stopWordListFileName, 'r')
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords

def getFeatureVector(line):
    featureVector = []
    words = line.split()
    for w in words:
        #replace two or more with two occorences
        w = replaceTwoOrMore(w)
        #strip punctuation
        w = w.strip('\'"?,.')
        #check if the word starts with a letter or number
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        #deal with negation
        #ignore if it is a stop word
        if(w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector

st = open('data/stopwords.txt', 'r')
stopWords = getStopWordList('data/stopwords.txt')

In [5]:
#feature extraction
#read the lines one by one and process
featureList = []

lines = []
for t in trainTuples:
    line = t[0]
    sentiment = t[1]
    processedLine = processLine(line)
    featureVector = getFeatureVector(processedLine)
    featureList.extend(featureVector)
    lines.append((featureVector, sentiment))

print(lines)



In [6]:
def extract_features(line):
    line_words = set(line)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in line_words)
    return features

In [7]:
#remove featureList duplicates
featureList = list(set(featureList))

In [8]:
#extract feature vector for all tweets at once
training_set = nltk.classify.util.apply_features(extract_features, lines)

In [10]:
#train the classifier
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

In [11]:
#test the classifier
score = 0
for testLine in testTuples:
    processedTestLine = processLine(testLine[0])
    result = NBClassifier.classify(extract_features(getFeatureVector(processedTestLine)))
    if(result == testLine[1]):
        score += 1
    else:
        print('Line: ', testLine)
        print('Classifier: ', result)
        print('Acutally: ', testLine[1])

Line:  ('predecessors the mummy and the mummy returns stand as intellectual masterpieces next to the scorpion king . ', 'neg')
Classifier:  pos
Acutally:  neg
Line:  ("by the end of the movie , you're definitely convinced that these women are spectacular . ", 'pos')
Classifier:  neg
Acutally:  pos
Line:  ('the plot is so amusingly contrived and outlandish in its coincidences that no one could ever mistake it for anything resembling reality', 'pos')
Classifier:  neg
Acutally:  pos
Line:  ("[scherfig] has made a movie that will leave you wondering about the characters' lives after the clever credits roll . ", 'pos')
Classifier:  neg
Acutally:  pos
Line:  ("a rude black comedy about the catalytic effect a holy fool has upon those around him in the cutthroat world of children's television . ", 'pos')
Classifier:  neg
Acutally:  pos
Line:  ("only about as sexy and dangerous as an actress in a role that reminds at every turn of elizabeth berkley's flopping dolphin-gasm . ", 'neg')
Classifier

In [12]:
accuracy = score/len(testTuples)
print(accuracy)

0.9369284876905041


In [42]:
tLine = 'i am boring'
processedTestLine = tweetProcessor.formatTweet(tLine)
result = NBClassifier.classify(tweetProcessor.extract_features(tweetProcessor.getFeatureVector(processedTestLine)))
print('Classifier: ', result)

formatting tweet
Classifier:  neg
