In [1]:
import re, nltk, csv, codecs, pickle
import pandas as pd
m = codecs.open('../data/master.csv', "r",encoding='utf-8', errors='ignore')
master = pd.read_csv(m)

train = master.sample(frac=0.8)
test = master.drop(train.index)

In [2]:
def processLine(line):
    
    #covert to lowercase
    line = line.lower()
    #convert www.* or https?://* to URL
    line = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', line)
    #convert @username to AT_USER
    line = re.sub('@[^\s]+', 'AT_USER', line)
    #remove addional white spaces
    line = re.sub('[\s]+', ' ', line)
    #replace #word with word
    line = re.sub(r'#([^\s]+)', r'\1', line)
    #trim
    line = line.strip('\'"')
    return line

In [3]:
trainTuples = [tuple(x) for x in master.values]
#trainTuples = [tuple(x) for x in train.values]
#testTuples = [tuple(x) for x in train.values]

In [4]:
#initialize stopWords
def getStopWordList(stopWordListFileName):
    stopwords = []
    stopWords.append('AT_USER')
    stopWords.append('URL')
    
    fp = open(stopWordListFileName, 'r')
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords

stopWords = []
st = open('../data/stopwords.txt', 'r')
stopWords = getStopWordList('../data/stopwords.txt')

#replace repetitions
def replaceTwoOrMore(s):
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)

def getTrainFeatureVector(line):
    featureVector = []
    words = line.split()
    negFound = 0
    for w in words:
        #replace two or more with two occurences
        w = replaceTwoOrMore(w)
        #strip punctuation
        w = w.strip('\'"?,.')
        #check if the word starts with a letter or number
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        #ignore if it is a stop word
        if(w in stopWords):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector

def getFeatureVector(line):
    featureVector = []
    words = line.split()
    negFound = 0
    for w in words:
        #deal with negation
        if negFound == 1:
            if w == re.search('\?,.;', w):
                negFound == 0
            else:
                w = 'NOT_' + w
        if w == "not" or re.search(r"n't", w):
            negFound = 1
            
        #replace two or more with two occurences
        w = replaceTwoOrMore(w)
        #strip punctuation
        w = w.strip('\'"?,.')
        #check if the word starts with a letter or number
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        #ignore if it is a stop word
        if(w in stopWords):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector

In [5]:
#feature extraction
#read the lines one by one and process
featureList = []

lines = []
for t in trainTuples:
    line = t[0]
    sentiment = t[1]
    processedLine = processLine(line)
    print('processed line: ', processedLine)
    featureVector = getFeatureVector(processedLine)
    print('feature vector: ', featureVector)
    featureList.extend(featureVector)
    lines.append((featureVector, sentiment))

print(lines)

processed line:  good case, excellent value.
feature vector:  ['good', 'excellent', 'value']
processed line:  great for the jawbone.
feature vector:  ['great', 'jawbone']
processed line:  tied to charger for conversations lasting more than 45 minutes.major problems!!
feature vector:  ['tied', 'charger', 'conversations', 'lasting', 'more', '45', 'minutes.major', 'problems!!']
processed line:  the mic is great.
feature vector:  ['mic', 'great']
processed line:  i have to jiggle the plug to get it to line up right to get decent volume.
feature vector:  ['i', 'jiggle', 'plug', 'get', 'line', 'up', 'right', 'get', 'decent', 'volume']
processed line:  if you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one.
feature vector:  ['you', 'several', 'dozen', 'several', 'hundred', 'contacts', 'imagine', 'fun', 'sending', 'each', 'one', 'one']
processed line:  if you are razr owner...you must have this!
feature vector:  ['you', 'razr', 'owner..yo

In [6]:
def extract_features(line):
    line_words = set(line)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in line_words)
    return features

In [7]:
#remove featureList duplicates
featureList = list(set(featureList))

In [8]:
#extract feature vector for all tweets at once
training_set = nltk.classify.util.apply_features(extract_features, lines)

In [9]:
#train the classifier
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

In [10]:
#test the classifier
score = 0
for testLine in testTuples:
    processedTestLine = processLine(testLine[0])
    result = NBClassifier.classify(extract_features(getTrainFeatureVector(processedTestLine)))
    if(result == testLine[1]):
        score += 1
    else:
        print('Line: ', testLine)
        print('Classifier: ', result)
        print('Acutally: ', testLine[1])

NameError: name 'testTuples' is not defined

In [11]:
accuracy = score/len(testTuples)
print(accuracy)

0.9351184346035015


In [11]:
savedClassifier = open('savedNBClassifier.pkl', 'wb')
pickle.dump(NBClassifier, savedClassifier)
savedClassifier.close()

In [12]:
savedClassifier = open('savedNBClassifier.pkl', 'rb')
classer = pickle.load(savedClassifier)
savedClassifier.close()

In [20]:
tLine = "he was not sick"
processedTestLine = processLine(tLine)
result = classer.classify(extract_features(getFeatureVector(processedTestLine)))
print('Classifier: ', result)

Classifier:  0.0
