In [1]:
import math
import random
import re

import numpy as np
from nltk import ngrams

Bag of words implementation

In [2]:
def createBagsOfWords(posSentences, negSentences):
    """Count word occurrences separately for the positive and negative corpora.

    posSentences, negSentences : iterables of tweets, each tweet being an
    iterable of hashable tokens (words or n-grams).

    Returns a (pos, neg) tuple of dicts mapping every token to the number of
    times it occurs in the corresponding corpus.
    """
    def tally(sentences):
        # Plain occurrence count over every token of every sentence.
        counts = {}
        for tokens in sentences:
            for token in tokens:
                if token in counts:
                    counts[token] += 1
                else:
                    counts[token] = 1
        return counts

    positiveCounts = tally(posSentences)
    negativeCounts = tally(negSentences)
    return (positiveCounts, negativeCounts)
                
def likelihoodFromBags(posBag, negBag):
    """Turn two token-count dicts into per-token class scores.

    posBag, negBag : dicts mapping a token to its occurrence count in the
    positive / negative corpus.

    For every token present in either bag, both counts are incremented by one
    (Laplace smoothing) and divided by their sum, so for each token the
    positive and negative values add up to 1. Note that this is the share of
    the token's occurrences per class, not P(token | class).

    Returns a (posLikelihood, negLikelihood) tuple of dicts keyed by token.
    """
    posLikelihood = {}
    negLikelihood = {}

    # Every token seen in at least one of the two corpora.
    vocabulary = set(posBag.keys()) | set(negBag.keys())

    for token in vocabulary:
        smoothedPos = posBag.get(token, 0) + 1  # +1: Laplace smoothing
        smoothedNeg = negBag.get(token, 0) + 1
        denominator = smoothedPos + smoothedNeg
        posLikelihood[token] = smoothedPos / denominator
        negLikelihood[token] = smoothedNeg / denominator

    return (posLikelihood, negLikelihood)

def likelihoodFromSentences(posSentences, negSentences):
    """Build the per-token class scores directly from tokenised sentences.

    Counts the tokens of both corpora with createBagsOfWords and feeds the
    resulting bags to likelihoodFromBags.

    Returns the (posLikelihood, negLikelihood) tuple of likelihoodFromBags.
    """
    bags = createBagsOfWords(posSentences, negSentences)
    return likelihoodFromBags(*bags)

def estimateForNGram(ngram, posLiks, negLiks):
    """Classify one sample as positive (1) or negative (-1).

    ngram : iterable of tokens making up the sample (despite the name, callers
        pass the whole list of n-grams of one tweet).
    posLiks, negLiks : dicts mapping a token to its positive / negative
        likelihood.

    The class scores are accumulated as sums of log-likelihoods rather than as
    a product of raw likelihoods: the product underflows to 0.0 for long
    samples or very small likelihoods, and once both products are 0.0 the
    comparison degenerates to "always positive".

    Returns 1 when the positive score is greater than or equal to the negative
    score (this includes empty samples and samples made only of unknown
    tokens), -1 otherwise.
    """
    def logLikelihood(p):
        # log(0) is undefined: a zero (or invalid, non-positive) likelihood
        # rules the class out, exactly like a zero factor in a product.
        return math.log(p) if p > 0 else float("-inf")

    posScore = 0.0
    negScore = 0.0
    for word in ngram:
        # A missing token defaults to likelihood 1.0 (log = 0), i.e. it is ignored.
        posScore += logLikelihood(posLiks.get(word, 1.0))
        negScore += logLikelihood(negLiks.get(word, 1.0))

    # Ties (including "nothing known") are resolved as positive.
    if posScore >= negScore:
        return 1
    return -1
    
def predictFromScratch(train_pos, train_neg, test):
    """Train on the given corpora and classify every sample of `test`.

    train_pos, train_neg : tokenised positive / negative training sentences.
    test : iterable of tokenised samples to classify.

    Returns a list with one prediction (1 or -1) per test sample, in order.
    """
    posLiks, negLiks = likelihoodFromSentences(train_pos, train_neg)
    predictions = []
    for sample in test:
        predictions.append(estimateForNGram(sample, posLiks, negLiks))
    return predictions

File interaction and data preparation

In [3]:
# Size of the word n-grams used as features (2 = bigrams). Read by
# fileOfTweetsToNgramSentences and by the cell that builds the submission file.
n_grams_length = 2

In [4]:
def string2ngrams(text, n):
    """Split `text` on whitespace and return its word n-grams as strings.

    text : the string to tokenise.
    n : number of words per n-gram.

    Each n-gram produced by nltk's `ngrams` is joined with a single space.
    Returns a list of strings.
    """
    tokens = text.split()
    return list(map(' '.join, ngrams(tokens, n)))

def readFileOfTweets(path, isTestData=False):
    """Read a text file and return its lines, one tweet per line.

    path : path of the file to read.
    isTestData : unused; kept so existing callers keep working.

    Returns a list of strings without their trailing newline.
    """
    with open(path, "r") as f:
        # Iterating the file splits on newlines only. str.splitlines() would
        # additionally split on \v, \f, \x1c-\x1e, \x85, \u2028 and \u2029,
        # cutting a tweet that contains one of them into several samples.
        return [line.rstrip("\n") for line in f]

def fileOfTweetsToNgramSentences(path, isTestData=False):
    """Load a file of tweets and convert each line to its list of n-grams.

    path : path of the file to read.
    isTestData : forwarded unchanged to readFileOfTweets.

    The n-gram size is taken from the module-level `n_grams_length`.
    Returns a list with one list of n-gram strings per line of the file.
    """
    tweets = readFileOfTweets(path, isTestData)
    sentences = []
    for tweet in tweets:
        sentences.append(string2ngrams(tweet, n_grams_length))
    return sentences

In [5]:
def crossValidate(posNgrams, negNgrams, fold):
    """Run a k-fold cross-validation of the classifier and print the results.

    posNgrams, negNgrams : lists of tokenised positive / negative samples.
    fold : number of folds (positive integer).

    Each class is cut into `fold` contiguous test slices of equal size
    (leftover samples at the end of a class are only ever used for training).
    For every fold the model is trained on the remaining samples, and the
    number of correctly classified positive (tp) and negative (tn) test
    samples is printed. The mean accuracy over all folds is printed last.

    Raises ValueError when `fold` is smaller than 1 or larger than the number
    of samples of either class.
    """
    if fold < 1:
        raise ValueError("fold must be a positive integer")

    # Fold sizes are computed per class so that classes of different sizes are
    # still sliced consistently and the accuracy denominator stays correct.
    posFoldLen = len(posNgrams) // fold
    negFoldLen = len(negNgrams) // fold
    if posFoldLen == 0 or negFoldLen == 0:
        raise ValueError("not enough samples for the requested number of folds")

    accuracy = 0.0
    for i in range(fold):
        posStart, posEnd = i * posFoldLen, (i + 1) * posFoldLen
        negStart, negEnd = i * negFoldLen, (i + 1) * negFoldLen

        posNgramTest = posNgrams[posStart:posEnd]
        posNgramTrain = posNgrams[:posStart] + posNgrams[posEnd:]
        negNgramTest = negNgrams[negStart:negEnd]
        negNgramTrain = negNgrams[:negStart] + negNgrams[negEnd:]

        (posLiks, negLiks) = likelihoodFromSentences(posNgramTrain, negNgramTrain)

        # True positives / true negatives on the held-out slices.
        tp = sum(1 for x in posNgramTest if estimateForNGram(x, posLiks, negLiks) == 1)
        tn = sum(1 for x in negNgramTest if estimateForNGram(x, posLiks, negLiks) == -1)

        print(tp, tn)

        # Divide by the real number of test samples of this fold.
        accuracy += ((tp + tn) / (posFoldLen + negFoldLen))

    print("accuracy : ", accuracy / fold)

In [6]:
# Load the positive and negative training tweets as lists of n-gram sentences.
posNgrams = fileOfTweetsToNgramSentences("./data/train_pos_full.txt")
negNgrams = fileOfTweetsToNgramSentences("./data/train_neg_full.txt")

# 5-fold cross-validation: prints the per-fold (tp, tn) counts and the mean accuracy.
crossValidate(posNgrams, negNgrams, 5)

221863 191482
222696 191073
221098 190549
222130 191173
221092 191481
accuracy :  0.8258547999999999


In [7]:
def readTestFile():
    """Read data/test_data.txt and return the text part of every line.

    Each line is expected to look like "<id>,<tweet>": everything up to and
    including the first comma is dropped. The trailing newline of each line is
    kept. A line without a comma raises IndexError.

    Returns a list of strings, one per line of the file.
    """
    tweets = []
    with open("data/test_data.txt", "r") as f:
        for line in f:
            # Split once only, so commas inside the tweet itself are preserved.
            tweets.append(line.split(",", 1)[1])
    return tweets

def publishResults(test_pred, file_name):
    """Write the predictions as a two-column CSV file under data/.

    test_pred : sequence of predictions; the i-th prediction gets id i
        (ids start at 1).
    file_name : name of the file to create inside the data/ directory.

    The file starts with the header "Id,Prediction" followed by one
    "<id>,<prediction>" line per entry.
    """
    with open('data/' + file_name, "w") as out:
        out.write("Id,Prediction\n")
        ids = range(1, len(test_pred) + 1)
        for rowId, label in zip(ids, test_pred):
            print(rowId, label, sep=",", file=out)

In [8]:
# Train on the complete training set loaded earlier (posNgrams / negNgrams).
(posLiks, negLiks) = likelihoodFromSentences(posNgrams, negNgrams)
# Convert every test tweet to n-grams of the same size as the training data.
test_x = [string2ngrams(x, n_grams_length) for x in readTestFile()]
# Classify each test tweet and write the predictions to data/<n>-grams.csv.
publishResults([estimateForNGram(x, posLiks, negLiks) for x in test_x], str(n_grams_length) + "-grams.csv")