In [1]:
import numpy as np
import re
import collections
import operator
from scipy.optimize import minimize
import copy
import cvxopt
import scipy.io as sio

from scipy.optimize import fmin_bfgs

In [2]:
def readRT2k():
    data = sio.loadmat('data/rt2k/bigram_rt2k.mat')
    d1 = data['allSStr']
    d2 = data['labels']
    
    data = []
    for j in range(len(d1[0])):
        data.append( (' '.join((d1[0][j][i][0][0] if d1[0][j][i][0] else ' ') for i in range(len(d1[0][j]))), d2[0][j]) )
    return data

In [3]:
def readMPQA():
    data_samples = []
    for line in open('data/mpqa/mpqa_t4.dat', 'r'):
        data_samples.append(line.split())
    x1 = [' '.join(line[1:]) for line in data_samples]
    y1 = [int(line[0]) for line in data_samples]
    return zip(x1, y1)

In [4]:
d = readMPQA()

In [5]:
def findDic(text):
    words = re.compile('[a-zA-Z\']*[a-zA-Z]').findall(text)
    return collections.Counter(words)

In [6]:
def readData(directory, paths, limit=None):
    if limit:
        paths = paths[0:limit]

    data = []
    for path in paths:
        txt = open(directory + path).read()
        data.append(txt)
    return data

In [7]:
# data = sio.loadmat('data/mpqa/bigram_mpqa.mat')

In [8]:
# data_samples = []
# for line in open('data/mpqa/mpqa_t4.dat', 'r'):
#     data_samples.append(line.split())

In [9]:
data = readRT2k()

In [10]:
x1, y1 = zip(*data)

In [11]:
# x1 = [' '.join(line[1:]) for line in data_samples]
# y1 = [int(line[0]) for line in data_samples]

In [12]:
from sklearn.utils import shuffle

x, y = shuffle(x1, y1)

testNegData = [x[i] for i in xrange(200) if y[i] == 0]
testPosData = [x[i] for i in xrange(200) if y[i] == 1]

In [13]:
posData = [x[i] for i in xrange(200,len(x)) if y[i] == 1]
negData = [x[i] for i in xrange(200,len(x)) if y[i] == 0]

In [14]:
print len(posData)
print len(negData)
print len(testNegData)
print len(testPosData)
print negData[0]
print "+++++"
print posData[0]

904
896
104
96
  " gordy " is not a movie , it is a 90-minute-long " sesame street " skit , and a very bad one at that . this movie is so stupid and dumb that it's depressing to think that some hollywood executives actually gave this the green light , and even more surprising is the fact that this is a disney movie . i'm sure children are the target audience of this movie , but only kids under the age of five may be able to tolerate it . it is the story of a farm a piglet named gordy ( voiced by garms ) , whose family has been taken away to " up north , " which we know means death . of course we can hear the animals talk to each other , and they actually went to the trouble of attempting to sync the voices with their mouths but it comes out terrible . actually , it's almost funny in a way . the only remotely interesting and likable character soon appears , a little girl named jinnie sue macallister ( young ) who sees gordy on the back of a truck and essentially steals him . jinnie is a

In [15]:
def findWordCount(data):
    wordCount = []
    for review in data:
        wordCount.append(findDic(review))
    return wordCount

negDataDic = findWordCount(negData)
posDataDic = findWordCount(posData)

In [16]:
def updateDic(mainDic, dicArray):
    mainDic = copy.deepcopy(mainDic)
    for dic in dicArray:
        mainDic.update(dic)
    
    # update indexes
    for index, key in enumerate(mainDic):
        mainDic[ key ] = index
        
    return mainDic

allWords = {}
allWords = updateDic(allWords, negDataDic)
allWords = updateDic(allWords, posDataDic)

In [17]:
def findRow(words, text, boolean=False):
    row = np.zeros(len(allWords))
    wordsDic = findDic(text)
    for word in words:
        if word in allWords:
            row[allWords[word]] = 1 if boolean else wordsDic[word]
    return row

In [18]:
def createTable(allWords, negData, y0, posData, y1):
    x = []
    for data in negData + posData:
        x.append( findRow(allWords, data) )
    x = np.row_stack(x)
    
    y = np.append(y0 * np.ones(len(negData)), y1 * np.ones(len(posData)))
    return x, y

In [19]:
x, y = createTable(allWords, negData, -1, posData, 1)

In [20]:
M = len(x)
k = np.zeros( (M, M) )

In [21]:
print len(x)

1800


In [22]:
for i in xrange(M):
    if i%300 ==0:
        print i
    for j in xrange(M):
        k[i][j] = np.dot(x[i].T, x[j])

0
300
600
900
1200
1500


In [23]:
C = 0.004
P = cvxopt.matrix(np.outer(y, y) * k)
q = cvxopt.matrix(np.ones(M) * -1)

G = cvxopt.matrix(np.vstack([
    np.eye(M) * -1,
    np.eye(M)
    ]))

h = cvxopt.matrix(np.hstack([
    np.zeros(M),
    np.ones(M) * C
    ]))

A = cvxopt.matrix(y, (1,M))
b = cvxopt.matrix(0.0)

solution = cvxopt.solvers.qp(P, q, G, h, A, b)

     pcost       dcost       gap    pres   dres
 0: -1.9063e+00 -9.7264e+00  4e+03  7e+01  2e-12
 1: -1.8903e+00 -9.6225e+00  2e+02  3e+00  3e-12
 2: -1.7344e+00 -8.6487e+00  3e+01  4e-01  3e-12
 3: -1.3742e+00 -7.4841e+00  1e+01  1e-01  2e-12
 4: -1.1674e+00 -4.0470e+00  4e+00  2e-02  2e-12
 5: -1.2210e+00 -1.6787e+00  5e-01  2e-03  2e-12
 6: -1.2912e+00 -1.3765e+00  9e-02  3e-04  2e-12
 7: -1.3091e+00 -1.3230e+00  1e-02  3e-05  2e-12
 8: -1.3125e+00 -1.3138e+00  1e-03  2e-06  2e-12
 9: -1.3129e+00 -1.3129e+00  5e-05  5e-08  2e-12
10: -1.3129e+00 -1.3129e+00  1e-06  7e-10  2e-12
Optimal solution found.


In [24]:
a = solution['x']

In [25]:
w = np.zeros(len(x[0]))

In [26]:
for i in xrange(len(a)):
    w += a[i] * y[i] * x[i]

In [27]:
w

array([ 0.01465651,  0.0003248 , -0.00109493, ..., -0.004     ,
       -0.0017324 , -0.00259304])

In [28]:
w.dot( findRow(allWords, negData[1]) )

-0.73695294484761487

In [29]:
import random

negMax=None
for j in range(100):
    nMax = None
    for i in random.sample(negData,len(negData)/10):
        t = w.dot( findRow(allWords, i) )
        if not nMax or t > nMax:
            nMax = t
    if not negMax or nMax < negMax:
        negMax = nMax
print negMax

-0.380514205267


In [31]:
posMin=None
for j in range(100):
    pMin = None
    for i in random.sample(posData,len(posData)/10):
        t = w.dot( findRow(allWords, i) )
        if not pMin or t < pMin:
            pMin = t
    if not posMin or pMin > posMin:
        posMin = pMin
print posMin

0.943172708518


In [32]:
print posMin
print negMax
b = (negMax+posMin)/2

0.943172708518
-0.380514205267


In [33]:
print b

0.281329251625


In [34]:
def predict_with_b(x):
    return w.dot( findRow(allWords, x)) + b

In [35]:
def predict_without_b(x):
    return w.dot( findRow(allWords, x))

In [36]:
correct_p = 0
correct_n = 0
for i in testPosData:
    if predict_with_b(i) > 0:
        correct_p += 1
for i in testNegData:
    if predict_with_b(i) < 0:
        correct_n += 1

print correct_p*1. / len(testPosData)
print correct_n*1. / len(testNegData)
print (correct_n+correct_p)*1. / (len(testPosData) + len(testNegData))

0.96875
0.653846153846
0.805


In [37]:
correct_p = 0
correct_n = 0
for i in testPosData:
    if predict_without_b(i) > 0:
        correct_p += 1
for i in testNegData:
    if predict_without_b(i) < 0:
        correct_n += 1

print correct_p*1. / len(testPosData)
print correct_n*1. / len(testNegData)
print (correct_n+correct_p)*1. / (len(testPosData) + len(testNegData))

0.916666666667
0.778846153846
0.845
