In [5]:
import nltk
import numpy as np  
import pandas as pd  
#import statsmodels.api as sm  
from sklearn.preprocessing import MinMaxScaler  
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score 
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from gensim.models import Word2Vec

In [6]:
from nltk.corpus import brown

nltk.download('popular')
nltk.download('brown')
nltk.download('universal_tagset')

tset = 'universal'

brown_tagged = brown.tagged_words(tagset=tset)

unique_tags = np.unique([tag for (word,tag) in brown_tagged])

brown_tagged_sents = brown.tagged_sents(tagset=tset)

train_ratio = 0.8

w2v_model = Word2Vec(brown.sents(), size=16, window=5, min_count=0, workers=4)

sents = len(brown_tagged_sents)

division = 3

train_count = int(sents*train_ratio)

test_count = int(sents*(1-train_ratio))

testing_sents = brown_tagged_sents[-division*test_count:-(division-1)*test_count-1]

training_sents = list(brown_tagged_sents[0:-division*test_count])
training_sents.extend(brown_tagged_sents[-(division-1)*test_count-1:])

len(training_sents), len(testing_sents)

def calc_cost(W,X,Y,C):

    n = X.shape[0]
    d_ind = 1 - Y * (np.dot(X, W))
    d_max = d_ind
    d_max[d_max < 0] = 0
    sample_cost = C * (np.sum(d_max) / n)
    reg_cost = 1 / 2 * np.dot(W, W) 
    total_cost = reg_cost + sample_cost
    return total_cost

def calc_grad(W,X,Y,C):
    
    if type(X) == np.float64:
        Y = np.array([Y])
        X = np.array([X])
    
    d_ind = 1 - (Y * np.dot(X, W))
    dw = np.zeros(len(W))

    for i, d in enumerate(d_ind):
        if max(0, d) == 0:
            di = W
        else:
            di = W - (C * Y[i] * X[i])
        dw += di

    dw = dw/len(Y)

    return dw

class PoS_SVM:
    
    def __init__(self):
        pass    
    
    def fit(self,X,Y,C,u):
        
        max_iter = 5000
        w = np.zeros(X.shape[1])
        n = 0
        prev_cost = np.inf
        stop_cost = 0.05
        
        for i in range(max_iter):
            #for f,x in enumerate(X):
            #print(i)
            b_X, b_Y = shuffle(X,Y)
            ascent = calc_grad(w,b_X[-200:],b_Y[-200:],C)
            w = w - (u * ascent)
            
            if(i%100==0):
                cost = calc_cost(w,X,Y,C)
                if abs(prev_cost - cost) < stop_cost * prev_cost:
                    print(cost)
                    return w
                
                print(prev_cost,cost,w)
                prev_cost = cost
                
        
        print('Warning : did not converge')
        print(cost)
        return w
    
    
    def predict(self,W,X):
        Y_hat = np.array([])
        for i in range(X.shape[0]):
            yp = np.sign(np.dot(X[i], W))
            if(yp==0):
                yp = 1
            Y_hat = np.append(Y_hat, yp)
        return Y_hat
    
    def test(self,test_set):
        pass
    
    def predict_confidence(self,W,x):
        return (np.dot(W,np.array(x)))
        
        
        

model = PoS_SVM()

X = np.array([[-2,-3,1],[0, 0, 1], [1, 1, 1], [2,2, 1],[2, 3, 1],[3,4,1]])
Y = np.array([-1,-1, 1, 1, 1, 1])

params = model.fit(X,Y,50,0.01)
print(params)

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/manojbhadu/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/manojbhadu/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/manojbhadu/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/manojbhadu/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/manojbhadu/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/manojbhadu/nltk_data...
[nltk_data]    |   Package 

inf 10.67013888888889 [0.83333333 1.08333333 0.16666667]
10.67013888888889 1.5650080220272475 [ 0.96909077  1.06059886 -0.99550075]
1.5650080220272475 1.8281201851455702 [ 1.04801466  1.08150958 -0.97357651]
1.8281201851455702 1.6104574319890486 [ 1.03033824  1.04259846 -1.0355223 ]
1.6104574319890486 2.1591811096952442 [ 1.08060792  1.08509556 -0.93394186]
2.1591811096952442 1.5701275516702635 [ 1.02360379  1.02524641 -0.99718053]
1.5701275516702635 1.8252865348227023 [ 1.06902304  1.06962429 -0.97524632]
1.8252865348227023 1.6164515587463106 [ 1.03842014  1.03864022 -1.03721417]
1.6164515587463106 2.1500965165021797 [ 1.08356615  1.08364671 -0.93539004]
2.1500965165021797 1.5612735023386777 [ 1.0252554   1.02528488 -0.99862359]
1.5612735023386777 1.8169830414040369 [ 1.07060094  1.07061173 -0.97674798]
1.8169830414040369 1.625867532669227 [ 1.04226709  1.04227104 -1.03879038]
1.625867532669227 1.5406230967567152 [ 1.01008149  1.01008294 -1.02015392]
1.5776663931926684
[ 1.03806958  1

In [7]:
feature_set = pd.DataFrame()
l = []
for sent in training_sents:
    for word in sent:
        row = {}
        tag = word[1]
        word = word[0]
        row['word'] = word
        row['length'] = len(word)
        row['tag'] = tag
        for i in range(len(w2v_model.wv[word])):
            row[i] = w2v_model.wv[word][i]
        l.append(row)

In [11]:
feature_set = pd.DataFrame(l)
feature_set.head()

Unnamed: 0,word,length,tag,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,The,3,DET,-0.442013,-0.649926,0.848315,-1.424089,0.972041,-1.710908,-1.21874,5.574301,4.336178,0.259915,1.658852,1.882584,-2.843269,-1.145809,2.50559,3.191224
1,Fulton,6,NOUN,-0.368321,0.119321,0.009659,-0.025299,-0.094937,-0.11501,0.112363,0.245173,0.150018,0.083028,-0.10181,-0.142125,0.004395,-0.011211,0.300067,-0.078931
2,County,6,NOUN,-1.459616,1.336429,-0.410735,-0.035082,-0.858181,-0.695584,0.580988,1.7215,2.105061,1.091764,0.048439,0.091207,0.097051,0.262729,1.172697,-0.244341
3,Grand,5,ADJ,-0.685943,0.383769,-0.25929,-0.03107,-0.454937,-0.379391,0.102808,0.67272,0.764648,0.481699,0.044369,-0.076396,-0.254061,0.107715,0.590408,-0.236435
4,Jury,4,NOUN,-0.128603,0.04885,-0.076127,-0.009746,-0.023226,-0.080903,0.029005,0.127134,0.136093,0.079503,0.003444,-0.022383,0.011234,0.025005,0.133399,-0.027737


In [12]:
w_set = {}
for tag in unique_tags:
    print(tag)
    X = feature_set[[x for x in list(feature_set.columns) if ((x!='word')&(x!='tag'))]]
    X['intercept'] = 1
    Y = np.where(feature_set['tag']==tag,1,-1)
    params = model.fit(np.array(X),Y,5,0.01)
    w_set[tag] = params
    print(w_set)

.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['intercept'] = 1


inf 1.277087330699811 [-0.1885      0.00195699 -0.03323569  0.02158108 -0.02613038  0.06017326
 -0.00082385  0.04670515 -0.00586966 -0.08305947 -0.05073204 -0.02579005
 -0.0162064   0.05720434 -0.02204172 -0.02854406  0.00187078 -0.0365    ]
1.277087330699811 0.348697298264264 [-0.32963211 -0.06986388 -0.09908554 -0.16993597 -0.13235628  0.12432232
  0.01653304  0.20449933  0.0928154   0.0365006  -0.13811526 -0.17724018
  0.21360437  0.03818619  0.11556504 -0.10893915 -0.07490079 -0.00894265]
0.348697298264264 0.3793457076431319 [-0.3210102  -0.07498687 -0.11177606 -0.21005213 -0.12994911  0.11862478
  0.03702049  0.20641703  0.11874037  0.04956739 -0.13903638 -0.17221824
  0.22984212  0.0424923   0.10027755 -0.12722647 -0.08240259 -0.00346629]
0.3784084184873166
{'.': array([-0.31438754, -0.07373911, -0.11915824, -0.21507381, -0.12695785,
        0.12334503,  0.03917741,  0.20079763,  0.11271958,  0.03950703,
       -0.13336288, -0.17537151,  0.23991569,  0.03957844,  0.09904977,
    

0.45675052489190815 0.5065430219993907 [-0.16783247  0.23173715 -0.14058522  0.12491207 -0.0711126   0.11883797
  0.21167339  0.07578079  0.15200585 -0.06482158  0.30400641 -0.01204238
  0.03615716 -0.07419964 -0.24567254  0.08642422  0.14626252 -0.12739737]
0.5065430219993907 0.46857965122970247 [-0.17360813  0.21880051 -0.13574809  0.12912272 -0.07659614  0.12986227
  0.20797562  0.06247694  0.13736124 -0.08323164  0.28302126 -0.00618801
  0.02522106 -0.06509889 -0.24707161  0.07888435  0.14518165 -0.13233741]
0.47514204276394423
{'.': array([-0.31438754, -0.07373911, -0.11915824, -0.21507381, -0.12695785,
        0.12334503,  0.03917741,  0.20079763,  0.11271958,  0.03950703,
       -0.13336288, -0.17537151,  0.23991569,  0.03957844,  0.09904977,
       -0.13572225, -0.07621746, -0.00489734]), 'ADJ': array([-0.12325594,  0.08252131,  0.02500448,  0.05251214,  0.02885912,
        0.06438365,  0.04164942,  0.00651074, -0.04089177, -0.09859565,
       -0.0300196 , -0.0201406 , -0.02470

inf 0.43834142866160775 [-0.208       0.03422584 -0.04525726  0.0405514  -0.01793316  0.05479104
  0.01343328  0.04876504 -0.02654082 -0.11301871 -0.04839436 -0.02170964
 -0.0410558   0.06349368 -0.02236942 -0.02586129  0.01932256 -0.0465    ]
0.43834142866160775 0.3436855981473417 [-0.20836461  0.08203621 -0.0168355   0.0263691   0.04843951  0.03315932
  0.04325561  0.00407623 -0.08572713 -0.08253787 -0.03939689  0.00141095
 -0.0668228   0.02243647 -0.0091789   0.01355905  0.04342233 -0.08351407]
0.3410291001905791
{'.': array([-0.31438754, -0.07373911, -0.11915824, -0.21507381, -0.12695785,
        0.12334503,  0.03917741,  0.20079763,  0.11271958,  0.03950703,
       -0.13336288, -0.17537151,  0.23991569,  0.03957844,  0.09904977,
       -0.13572225, -0.07621746, -0.00489734]), 'ADJ': array([-0.12325594,  0.08252131,  0.02500448,  0.05251214,  0.02885912,
        0.06438365,  0.04164942,  0.00651074, -0.04089177, -0.09859565,
       -0.0300196 , -0.0201406 , -0.02470014,  0.06588839

In [None]:
test_feature_set = pd.DataFrame()
l_test = []
for sent in testing_sents:
    for word in sent:
        row = {}
        tag = word[1]
        word = word[0]
        row['word'] = word
        row['length'] = len(word)
        row['tag'] = tag
        for i in range(len(w2v_model.wv[word])):
            row[i] = w2v_model.wv[word][i]
        l_test.append(row)

In [None]:
test_feature_set = pd.DataFrame(l_test)
test_feature_set.head()

In [None]:
l_pred = []
for index, row in test_feature_set.iterrows():
    pred_row = {}
    max_confidence = -np.inf
    pred_tag = '.'
    pred_row['word'] = row['word']
    for tag in unique_tags:
        w = w_set[tag]
        x = []
        for key, val in row.iteritems():
          if key!='word' and key!= 'tag':
            x.append(val)
        x.append(1)
        x = np.array(x)
        confidence = model.predict_confidence(w,x)
        if(max_confidence<confidence):
            max_confidence = confidence
            pred_tag = tag
    pred_row['pred_tag'] = pred_tag
    pred_row['actual_tag'] = row['tag']
    l_pred.append(pred_row)

In [None]:
pred_df = pd.DataFrame(l_pred)

(pred_df['pred_tag']==pred_df['actual_tag']).value_counts(normalize = True)

True     0.716982
False    0.283018
dtype: float64

In [None]:
pred_df.to_pickle('SVM_accuracy_final_new_'+str(division)+'.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>