In [None]:
import nltk
import numpy as np  # for handling multi-dimensional array operation
import pandas as pd  # for reading data from csv 
import statsmodels.api as sm  # for finding the p-value
from sklearn.preprocessing import MinMaxScaler  # for normalization
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score 
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from gensim.models import Word2Vec

In [None]:
from nltk.corpus import brown

# Fetch the NLTK resources used below.
# NOTE(review): the visible cells only read the Brown corpus with the universal
# tagset; the 'popular' collection looks unnecessary here - confirm before removing.
nltk.download('popular')
nltk.download('brown')
nltk.download('universal_tagset')

# Tagset name passed to every Brown reader call in this cell.
tset = 'universal'

# Flat sequence of (word, tag) pairs for the whole corpus.
brown_tagged = brown.tagged_words(tagset=tset)

# Array of the distinct tag labels (np.unique returns them sorted).
unique_tags = np.unique([tag for (word,tag) in brown_tagged])

# The same corpus grouped by sentence; later cells index each item as (word, tag).
brown_tagged_sents = brown.tagged_sents(tagset=tset)

train_ratio = 0.8

# 16-dimensional word embeddings trained on every Brown sentence, i.e. including
# the sentences that are held out for testing below. min_count=0 so that no word
# is dropped for being rare (later cells look up every token).
# NOTE(review): `size=` is presumably the pre-4.0 gensim spelling (later renamed
# `vector_size`) - confirm against the installed gensim version.
w2v_model = Word2Vec(brown.sents(), size=16, window=5, min_count=0, workers=4)

# Number of tagged sentences in the corpus.
sents = len(brown_tagged_sents)

# Which test_count-sized window, counted from the END of the corpus, is held out
# (1 = last window, 2 = the one before it, ...).
division = 3

# NOTE(review): train_count is not referenced by any visible cell.
train_count = int(sents*train_ratio)

# Window size. int() truncates, so floating-point error in (1-train_ratio) can
# make this one smaller than the exact product.
test_count = int(sents*(1-train_ratio))

# Held-out window. The extra -1 on the stop index keeps it negative when
# division == 1 (a stop of -0 would be 0 and give an empty slice); as a result
# the window holds test_count-1 sentences when it fits inside the corpus.
testing_sents = brown_tagged_sents[-division*test_count:-(division-1)*test_count-1]

# Training data = everything before the window plus everything from the window's
# stop index onward, so the two sets do not overlap.
training_sents = list(brown_tagged_sents[0:-division*test_count])
training_sents.extend(brown_tagged_sents[-(division-1)*test_count-1:])

# Displayed as the cell result: (training sentence count, test sentence count).
len(training_sents), len(testing_sents)

def calc_cost(W,X,Y,C):
    """Evaluate the regularised hinge-loss objective for the weights ``W``.

    The value returned is ``0.5 * (W . W) + C * mean(max(0, 1 - Y * (X . W)))``.

    Parameters
    ----------
    W : 1-D array of weights, one entry per column of ``X``.
    X : 2-D array with one sample per row.
    Y : 1-D array of labels aligned with the rows of ``X``
        (the callers in this notebook pass +1 / -1).
    C : scalar multiplier applied to the averaged hinge loss.

    Returns
    -------
    The scalar objective value.
    """
    sample_total = X.shape[0]

    # Per-sample slack; anything below zero (margin satisfied) contributes nothing.
    slack = np.maximum(1 - Y * (np.dot(X, W)), 0)

    hinge_term = C * (np.sum(slack) / sample_total)
    l2_term = 1 / 2 * np.dot(W, W)
    return l2_term + hinge_term

def calc_grad(W,X,Y,C):
    """Sub-gradient of the regularised hinge-loss objective, averaged over a batch.

    For each sample the contribution is ``W`` when its margin is satisfied
    (``1 - Y_i * (X_i . W) <= 0``) and ``W - C * Y_i * X_i`` otherwise; the
    result is the mean of those contributions, i.e.
    ``W - C * sum_{violating i}(Y_i * X_i) / n``.

    Parameters
    ----------
    W : 1-D array of weights.
    X : 2-D array (one sample per row). A single sample may also be passed as a
        1-D feature vector or, for a one-feature model, as a scalar.
    Y : labels aligned with the rows of ``X`` (a scalar for a single sample).
    C : scalar multiplier of the hinge term.

    Returns
    -------
    1-D float array with the same length as ``W``.
    """
    W = np.asarray(W, dtype=float)
    # Normalise the inputs so a lone sample is handled like a batch of one.
    # (The previous `type(X) == np.float64` guard only caught scalar inputs and
    # let 1-D feature vectors fall through to a failing loop.)
    X = np.atleast_2d(np.asarray(X, dtype=float))
    Y = np.atleast_1d(np.asarray(Y, dtype=float))

    margins = 1 - (Y * np.dot(X, W))
    # Only samples that violate the margin contribute a data term.
    violating = margins > 0

    data_term = (Y[violating][:, None] * X[violating]).sum(axis=0)

    return W - C * data_term / len(Y)

class PoS_SVM:
    """Linear soft-margin SVM trained by mini-batch sub-gradient descent.

    The class keeps no state: ``fit`` returns the learned weight vector and the
    prediction helpers take that vector as an explicit argument.
    """

    def __init__(self):
        pass

    def fit(self,X,Y,C,u,max_iter=5000,batch_size=200,stop_cost=0.05,
            check_every=100,random_state=None,verbose=True):
        """Learn a weight vector with mini-batch sub-gradient descent.

        Parameters
        ----------
        X : 2-D array, one sample per row (include a constant column if an
            intercept is wanted).
        Y : 1-D array of labels aligned with the rows of ``X``.
        C : multiplier of the hinge term, forwarded to ``calc_grad`` / ``calc_cost``.
        u : learning rate (step size).
        max_iter : maximum number of update steps.
        batch_size : number of rows drawn (without replacement) for each step;
            when ``X`` has no more rows than this, every row is used.
        stop_cost : relative change of the full-data cost below which training
            stops.
        check_every : the cost is evaluated on every ``check_every``-th step.
        random_state : seed (or anything accepted by ``np.random.default_rng``)
            for the batch sampling; ``None`` draws fresh entropy.
        verbose : when true, print the cost trace as training proceeds.

        Returns
        -------
        1-D array of learned weights, one per column of ``X``.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        n_samples = X.shape[0]
        rng = np.random.default_rng(random_state)

        w = np.zeros(X.shape[1])
        prev_cost = np.inf
        cost = None

        for i in range(max_iter):
            # Draw only the batch indices instead of shuffling (and copying)
            # the whole dataset on every step.
            if n_samples > batch_size:
                batch = rng.choice(n_samples, size=batch_size, replace=False)
                b_X, b_Y = X[batch], Y[batch]
            else:
                b_X, b_Y = X, Y

            grad = calc_grad(w,b_X,b_Y,C)
            w = w - (u * grad)

            if(i%check_every==0):
                cost = calc_cost(w,X,Y,C)
                # Converged once the cost moved by less than stop_cost (relative).
                # On the first check prev_cost is inf, so this is never true.
                if abs(prev_cost - cost) < stop_cost * prev_cost:
                    if verbose:
                        print(cost)
                    return w

                if verbose:
                    print(prev_cost,cost,w)
                prev_cost = cost

        print('Warning : did not converge')
        if verbose:
            print(cost)
        return w

    def predict(self,W,X):
        """Return the predicted label (+1.0 or -1.0) for every row of ``X``.

        A score of exactly zero is assigned to the positive class. A single
        sample passed as a 1-D feature vector yields a one-element array.
        """
        scores = np.atleast_1d(np.dot(np.asarray(X), W))
        Y_hat = np.sign(scores).astype(float)
        Y_hat[Y_hat == 0] = 1
        return Y_hat

    def test(self,test_set):
        """Placeholder; currently does nothing and returns ``None``."""
        pass

    def predict_confidence(self,W,x):
        """Return the raw decision score ``W . x`` for one feature vector ``x``."""
        return (np.dot(W,np.array(x)))
        
        
        

# Quick sanity check of the trainer on a tiny hand-made dataset.
model = PoS_SVM()

# Two features plus a constant 1 in the last column (acts as the intercept).
X = np.array([
    [-2, -3, 1],
    [0, 0, 1],
    [1, 1, 1],
    [2, 2, 1],
    [2, 3, 1],
    [3, 4, 1],
])
Y = np.array([-1, -1, 1, 1, 1, 1])

# C = 50, learning rate = 0.01
params = model.fit(X, Y, 50, 0.01)
print(params)

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

In [None]:
# Build one feature record per training token: the surface form, its length,
# its gold tag, and the components of its Word2Vec embedding under the integer
# keys 0..dim-1.
feature_set = pd.DataFrame()  # placeholder; the real frame is built from `l` after the loop
l = []
for sent in training_sents:
    for word, tag in sent:
        # Look the embedding up once per token rather than once per component.
        vector = w2v_model.wv[word]
        row = {'word': word, 'length': len(word), 'tag': tag}
        row.update(enumerate(vector))
        l.append(row)

In [None]:
# Turn the collected per-token records into a DataFrame (one row per record in `l`).
feature_set = pd.DataFrame(l)

In [None]:
# Display the training feature table as the cell output.
feature_set

Unnamed: 0,word,length,tag,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,The,3,DET,-2.289777,-5.826371,1.526973,3.747920,2.435387,0.818528,2.849094,0.654558,-0.917720,-1.895288,-0.833723,-2.353878,2.452096,1.327866,1.101107,-0.408857
1,Fulton,6,NOUN,0.040491,0.132915,-0.056279,-0.016452,0.271780,-0.151642,0.251393,0.286276,-0.071573,-0.139619,-0.017404,-0.340492,0.167775,-0.079871,-0.089472,-0.124085
2,County,6,NOUN,1.020781,-0.052694,0.458675,1.042156,1.735041,-0.419741,1.478410,0.623497,-0.556083,-0.508752,-0.794552,-1.800663,1.566759,-0.392512,-0.890805,-0.157563
3,Grand,5,ADJ,0.334735,-0.049944,-0.022947,0.394494,0.790859,-0.218146,0.882423,0.326592,-0.233722,0.001406,-0.076867,-0.711644,0.578459,-0.132566,-0.300290,-0.043178
4,Jury,4,NOUN,0.060412,-0.026240,0.054046,0.081177,0.161740,-0.015574,0.139632,0.026019,-0.042451,0.018303,-0.045725,-0.083660,0.082292,0.001760,-0.071723,-0.057360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887950,boucle,6,NOUN,0.001331,-0.049422,-0.001470,0.061129,0.067151,-0.019182,0.074677,0.003895,-0.006227,0.039561,-0.009071,-0.049285,0.023504,-0.024106,-0.015518,-0.044374
887951,dress,5,NOUN,0.810674,0.087827,0.101039,0.521575,0.882458,-0.305475,1.186859,0.012198,-0.316733,0.136662,-0.526565,-0.527641,0.971616,0.317874,-0.372896,0.525584
887952,was,3,VERB,-0.145281,0.941213,-3.180400,-4.490648,1.394422,0.560418,4.922168,-3.253411,-3.931690,-0.695705,2.869465,-1.815448,3.548945,-0.807515,-1.698089,4.348125
887953,stupefying,10,VERB,0.027016,-0.037399,0.040278,0.004864,0.017863,0.011402,0.036062,0.018357,-0.018551,0.050266,-0.009724,-0.050810,0.065847,-0.005116,-0.001888,-0.012171


In [None]:
# Names of the model input columns: everything except the token text and its tag.
[col for col in feature_set.columns if col not in ('word', 'tag')]

['length', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [None]:
# Train one binary (tag vs. everything else) classifier per tag and keep the
# learned weight vectors in `w_set`, keyed by tag.
w_set = {}

# The design matrix is the same for every tag, so build it once: all feature
# columns followed by a constant intercept column. `.assign` returns a new
# frame, which avoids the SettingWithCopyWarning raised by writing a column
# into a selection of `feature_set`.
X = feature_set[[x for x in feature_set.columns if x not in ('word', 'tag')]].assign(intercept=1)
X_values = np.array(X)

for tag in unique_tags:
    print(tag)
    # +1 for tokens carrying this tag, -1 for all others.
    Y = np.where(feature_set['tag']==tag,1,-1)
    params = model.fit(X_values,Y,5,0.01)
    w_set[tag] = params
    print(w_set)

.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


inf 1.3142057305558528 [-0.20675    -0.00839652  0.01180966 -0.00112247 -0.07989287 -0.05135844
 -0.00036241 -0.10551912  0.02282393  0.01477248 -0.03934164  0.02133697
  0.01646004 -0.03220639  0.00391338  0.01425057 -0.05317713 -0.0405    ]
1.3142057305558528 0.351177881950185 [-0.3488173   0.17934533 -0.21437477  0.05295454 -0.14795509 -0.0294003
 -0.27532311 -0.07737076  0.0158082   0.07249974 -0.11427233 -0.05524066
 -0.12260979 -0.08150067  0.1187762   0.04781348  0.03334003 -0.01018612]
0.3593553963381284
{'.': array([-0.35813746,  0.1823527 , -0.22277199,  0.06619095, -0.14377454,
       -0.02814469, -0.29112155, -0.08063325,  0.00163661,  0.06455053,
       -0.12919061, -0.02499062, -0.11310142, -0.07697769,  0.11704842,
        0.04027705,  0.02424059, -0.00658701])}
ADJ
inf 0.9889877709025097 [-0.15425    -0.04074494  0.04165008  0.00053235 -0.07295747 -0.06035311
  0.01593883 -0.12023806  0.02377051  0.02088151 -0.03565695  0.00977702
  0.02663636 -0.04582805 -0.00193283  0

In [None]:
# Build one feature record per held-out token, with the same layout as the
# training records: surface form, length, gold tag, and the Word2Vec embedding
# components under the integer keys 0..dim-1.
test_feature_set = pd.DataFrame()  # placeholder; the real frame is built from `l_test` after the loop
l_test = []
for sent in testing_sents:
    for word, tag in sent:
        # Look the embedding up once per token rather than once per component.
        vector = w2v_model.wv[word]
        row = {'word': word, 'length': len(word), 'tag': tag}
        row.update(enumerate(vector))
        l_test.append(row)

In [None]:
# Turn the collected held-out records into a DataFrame (one row per record in `l_test`).
test_feature_set = pd.DataFrame(l_test)

In [None]:
# Score every held-out token against all per-tag classifiers in one matrix
# product and keep, for each token, the tag whose classifier gives the highest
# score (ties go to the tag that comes first in `unique_tags`).
# This replaces a row-by-row loop that relied on `Series.iteritems()`, which
# newer pandas versions no longer provide.
feature_cols = [c for c in test_feature_set.columns if c not in ('word', 'tag')]

# Same layout as at training time: feature columns followed by a constant 1.
X_test = np.asarray(test_feature_set[feature_cols].assign(intercept=1), dtype=float)

# One row of weights per tag, in `unique_tags` order.
W_all = np.vstack([w_set[tag] for tag in unique_tags])

scores = np.dot(X_test, W_all.T)   # scores[i, k] = confidence of tag k for token i
best = scores.argmax(axis=1)

l_pred = [
    {'word': word, 'pred_tag': unique_tags[k], 'actual_tag': actual}
    for word, k, actual in zip(test_feature_set['word'], best, test_feature_set['tag'])
]

In [None]:
# Display the held-out feature table as the cell output.
test_feature_set

Unnamed: 0,word,length,tag,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,and,3,CONJ,0.669592,-0.889992,0.688166,1.453029,3.285247,-0.501344,3.759818,-1.162938,1.242030,2.076923,-2.630091,0.452036,0.859170,0.695636,-1.249868,1.200546
1,he,2,PRON,5.330790,-2.904027,1.195818,0.244097,0.555801,6.716421,3.356958,-1.818095,-3.822222,7.344110,0.820978,-0.931486,3.456266,0.560627,0.180584,3.677752
2,wrote,5,VERB,2.817215,0.219640,0.188020,-0.285824,2.215656,-0.365405,2.424895,-0.139178,-0.352255,-1.336796,1.453345,-0.351881,2.192610,-0.630303,-0.463801,0.446354
3,also,4,ADV,2.113750,-0.568901,-0.978480,3.567465,1.271312,-0.119157,3.215462,-0.234105,-1.792482,1.526168,0.715829,-1.039013,1.284121,-0.366648,2.413103,2.271127
4,the,3,DET,-0.978801,-2.223613,-0.531857,4.136570,0.245745,1.126452,5.197178,-1.574464,0.675887,-0.128982,-1.731676,-1.982141,0.921595,0.793983,-0.739911,1.600094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273232,on,2,ADP,0.258404,-1.917878,1.917808,2.073266,2.187052,-1.273290,6.730378,0.176215,-1.093815,1.673910,-2.784075,2.181083,-0.549316,0.950880,-1.293948,0.557077
273233,contractual,11,ADJ,0.037047,-0.002797,0.044363,0.093412,0.186411,-0.105444,0.242857,0.033384,-0.104092,-0.026564,-0.171131,-0.113479,0.198325,0.001693,0.031408,-0.093784
273234,terms,5,NOUN,1.261803,0.007509,-0.096703,2.215673,1.179754,-0.483024,1.262908,-0.106365,-0.221043,-0.548457,-0.872622,-1.487942,1.563703,0.363053,-0.793871,1.048577
273235,;,1,.,-0.658532,-0.645562,1.771374,-0.467588,0.903352,-5.072986,4.603155,-1.351380,1.243947,2.924744,-4.493494,-3.716317,3.150929,-1.458864,3.390068,1.476562


In [None]:
# Collect the predictions into a frame and report the share of tokens whose
# predicted tag does (True) or does not (False) match the gold tag.
pred_df = pd.DataFrame(l_pred)

is_correct = pred_df['pred_tag'] == pred_df['actual_tag']
is_correct.value_counts(normalize = True)

True     0.716982
False    0.283018
dtype: float64

In [None]:
# Persist the per-token predictions; the fold index (`division`) is part of the
# file name.
out_path = 'SVM_accuracy_final_new_'+str(division)+'.pkl'
pred_df.to_pickle(out_path)

# The browser download helper only exists on Google Colab. Anywhere else the
# pickle simply stays in the working directory instead of the cell failing on
# the import.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download(out_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>