# POS tag using Logistic Regression

## Loading word embeddings
First we load the pretrained GloVe word embeddings trained on twitter data.

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import os.path

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, classification_report
from tqdm import tqdm

TRAIN_DATA_PATH = 'data/train.txt'

# create a sklearn model
vectorizer = DictVectorizer(sparse=False)

# Create a directory 'pretrained_embeds/' in the same directory as this notebook
# Download twitter embeddings from http://nlp.stanford.edu/data/glove.twitter.27B.zip
# Unzip it and place file 'glove.twitter.27B.25d.txt' in 'pretrained_embeds/' directory.

# We are doing it with 25 dimensional word embeddings, however we can try doing with more 
# dimensional embeddings available.

# If glove embeds is not in word2vec form then first convert it then load it
if os.path.isfile('pretrained_embeds/gensim_glove_vectors.txt'):
    glove_model = KeyedVectors.load_word2vec_format("pretrained_embeds/gensim_glove_vectors.txt", binary=False)
else:
    glove2word2vec(glove_input_file="pretrained_embeds/glove.twitter.27B.50d.txt", word2vec_output_file="pretrained_embeds/gensim_glove_vectors.txt")
    glove_model = KeyedVectors.load_word2vec_format("pretrained_embeds/gensim_glove_vectors.txt", binary=False)

def get_embed(word):
    # Case folding
    word = word.lower()
    try:
        return (glove_model.get_vector(word))
    except:
        return (glove_model.get_vector('unk'))


## Creating dataset

Loading data using nltk (we are using brown corpus) and splitting data in train and test.

In [30]:
# tagged_sents = brown.tagged_sents(tagset='universal')
with open('./traino.txt', 'r') as infile:
    data = infile.read()
    # Extracting the sentences from data and creating a sentences list []
    sentences = data.strip().split('\n\n')
    processed_sentences = []
    for sentence in sentences:
        sent = []
        # Split the sentence into individual lines (tokens and tags)
        lines = sentence.strip().split('\n')
        # Extract the tokens and tags from each line
        tokens_tags = [line.split() for line in lines]
        # Extract the tokens and tags into separate lists
        for token_tag in tokens_tags:
            token, tag = token_tag[0], token_tag[1]
            sent.append((token, tag))
        processed_sentences.append(sent)


# Splitting train and test(80:20)
tagged_sents = processed_sentences
train_len = int(len(tagged_sents) * 0.8)
train_sents = tagged_sents
test_sents = tagged_sents[train_len:]

brown_tags_words = []
brown_tags_words_test = []

train_tags = []
train_words = []
train_embeds = []

test_tags = []
test_words = []
test_embeds = []
# Create Train data
for sent in train_sents:
    brown_tags_words.extend([ (tag, word) for (word, tag) in sent ])

# # Seperate out tag and word sequences
for (tag, word) in brown_tags_words:
    train_tags.append(tag)
    train_words.append(word)
    # golve train_embeds
    train_embeds.append(get_embed(word))

# Create Test data
for sent in test_sents:
    brown_tags_words_test.extend([ (tag, word) for (word, tag) in sent ])

# Seperate out tag and word sequences
for (tag, word) in brown_tags_words_test:
    test_tags.append(tag)
    test_words.append(word)
    # golve test_embeds
    test_embeds.append(get_embed(word))

# # # Adding bias at the end of each embedding
train_embeds = np.asarray(train_embeds)
test_embeds = np.asarray(test_embeds)

# # Adding bias at the end of each embedding
temp = np.ones((train_embeds.shape[0], train_embeds.shape[1] + 1))
temp[:,:-1] = train_embeds
train_embeds = temp

# # Adding bias at the end of each embedding
temp = np.ones((test_embeds.shape[0], test_embeds.shape[1] + 1))
temp[:,:-1] = test_embeds
test_embeds = temp

In [None]:
lr = LogisticRegression(solver='newton-cg', verbose=1, n_jobs=-1)
lr.fit(train_embeds, train_tags)
filename = 'glovelr.pkl'
import joblib
joblib.dump(lr, filename)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


max_iter reached after 677 seconds




In [25]:
import joblib
lrn = joblib.load('glovelr.model')
predictions = lr.predict(test_embeds)
print('Accuracy:', accuracy_score(test_tags, predictions))
print(classification_report(test_tags, predictions))
# custom word2vec model trained on our training data
# Accuracy: Accuracy: 0.7612754550157551
# pretrained glove gensim model
# Accuracy: 0.843319381084513
# pretrained word2vec gensim model
# Accuracy: 0.7014532286130838

Accuracy: 0.843319381084513


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           #       1.00      1.00      1.00         9
           $       1.00      1.00      1.00       364
          ''       1.00      0.97      0.98       263
           (       0.00      0.00      0.00        62
           )       0.00      0.00      0.00        63
           ,       1.00      1.00      1.00      2166
           .       1.00      1.00      1.00      1773
           :       1.00      0.96      0.98       239
          CC       1.00      1.00      1.00      1067
          CD       0.65      1.00      0.79      1864
          DT       1.00      0.98      0.99      3510
          EX       0.90      1.00      0.95        43
          FW       0.00      0.00      0.00         3
          IN       0.93      0.98      0.95      4548
          JJ       0.73      0.62      0.67      2488
         JJR       0.79      0.85      0.82       172
         JJS       0.91      0.83      0.87        82
          MD       0.99    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
with open('./test_data.txt', 'r',encoding="utf8") as file:
    data = file.read()
    sentences = data.strip().split('\n\n')
    processed_sentences = []
    for sentence in sentences:
        sent = []
        # Split the sentence into individual lines (tokens and tags)
        lines = sentence.strip().split('\n')
        # append the lines to the processed_sentences list
        processed_sentences.append(lines)

test_embeds1 = []
test_words1 = []
for sent in processed_sentences:
    # test_tags.append(tag)
    for word in sent:
        test_words1.append(word)
    # golve test_embeds
        test_embeds1.append(get_embed(word))

# # Adding bias at the end of each embedding
test_embeds1 = np.asarray(test_embeds1)
temp = np.ones((test_embeds1.shape[0], test_embeds1.shape[1] + 1))
temp[:,:-1] = test_embeds1
test_embeds1 = temp

predictions = lr.predict(test_embeds1)
# print(len(predictions))

In [28]:
print("Writing to file...")
with open("FILENAME.txt", "w") as output:
    for i in range(len(predictions)):
        if test_words1[i] == '.':
            output.write(test_words1[i] + ' ' + predictions[i] + '\n')
            output.write('\n')
        else:
          output.write(test_words1[i] + ' ' + predictions[i] + '\n')
      # output.write('\n')
print("Done!")

Writing to file...
Done!
