- Model
  - simple FFN
  - high level pytorch utilities/layers
- Data
  - use embeddings
  
  - [classification of nouns/verbs/adjectives](https://huggingface.co/datasets/batterydata/pos_tagging), [Penn Treebank tag set](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)
  
  - [sentiment classification (pos/neg/neutral)](https://huggingface.co/datasets/syedkhalid076/Sentiment-Analysis)
  

- Exercise
  - first: binary classification (shown as a worked example)
  - then: extend to multi-class classification (left as an exercise)




### Imports

In [3]:
import ast
import json

import numpy as np
import pandas as pd
from tqdm import tqdm

In [32]:
# Read the sentiment training split from the working copy and display how many
# rows carry each value of the `label` column.
df_train = pd.read_csv(
    '../sentiment-analysis-copy/train_data.csv',
    sep=',',
)
df_train.loc[:, 'label'].value_counts()

label
2    128
0     90
1     22
Name: count, dtype: int64

In [34]:
# Write a reduced copy of the sentiment test split: only the first N_TEST_ROWS
# rows of the original file are kept.
# The frame gets its own name so the training frame `df_train` loaded earlier
# is not silently replaced by test rows.
N_TEST_ROWS = 30

df_test_small = pd.read_csv('../Sentiment-Analysis/test_data.csv', delimiter=',').head(N_TEST_ROWS)
# index=False: do not add the pandas row index as an extra CSV column.
df_test_small.to_csv('../sentiment-analysis-copy/test_data.csv', index=False)

### Read Penn Treebank Annotations

In [2]:
# Build `penn_treebank`: a mapping from Penn Treebank POS tag to class index,
# covering only noun (0), verb (1) and adjective (2) tags.
idx2label = {0: 'noun', 1: 'verb', 2: 'adjective'}
penn_treebank = {}

# Context manager so the file handle is closed deterministically.
with open('../pos_tagging/penn_treebank.txt', 'rt', encoding='utf8') as fr:
    penn_rules = fr.read().split('\n')

# The first line is skipped (presumably a header row — confirm against the file).
for rule in penn_rules[1:]:
    columns = rule.split('\t')
    if len(columns) < 2:
        # Blank or tab-less line (e.g. produced by a trailing newline): no tag column.
        continue
    pos = columns[1]
    # Substring match, checked in noun -> verb -> adjective order; the first hit wins.
    if 'NN' in pos:
        penn_treebank[pos] = 0
    elif 'VB' in pos:
        penn_treebank[pos] = 1
    elif 'JJ' in pos:
        penn_treebank[pos] = 2

print(penn_treebank)

{'JJ': 2, 'JJR': 2, 'JJS': 2, 'NN': 0, 'NNS': 0, 'NNP': 0, 'NNPS': 0, 'VB': 1, 'VBD': 1, 'VBG': 1, 'VBN': 1, 'VBP': 1, 'VBZ': 1}


### Transform the data

In [4]:
# Convert the POS-tagging training records into a `word;class` CSV for the
# binary noun (0) vs. verb (1) task. Each distinct lower-cased word is written
# once, with the label of its first noun/verb occurrence.
with open('../pos_tagging/train.json', 'rt', encoding='utf8') as fr1, \
        open('../pos_tagging/train_binary.csv', 'wt', encoding='utf8') as fw1:
    train_data = fr1.read().split('\n')
    print('word', 'class', file=fw1, sep=';')
    words_train = set()
    for line in tqdm(train_data, desc='Writing Train Data'):
        line = line.strip()
        if not line:
            # An empty record (e.g. after a trailing newline) cannot be parsed.
            continue
        # SECURITY: the original used eval(), which executes arbitrary code found
        # in the data file. literal_eval only accepts Python literals.
        d = ast.literal_eval(line)
        if isinstance(d, tuple):
            # Presumably a trailing comma after the record makes it parse as a tuple;
            # the record itself is the first element.
            d = d[0]
        for i, w in enumerate(d['words']):
            label = penn_treebank.get(d['labels'][i])
            # Skip tags outside the mapping and class 2 (adjective): binary task only.
            if label is None or label == 2:
                continue
            word = w.lower()
            if word not in words_train:
                words_train.add(word)
                print(word, label, file=fw1, sep=';')


Writing Train Data: 100%|██████████| 13054/13054 [00:01<00:00, 7265.02it/s]


In [5]:
# Convert the POS-tagging test records into a `word;class` CSV for the
# binary noun (0) vs. verb (1) task. Each distinct lower-cased word is written
# once, with the label of its first noun/verb occurrence.
with open('../pos_tagging/test.json', 'rt', encoding='utf8') as fr2, \
        open('../pos_tagging/test_binary.csv', 'wt', encoding='utf8') as fw2:
    test_data = fr2.read().split('\n')
    print('word', 'class', file=fw2, sep=';')
    words_test = set()
    for line in tqdm(test_data, desc='Writing Test Data'):
        line = line.strip()
        if not line:
            # An empty record (e.g. after a trailing newline) cannot be parsed.
            continue
        # SECURITY: the original used eval(), which executes arbitrary code found
        # in the data file. literal_eval only accepts Python literals.
        d = ast.literal_eval(line)
        if isinstance(d, tuple):
            # Presumably a trailing comma after the record makes it parse as a tuple;
            # the record itself is the first element.
            d = d[0]
        for i, w in enumerate(d['words']):
            label = penn_treebank.get(d['labels'][i])
            # Skip tags outside the mapping and class 2 (adjective): binary task only.
            if label is None or label == 2:
                continue
            word = w.lower()
            if word not in words_test:
                words_test.add(word)
                print(word, label, file=fw2, sep=';')


Writing Test Data: 100%|██████████| 1451/1451 [00:00<00:00, 6367.51it/s]


### Labels

In [6]:
# Save the class-index -> class-name mapping as JSON indented by 4 spaces.
# Note: JSON object keys are strings, so the integer keys are stored as "0", "1", "2".
with open('../pos_tagging/labels.json', 'wt', encoding='utf8') as fw:
    serialized_labels = json.dumps(idx2label, indent=4)
    fw.write(serialized_labels)

### Subsample Data

In [10]:
# Load the binary noun/verb CSVs produced by the transform cells and show the
# class balance of the training split.
# keep_default_na=False: the `word` column holds arbitrary tokens, and pandas
# would otherwise turn words such as "null" or "nan" into float NaN.
df_train = pd.read_csv('../pos_tagging/train_binary.csv', delimiter=';', keep_default_na=False)
df_test = pd.read_csv('../pos_tagging/test_binary.csv', delimiter=';', keep_default_na=False)

df_train['class'].value_counts()

class
0    11716
1     4124
Name: count, dtype: int64