In [None]:
import os

# The project modules below are imported by bare name, so the notebook switches
# into `modules/` first (presumably that is where they live). The guard makes
# the cell safe to re-run: a second unconditional chdir would look for
# `modules/modules` and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'modules':
    os.chdir('modules')
from train import train, score
from get_data import get_data
from model import LangID, LogisticRegression, ComplementNB, BernoulliNB
from get_gpt_reviews import get_gpt_reviews
import matplotlib.pyplot as plt
from random import shuffle
import numpy as np

# Dev split: used as the evaluation set by every experiment cell below.
Xt, Yt = get_data("dev")

In [None]:
import numpy as np

# Sweep: dev accuracy of a logistic regression as a growing fraction `p` of the
# extra examples in the "gpt_<size>" set is added on top of the `size` base ones.
for size in [2000]:
    X_all, Y_all = get_data("gpt_" + str(size))
    # Every other call in this notebook unpacks get_data() into an (X, Y) pair,
    # so len() of the return value itself is 2; the example count is len(X).
    X_base, _ = get_data("n_" + str(size))
    assert len(X_base) == size, (len(X_base), size) # Should be 2000 in the end
    new_len = len(X_all) - size
    print("Augmented 'x' size of original:", new_len/size, "for size", size)
    ps, scores = [], []
    for i in range(0, 101, 10):
        p = i/100
        # data_size >= size > 0, so the negative slice below never becomes
        # [-0:] (which would silently select everything).
        data_size = int(size + p*new_len)
        # Slice from the back so the base data is always included.
        X, Y = X_all[-data_size:], Y_all[-data_size:]
        model = LogisticRegression(max_iter=100)
        model.fit(X, Y)
        acc = (model.predict(Xt) == np.array(Yt)).mean()
        scores.append(acc)
        ps.append(p)
    plt.plot(ps, scores)
    plt.xlabel("fraction of extra examples used")
    plt.ylabel("dev accuracy")
    plt.show()

In [None]:
from get_data import get_data

In [None]:
# Sweep: score of the model built by train() as a growing fraction `p` of the
# extra examples is added to the base data. Relies on X_all, Y_all and new_len
# left behind by an earlier sweep cell.

# `base_len` is not assigned in any visible cell, so a fresh run would raise
# NameError here. new_len is defined as len(X_all) - <base size>, so the base
# size can be recovered from the two.
# NOTE(review): assumes base_len was meant to be that base size — confirm.
base_len = len(X_all) - new_len

ps, scores = [], []
for i in range(20, 101, 10):
    p = i/100
    # The "- 1" keeps data_size strictly below len(X_all), even at p == 1.
    data_size = int(base_len + p*new_len - 1)
    data = list(zip(X_all[-data_size:], Y_all[-data_size:])) # Go from back, so we always get base data
    # Shuffle pairs together so base and extra examples are interleaved.
    # NOTE(review): unseeded, so the order differs between runs.
    shuffle(data)
    X, Y = zip(*data)
    model, vocab = train(X, Y, epochs=15, embed_dim=100, lstm_dim=100, min_df=0., max_df=1., batch_size=64)

    scores.append(score(model, vocab, Xt, Yt))
    ps.append(p)
plt.plot(ps, scores)
plt.show()

In [None]:
# Sweep: for each min_df, score of the model built by train() as a growing
# fraction `p` (0 to 0.5) of the extra examples is added to the base data.
# Relies on X_all, Y_all and new_len left behind by an earlier sweep cell.

# `base_len` is not assigned in any visible cell, so a fresh run would raise
# NameError here. new_len is defined as len(X_all) - <base size>, so the base
# size can be recovered from the two.
# NOTE(review): assumes base_len was meant to be that base size — confirm.
base_len = len(X_all) - new_len

# The loop variable used to be ignored (train() was always called with
# min_df=3). It is now passed through; the list holds 3 so results are unchanged.
for min_df in [3]:
    ps, scores = [], []
    for i in range(0, 51, 10):
        p = i/100
        # The "- 1" keeps data_size strictly below len(X_all); note that at
        # p == 0 it also drops one base example.
        data_size = int(base_len + p*new_len - 1)
        X = X_all[-data_size:] # Go from back, so we always get base data
        Y = Y_all[-data_size:]

        model, vocab = train(X, Y, epochs=20, embed_dim=100, lstm_dim=100, min_df=min_df, batch_size=512)
        scores.append(score(model, vocab, Xt, Yt))
        ps.append(p)
    plt.plot(ps, scores)
    plt.title('min_df: ' + str(min_df))
    plt.show()

In [None]:
# NOTE(review): BaseEstimator, TransformerMixin and TfidfVectorizer are not
# imported in any visible cell — confirm they are in scope before this runs.
class OnehotTransformer(BaseEstimator, TransformerMixin):
    """Encode each sentence as a 0/1 vector over a fitted vocabulary.

    `fit` learns the vocabulary with a TfidfVectorizer; `transform` marks, for
    every sentence, which vocabulary words occur in it (presence only, no
    counts or weights).

    Parameters
    ----------
    min_df : int or float, default=25
        Forwarded to ``TfidfVectorizer(min_df=...)`` when the vocabulary is
        built. The default matches the previously hard-coded value.

    Attributes
    ----------
    vocab : dict
        The vectorizer's ``vocabulary_`` (word -> column index); set by `fit`.
    """

    def __init__(self, min_df=25):
        # Stored under the same name as the argument so that BaseEstimator's
        # parameter introspection can see it.
        self.min_df = min_df

    def convert(self, sentence):
        """Return the 0/1 list for one sentence (length == len(self.vocab)).

        NOTE(review): words come from a plain whitespace split plus lower(),
        while the vocabulary comes from TfidfVectorizer's own tokenizer. Tokens
        with attached punctuation (e.g. "good.") will therefore not match —
        confirm this is acceptable for the input data.
        """
        output = [0]*len(self.vocab)
        for word in sentence.split():
            idx = self.vocab.get(word.lower())
            if idx is not None:
                output[idx] = 1
        return output

    def fit(self, X, y=None):
        """Learn the vocabulary from the sentences in X; `y` is ignored."""
        vectorizer = TfidfVectorizer(min_df=self.min_df)
        vectorizer.fit(X)
        self.vocab = vectorizer.vocabulary_
        return self

    def transform(self, X, y=None):
        """Return a list with one 0/1 list per sentence in X; `y` is ignored."""
        return [self.convert(row) for row in X]

# Sweep: dev score of a one-hot + logistic-regression pipeline as a growing
# fraction `p` of the extra examples is added to the base data. Relies on
# X_all, Y_all and new_len left behind by an earlier sweep cell.
# NOTE(review): `Pipeline` is not imported in any visible cell — confirm it is
# in scope before this runs.

# `base_len` is not assigned in any visible cell, so a fresh run would raise
# NameError here. new_len is defined as len(X_all) - <base size>, so the base
# size can be recovered from the two.
# NOTE(review): assumes base_len was meant to be that base size — confirm.
base_len = len(X_all) - new_len

ps, scores = [], []
for i in range(0, 101, 5):
    p = i/100
    # The "- 1" keeps data_size strictly below len(X_all), even at p == 1;
    # note that at p == 0 it also drops one base example.
    data_size = int(base_len + p*new_len - 1)
    X = X_all[-data_size:] # Go from back, so we always get base data
    Y = Y_all[-data_size:]

    pipe = Pipeline([
        ('onehot', OnehotTransformer()),
        ('clf', LogisticRegression(max_iter=1000))
    ])
    pipe.fit(X, Y)

    scores.append(pipe.score(Xt, Yt))
    ps.append(p)
plt.plot(ps, scores)
# Matches the other sweep cells and suppresses the Line2D repr.
plt.show()

In [None]:
# Sanity check: show only the first few (x, y) pairs of the "gpt_100" set
# instead of dumping the whole dataset into the cell output.
_X_peek, _Y_peek = get_data("gpt_100")
list(zip(_X_peek[:5], _Y_peek[:5]))

In [None]:
# Fit a logistic regression on the "gpt_100" set, then ask it for class
# probabilities on those same training examples. `probs` and `Y` are read by
# the next cell to find examples whose label disagrees with the model.
X, Y = get_data("gpt_" + str(100))

# Reset the plotting accumulators shared by the sweep cells.
ps = []
scores = []

model = LogisticRegression(max_iter=100)
model.fit(X, Y)
probs = model.predict_proba(X)

In [None]:
# Collect the training examples whose label disagrees with the classifier.
#
# Worked example: p = 0.1 (predicted probability, taken from column 1 of
# `probs`) and l = 1 (label) give abs(0.1 - 1) = 0.9 > 0.5, so the example is
# flagged.
# Assumes labels are numeric 0/1 and column 1 is the probability of label 1 —
# TODO confirm.
#
# Result: (probability, position) tuples in ascending probability order, so
# the most confidently contradicted labels sit at both ends of the list.
idxs = sorted(
    (prob, row)
    for row, (prob, label) in enumerate(zip(probs[:, 1], Y))
    if abs(prob - label) > 0.5
)

In [None]:
# For increasing percentages of the flagged ("poor label") examples removed,
# repeat the augmentation sweep and plot dev accuracy.
# `idxs` holds (probability, position) tuples sorted by probability.
# NOTE(review): `idxs` is built in the cell that loads "gpt_100", while this
# cell loads "gpt_10"; the positions only make sense if both refer to the same
# dataset — confirm which size is intended.
size = 10
X_loaded, Y_loaded = get_data("gpt_" + str(size))
for del_p in range(0, 51, 10):
    # Half of the deletions come from each end of the sorted list.
    del_size = int((del_p / 100)*len(idxs))//2
    low_end = idxs[:del_size]
    # idxs[-0:] is the whole list, so the empty case needs its own branch.
    high_end = idxs[-del_size:] if del_size else []
    # Keep only the positions: enumerate() below yields ints, which would
    # never match the (probability, position) tuples themselves.
    del_idxs = {i for _, i in low_end + high_end}
    print(len(del_idxs))

    # Always filter the freshly loaded data. Re-filtering the previous
    # iteration's result would shift positions and delete the wrong rows.
    X_all = np.array([x for i, x in enumerate(X_loaded) if i not in del_idxs])
    Y_all = np.array([y for i, y in enumerate(Y_loaded) if i not in del_idxs])

    new_len = len(X_all) - size
    ps, scores = [], []
    for i in range(0, 101, 20):
        p = i/100
        # data_size >= size > 0, so the negative slice never becomes [-0:].
        data_size = int(size + p*new_len)
        X, Y = X_all[-data_size:], Y_all[-data_size:]
        model = LogisticRegression(max_iter=100)
        model.fit(X, Y)
        acc = (model.predict(Xt) == np.array(Yt)).mean()
        scores.append(acc)
        ps.append(p)
    plt.plot(ps, scores)
    plt.title('Percentage of poor labels deleted: ' + str(del_p))
    plt.show()