In [1]:
import os
os.chdir('modules')
from train import train, score
from get_data import get_data
from model import LangID, LogisticRegression, ComplementNB, BernoulliNB
from get_gpt_reviews import get_gpt_reviews
import matplotlib.pyplot as plt
from random import shuffle
import numpy as np
Xt, Yt = get_data("dev")

Device used = cuda


In [None]:
clean = True
data_type = "gpt"
ns = [10, 50, 100, 500, 2000]


for n in ns:
    data_type = 'clean_' if clean else '' + data_type + f"_{n}"
    X_all, Y_all = get_data(data_type)
    aug_len = len(X_all) - n
    print("Augmented 'x' size of original:", aug_len/n, "for size", n)
    
    ps, scores = [], []
    for i in range(0, 101, 10):
        p = i/100
        data_size = int(n + p*aug_len) # exclusive to avoid indexing [-1:]
        X, Y = X_all[-data_size:], Y_all[-data_size:]
        model = LogisticRegression(max_iter=100)
        model.fit(X, Y)
        acc = (model.predict(Xt) == np.array(Yt)).mean()
        scores.append(acc)
        ps.append(p)
    plt.plot(ps, scores)
    plt.title('Size: ' + str(n))
    plt.show()

Augmented 'x' size of original: 199.0 for size 10


In [None]:
ps, scores = [], []
for i in range(20, 101, 10):
    p = i/100
    data_size = int(base_len + p*new_len - 1) # exclusive to avoid indexing [-1:]
    data = list(zip(X_all[-data_size:], Y_all[-data_size:])) # Go from back, so we always get base data
    shuffle(data)
    X, Y = zip(*data)
    model, vocab = train(X, Y, epochs=15, embed_dim=100, lstm_dim=100, min_df=0., max_df=1., batch_size=64)
    
    scores.append(score(model, vocab, Xt, Yt))
    ps.append(p)
plt.plot(ps, scores)
plt.show()

In [None]:
for min_df in [0]:
    ps, scores = [], []
    for i in range(0, 51, 10):
        p = i/100
        data_size = int(base_len + p*new_len - 1) # exclusive to avoid indexing [-1:]
        X = X_all[-data_size:] # Go from back, so we always get base data
        Y = Y_all[-data_size:]

        model, vocab = train(X, Y, epochs=20, embed_dim=100, lstm_dim=100, min_df=3, batch_size=512)
        scores.append(score(model, vocab, Xt, Yt))
        ps.append(p)
    plt.plot(ps, scores)
    plt.show()

In [None]:
class OnehotTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def convert(self, sentence):# [[w1, w2, w3], [w1, w2, w3]]
        output = [0]*len(self.vocab)
        for word in sentence.split():
            word = word.lower()
            if word in self.vocab:
                output[self.vocab[word]] = 1
        return output

    def fit(self, X, y=None):
        vectorizer = TfidfVectorizer(min_df = 25)
        vectorizer.fit(X)
        self.vocab = vectorizer.vocabulary_
        return self
    
    def transform(self, X, y=None):
        X_ = [self.convert(row) for row in X]
        return X_

ps, scores = [], []
for i in range(0, 101, 5):
    p = i/100
    data_size = int(base_len + p*new_len - 1) # exclusive to avoid indexing [-1:]
    X = X_all[-data_size:] # Go from back, so we always get base data
    Y = Y_all[-data_size:]

    pipe = Pipeline([
        ('onehot', OnehotTransformer()),
        ('clf', LogisticRegression(max_iter=1000))
    ])
    pipe.fit(X, Y)
    
    scores.append(pipe.score(Xt, Yt))
    ps.append(p)
plt.plot(ps, scores)

In [3]:
n = 10
X_all, Y_all = get_data("gpt_" + str(n))
ps, scores = [], []
model = LogisticRegression(max_iter=100)
model.fit(X_all, Y_all)
probs = model.predict_proba(X_all)

# Assume the following:
# p = 0.1 (of being positive)
# l = 1 (positive)

# p - l -> abs(0.01 - 1) > 0.99 
# Will catch wrong predictions
poor_idxs = sorted([(abs(p - l), i) for p, l, i in zip(probs[:,1], Y_all, range(len(probs))) if abs(p - l) > 0.5], reverse=True)
print(poor_idxs[:5])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[(0.9920405845147653, 4618), (0.9887748239701647, 10458), (0.9786618772489212, 9581), (0.9783390961017192, 6006), (0.9778834074586763, 6462)]


In [None]:
for del_p in range(0, 51, 10):
    del_size = int((del_p / 100)*len(poor_idxs))
    del_idxs = set(poor_idxs[:del_size])
    print('Deleting', len(del_idxs), 'constituting', len(del_idxs)/len(X_all), 'percent')
    
    X = [x for i, x in enumerate(X_all) if i not in del_idxs]
    Y = [y for i, y in enumerate(Y_all) if i not in del_idxs]
    
    new_len = len(X) - size
    ps, scores = [], []
    for i in range(0, 101, 20):
        p = i/100
        data_size = int(size + p*new_len) # exclusive to avoid indexing [-1:]
        x, y = X[-data_size:], Y[-data_size:]
        model = LogisticRegression(max_iter=100)
        model.fit(x, y)
        acc = (model.predict(Xt) == np.array(Yt)).mean()
        scores.append(acc)
        ps.append(p)
    plt.plot(ps, scores)
    plt.title('Percentage of poor labels deleted: ' + str(del_p))
    plt.show()

In [30]:
min_df = 1
max_df = 1.
n_gram_range = (1, 1)
for size in [10, 50, 100, 500, 2000]:
    Xt, Yt = get_data("dev")
    X, Y = get_data("n_" + str(size), early_return=False)
    model = LogisticRegression(max_iter=100, ngram_range=n_gram_range, min_df=min_df, max_df=max_df, verbose_vocab=True)
    model.fit(X, Y)
    print((model.predict(Xt) == np.array(Yt)).mean())

    Xt, Yt = get_data("dev", cleanText=True)
    X, Y = get_data("n_" + str(size), early_return=False, cleanText=True)
    model = LogisticRegression(max_iter=100, ngram_range=n_gram_range, min_df=min_df, max_df=max_df, verbose_vocab=True)
    model.fit(X, Y)
    print((model.predict(Xt) == np.array(Yt)).mean())
    
    print('\n')


Fitted vocab size: 186
0.5634847080630213
Fitted vocab size: 180
0.5619786839666358


Fitted vocab size: 986
0.705746061167748
Fitted vocab size: 943
0.7230074142724745


Fitted vocab size: 1358
0.6847775718257646
Fitted vocab size: 1291
0.7016913809082483


Fitted vocab size: 3957
0.7572984244670992
Fitted vocab size: 3711
0.7898517145505097


Fitted vocab size: 9620
0.8096617238183503
Fitted vocab size: 8966
0.8396663577386468




In [29]:
min_df = 5
max_df = .5
n_gram_range = (1, 1)
for n in [10, 50, 100, 500, 2000]:
    Xt, Yt = get_data("dev")
    X, Y = get_data("gpt_" + str(n), early_return=False)
    X, Y = X[-5000:], Y[-5000:]
    model = LogisticRegression(max_iter=100, ngram_range=n_gram_range, min_df=min_df, max_df=max_df, verbose_vocab=True)
    model.fit(X, Y)
    print((model.predict(Xt) == np.array(Yt)).mean())

    Xt, Yt = get_data("dev", cleanText=True)
    X, Y = get_data("gpt_" + str(n), early_return=False, cleanText=True)
    X, Y = X[-5000:], Y[-5000:]
    model = LogisticRegression(max_iter=100, ngram_range=n_gram_range, min_df=min_df, max_df=max_df, verbose_vocab=True)
    model.fit(X, Y)
    print((model.predict(Xt) == np.array(Yt)).mean())
    
    print('\n')


Fitted vocab size: 640
0.7260194624652456
Fitted vocab size: 590
0.754633920296571


Fitted vocab size: 1409
0.696014828544949
Fitted vocab size: 1280


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7634383688600556


Fitted vocab size: 1546


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7160565338276181
Fitted vocab size: 1436


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7311167747914736


Fitted vocab size: 1884
0.7524328081556997
Fitted vocab size: 1806


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7714318813716404


Fitted vocab size: 2448
0.7879981464318814
Fitted vocab size: 2300


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.821014828544949




In [4]:
min_df = 10
max_df = .5
n_gram_range = (1, 1)
for n in [10, 50, 100, 500, 2000]:
    Xt, Yt = get_data("dev")
    X, Y = get_data("gpt_" + str(n), early_return=False)
    X, Y = X[-5000:], Y[-5000:]
    model = LogisticRegression(max_iter=100, ngram_range=n_gram_range, min_df=min_df, max_df=max_df, verbose_vocab=True)
    model.fit(X, Y)
    print((model.predict(Xt) == np.array(Yt)).mean())

    Xt, Yt = get_data("dev", cleanText=True)
    X, Y = get_data("gpt_" + str(n), early_return=False, cleanText=True)
    X, Y = X[-5000:], Y[-5000:]
    model = LogisticRegression(max_iter=100, ngram_range=n_gram_range, min_df=min_df, max_df=max_df, verbose_vocab=True)
    model.fit(X, Y)
    print((model.predict(Xt) == np.array(Yt)).mean())
    
    print('\n')


Fitted vocab size: 446
0.7170991658943466
Fitted vocab size: 406
0.7652919369786839


Fitted vocab size: 1089
0.7048192771084337
Fitted vocab size: 1039


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7371408711770158


Fitted vocab size: 1194


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6989110287303059
Fitted vocab size: 1154


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7336654309545876


Fitted vocab size: 1061
0.7439759036144579
Fitted vocab size: 1024


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7849860982391103


Fitted vocab size: 1384
0.7859128822984245
Fitted vocab size: 1311
0.8093141797961075


