In [1]:
import os
os.chdir('modules')

In [11]:

from model import LogisticRegression, BernoulliNB, ComplementNB, LogisticRegressionPytorch,OnehotTransformer
from get_data import get_data
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
import time
import pandas as pd
from random import shuffle


X_dev, Y_dev = get_data("dev",cleanText = True)
probs_remove = [0,0.05,0.1,0.15,0.2,0.25]
sizes = [10, 50, 100, 500, 2000]

rows = len(probs_remove)*len(sizes)

data = {"percentage kept":[0.0]*rows,
       "size":[0]*rows,
       "score":[0.0]*rows,
       "vocab size base":[0]*rows,
        "vocab size final":[0]*rows,
       "time taken":[0.0]*rows}
df = pd.DataFrame(data)

z = 0
big_tic = time.time()
for size in sizes:
    tic = time.perf_counter()
    X_gpt_all, Y_gpt_all = get_data("clean_gpt_" + str(size))
    X_gpt, Y_gpt = X_gpt_all[:-size], Y_gpt_all[:-size]
    X_base, Y_base = X_gpt_all[-size:], Y_gpt_all[-size:]
    model_base = LogisticRegression(max_iter=100)
    model_base.fit(X_base, Y_base)
    probs = [(probs[0], i) for i, probs in enumerate(model_base.predict_proba(X_gpt))]
    probs_neg = sorted([j for j in probs if Y_gpt[j[1]] == 0 ],key=lambda tup: tup[0],reverse=True)
    probs_pos = sorted([j for j in probs if Y_gpt[j[1]] == 1 ],key=lambda tup: tup[0])

    for prob_remove in probs_remove:
        df.at[z,"size"] = size
        df.at[z,"percentage kept"] = (1-prob_remove)
        
        
        df.at[z,"vocab size base"] = len(model_base[0].vocab)

        keep_neg = int((1-prob_remove)*len(probs_neg))
        keep_pos = int((1-prob_remove)*len(probs_pos))
        
        probs_neg = probs_neg[:keep_neg]
        probs_pos = probs_pos[:keep_pos]
        
        final_probs = probs_neg+probs_pos
        shuffle(final_probs)
        gpt_indices = [i[1] for i in final_probs]
        X_gpt_pruned = [X_gpt[i] for i in gpt_indices]
        Y_gpt_pruned = [Y_gpt[i] for i in gpt_indices]
        X_all = X_base + X_gpt_pruned
        Y_all = Y_base + Y_gpt_pruned
        
        transformer = OnehotTransformer(ngram_range=(1, 1), min_df=0.001, max_df=0.5, verbose_vocab=True)
        transformer.fit(X_all,Y_all)
        X_all = transformer.transform(X_all)
        
        df.at[z,"vocab size final"] = len(X_all[0])
        model = LogisticRegressionPytorch(input_dim=len(X_all[0]),epochs=30,progress_bar=True)
        model.train(X_all,Y_all,batch_size=64)

        acc = model.score(transformer.transform(X_dev),Y_dev)
        print("size",size,"prob_remove",prob_remove,"acc",acc)
        toc = time.perf_counter()
        print("Time taken:",toc-tic)
        df.at[z,"time taken"] = toc-tic
        df.at[z,"score"] = acc
        z += 1
    print(f"Finished size {size}")
print("total time elapsed:",time.time()-big_tic)

Fitted vocab size: 544


  0%|          | 0/30 [00:00<?, ?it/s]

size 10 prob_remove 0 acc 0.776
Time taken: 4.357951700000285
Fitted vocab size: 541


  0%|          | 0/30 [00:00<?, ?it/s]

size 10 prob_remove 0.05 acc 0.786
Time taken: 8.680736000000252
Fitted vocab size: 541


  0%|          | 0/30 [00:00<?, ?it/s]

size 10 prob_remove 0.1 acc 0.771
Time taken: 12.653993299999911
Fitted vocab size: 564


  0%|          | 0/30 [00:00<?, ?it/s]

size 10 prob_remove 0.15 acc 0.724
Time taken: 16.364532600000075
Fitted vocab size: 541


  0%|          | 0/30 [00:00<?, ?it/s]

size 10 prob_remove 0.2 acc 0.663
Time taken: 19.118676999999934
Fitted vocab size: 557


  0%|          | 0/30 [00:00<?, ?it/s]

size 10 prob_remove 0.25 acc 0.641
Time taken: 21.702969700000267
Finished size 10
Fitted vocab size: 1214


  0%|          | 0/30 [00:00<?, ?it/s]

size 50 prob_remove 0 acc 0.814
Time taken: 88.51864780000005
Fitted vocab size: 1205


  0%|          | 0/30 [00:00<?, ?it/s]

size 50 prob_remove 0.05 acc 0.826
Time taken: 161.12651460000006
Fitted vocab size: 1185


  0%|          | 0/30 [00:00<?, ?it/s]

size 50 prob_remove 0.1 acc 0.801
Time taken: 230.84747490000018
Fitted vocab size: 1172


  0%|          | 0/30 [00:00<?, ?it/s]

size 50 prob_remove 0.15 acc 0.772
Time taken: 297.65128920000006
Fitted vocab size: 1169


  0%|          | 0/30 [00:00<?, ?it/s]

size 50 prob_remove 0.2 acc 0.75
Time taken: 351.1806753999999
Fitted vocab size: 1140


  0%|          | 0/30 [00:00<?, ?it/s]

size 50 prob_remove 0.25 acc 0.754
Time taken: 387.79562609999994
Finished size 50
Fitted vocab size: 1369


  0%|          | 0/30 [00:00<?, ?it/s]

size 100 prob_remove 0 acc 0.745
Time taken: 10.423033499999747
Fitted vocab size: 1352


  0%|          | 0/30 [00:00<?, ?it/s]

size 100 prob_remove 0.05 acc 0.762
Time taken: 19.41045099999974
Fitted vocab size: 1345


  0%|          | 0/30 [00:00<?, ?it/s]

size 100 prob_remove 0.1 acc 0.775
Time taken: 26.91087749999997
Fitted vocab size: 1328


  0%|          | 0/30 [00:00<?, ?it/s]

size 100 prob_remove 0.15 acc 0.753
Time taken: 33.52473439999994
Fitted vocab size: 1318


  0%|          | 0/30 [00:00<?, ?it/s]

size 100 prob_remove 0.2 acc 0.741
Time taken: 38.94242299999996
Fitted vocab size: 1265


  0%|          | 0/30 [00:00<?, ?it/s]

size 100 prob_remove 0.25 acc 0.738
Time taken: 43.592639399999825
Finished size 100
Fitted vocab size: 1381


  0%|          | 0/30 [00:00<?, ?it/s]

size 500 prob_remove 0 acc 0.81
Time taken: 129.43414510000002
Fitted vocab size: 1385


  0%|          | 0/30 [00:00<?, ?it/s]

size 500 prob_remove 0.05 acc 0.821
Time taken: 179.31988629999978
Fitted vocab size: 1407


  0%|          | 0/30 [00:00<?, ?it/s]

size 500 prob_remove 0.1 acc 0.818
Time taken: 218.15307209999992
Fitted vocab size: 1514


  0%|          | 0/30 [00:00<?, ?it/s]

size 500 prob_remove 0.15 acc 0.803
Time taken: 252.97415590000037
Fitted vocab size: 1826


  0%|          | 0/30 [00:00<?, ?it/s]

size 500 prob_remove 0.2 acc 0.803
Time taken: 286.62825239999984
Fitted vocab size: 2244


  0%|          | 0/30 [00:00<?, ?it/s]

size 500 prob_remove 0.25 acc 0.81
Time taken: 312.8849458000004
Finished size 500
Fitted vocab size: 1697


  0%|          | 0/30 [00:00<?, ?it/s]

size 2000 prob_remove 0 acc 0.79
Time taken: 18.390212999999676
Fitted vocab size: 1738


  0%|          | 0/30 [00:00<?, ?it/s]

size 2000 prob_remove 0.05 acc 0.814
Time taken: 25.936208899999656
Fitted vocab size: 1824


  0%|          | 0/30 [00:00<?, ?it/s]

size 2000 prob_remove 0.1 acc 0.83
Time taken: 33.294170199999826
Fitted vocab size: 1896


  0%|          | 0/30 [00:00<?, ?it/s]

size 2000 prob_remove 0.15 acc 0.835
Time taken: 39.2092646000001
Fitted vocab size: 1954


  0%|          | 0/30 [00:00<?, ?it/s]

size 2000 prob_remove 0.2 acc 0.843
Time taken: 44.86072389999936
Fitted vocab size: 2199


  0%|          | 0/30 [00:00<?, ?it/s]

size 2000 prob_remove 0.25 acc 0.848
Time taken: 49.549273199999334
Finished size 2000
total time elapsed: 815.5294308662415


In [12]:
df

Unnamed: 0,percentage kept,size,score,vocab size base,vocab size final,time taken
0,1.0,10,0.776,180,544,4.357952
1,0.95,10,0.786,180,541,8.680736
2,0.9,10,0.771,180,541,12.653993
3,0.85,10,0.724,180,564,16.364533
4,0.8,10,0.663,180,541,19.118677
5,0.75,10,0.641,180,557,21.70297
6,1.0,50,0.814,943,1214,88.518648
7,0.95,50,0.826,943,1205,161.126515
8,0.9,50,0.801,943,1185,230.847475
9,0.85,50,0.772,943,1172,297.651289


In [13]:
df.to_csv("classifier_results.csv")

In [6]:

from model import LogisticRegression, BernoulliNB, ComplementNB, LogisticRegressionPytorch,OnehotTransformer
from get_data import get_data
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
import time
import pandas as pd
from random import shuffle


X_dev, Y_dev = get_data("dev",cleanText = True)
probs_remove = [0,1]
sizes = [2000]

rows = len(probs_remove)*len(sizes)

data = {"percentage kept":[0.0]*rows,
       "size":[0]*rows,
       "score":[0.0]*rows,
       "vocab size base":[0]*rows,
        "vocab size final":[0]*rows,
       "time taken":[0.0]*rows}
df = pd.DataFrame(data)

z = 0
for size in sizes:
    
    for prob_remove in probs_remove:
        df.at[z,"size"] = size
        df.at[z,"percentage kept"] = (1-prob_remove)
        tic = time.perf_counter()
        X_gpt_all, Y_gpt_all = get_data("clean_gpt_" + str(size))
        X_gpt, Y_gpt = X_gpt_all[:-size], Y_gpt_all[:-size]
        X_base, Y_base = X_gpt_all[-size:], Y_gpt_all[-size:]
        model = LogisticRegression(max_iter=100)
        model.fit(X_base, Y_base)
        df.at[z,"vocab size base"] = len(model[0].vocab)
        
        probs = [(probs[0], i) for i, probs in enumerate(model.predict_proba(X_gpt))]
        
        probs_neg = sorted([j for j in probs if Y_gpt[j[1]] == 0 ],key=lambda tup: tup[0],reverse=True)
        probs_pos = sorted([j for j in probs if Y_gpt[j[1]] == 1 ],key=lambda tup: tup[0])

        keep_neg = int((1-prob_remove)*len(probs_neg))
        keep_pos = int((1-prob_remove)*len(probs_pos))
        
        probs_neg = probs_neg[:keep_neg]
        probs_pos = probs_pos[:keep_pos]
        
#         pruned_probs = []
#         for n_prob, idx in probs:
#             if n_prob > 1-prob_remove:
#                 if Y_gpt[idx] == 0:
#                     pruned_probs.append((n_prob,idx))
#             elif n_prob < prob_remove:
#                 if Y_gpt[idx] == 1:
#                     pruned_probs.append((n_prob, idx))
#             else:
#                 pruned_probs.append((n_prob, idx))
#         sorted_probs = sorted(pruned_probs, key=lambda x:x[0])
        final_probs = probs_neg+probs_pos
        shuffle(final_probs)
        gpt_indices = [i[1] for i in final_probs]
        X_gpt_pruned = [X_gpt[i] for i in gpt_indices]
        Y_gpt_pruned = [Y_gpt[i] for i in gpt_indices]
        X_all = X_base + X_gpt_pruned
        Y_all = Y_base + Y_gpt_pruned
        
        transformer = OnehotTransformer(ngram_range=(1, 1), min_df=0.001, max_df=0.5, verbose_vocab=True)
        transformer.fit(X_all,Y_all)
        X_all = transformer.transform(X_all)
        
        df.at[z,"vocab size final"] = len(X_all[0])
        model = LogisticRegressionPytorch(input_dim=len(X_all[0]),epochs=30,progress_bar=True)
        model.train(X_all,Y_all,batch_size=64)

        acc = model.score(transformer.transform(X_dev),Y_dev)
        print("size",size,"prob_remove",prob_remove,"acc",acc)
        toc = time.perf_counter()
        print("Time taken:",toc-tic)
        df.at[z,"time taken"] = toc-tic
        df.at[z,"score"] = toc-tic
        z += 1
    print(f"Finished size {size}")


Fitted vocab size: 1697


  0%|          | 0/30 [00:00<?, ?it/s]

size 2000 prob_remove 0 acc 0.792
Time taken: 15.895366999999624
Fitted vocab size: 3885


  0%|          | 0/30 [00:00<?, ?it/s]

size 2000 prob_remove 1 acc 0.854
Time taken: 11.450990199999978
Finished size 2000


In [4]:
import os
#os.chdir('modules')
from model import LogisticRegression, BernoulliNB, ComplementNB
from get_data import get_data
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np

for size in [10, 50, 100, 500, 2000]:
    X_base, Y_base = get_data("n_" + str(size), early_return=False)
    X_dev, Y_dev = get_data("dev")
    LR = LogisticRegression(max_iter=100)
    BNB = BernoulliNB()
    CNB = ComplementNB()
    models = [LR,BNB,CNB]
    for model in models:
        model.fit(X_base, Y_base)
        acc = (model.predict(X_dev) == np.array(Y_dev)).mean()
        print(model,acc)

Pipeline(steps=[('onehot',
                 OnehotTransformer(max_df=1.0, min_df=1, ngram_range=(1, 1),
                                   verbose_vocab=False)),
                ('clf', LogisticRegression())]) 0.5634847080630213
Pipeline(steps=[('onehot',
                 OnehotTransformer(max_df=1.0, min_df=1, ngram_range=(1, 1),
                                   verbose_vocab=False)),
                ('clf', BernoulliNB())]) 0.5405468025949953
Pipeline(steps=[('onehot',
                 OnehotTransformer(max_df=1.0, min_df=1, ngram_range=(1, 1),
                                   verbose_vocab=False)),
                ('clf', ComplementNB())]) 0.5602409638554217
Pipeline(steps=[('onehot',
                 OnehotTransformer(max_df=1.0, min_df=1, ngram_range=(1, 1),
                                   verbose_vocab=False)),
                ('clf', LogisticRegression())]) 0.705746061167748
Pipeline(steps=[('onehot',
                 OnehotTransformer(max_df=1.0, min_df=1, ngram_range=(1

In [None]:
import os
#os.chdir('modules')
from model import LogisticRegression, BernoulliNB, ComplementNB
from get_data import get_data
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as LR
import time

for size in [10, 100, 500, 2000]:
    X_gpt_all, Y_gpt_all = get_data("gpt_" + str(size))
    X_gpt, Y_gpt = X_gpt_all[:-size], Y_gpt_all[:-size]
    X_base, Y_base = X_gpt_all[-size:], Y_gpt_all[-size:]
    X_dev, Y_dev = get_data("dev")
    ps, scores = [], []
    model = LogisticRegression(max_iter=100)
    model.fit(X_base, Y_base)
    probs = [(list(probs), i) for i, probs in enumerate(model.predict_proba(X_gpt))]
    pruned_probs = []
    for (n_prob, p_prob), idx in probs:
        if n_prob > 0.8:
            if Y_gpt[idx] == 0:
                pruned_probs.append((n_prob,idx))
        elif n_prob < 0.2:
            if Y_gpt[idx] == 1:
                pruned_probs.append((n_prob, idx))
        else:
            pruned_probs.append((n_prob, idx))
    sorted_probs = sorted(pruned_probs, key=lambda x:x[0])
    #final_probs = sorted_probs[:size*25]+sorted_probs[-size*25:]
    final_probs = sorted_probs
    gpt_indices = [i[1] for i in final_probs]
    X_gpt_pruned = [X_gpt[i] for i in gpt_indices]
    Y_gpt_pruned = [Y_gpt[i] for i in gpt_indices]
    X_all = X_base + X_gpt_pruned
    Y_all = Y_base + Y_gpt_pruned
    estimators = [
         ('lr', LogisticRegression(max_iter=100, ngram_range=(1, 1), min_df=1, max_df=1., verbose_vocab=True)),
         ('bnb', BernoulliNB(ngram_range=(1, 1), min_df=1, max_df=1., verbose_vocab=True)),
        ('cnb', ComplementNB(ngram_range=(1, 1), min_df=1, max_df=1., verbose_vocab=True))
    ]
    clf = StackingClassifier(
         estimators=estimators, final_estimator=LR()
    )

    clf.fit(X_all, Y_all)
    acc = (clf.predict(X_dev) == np.array(Y_dev)).mean()
    print(size,acc)


Fitted vocab size: 2533


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 2533
Fitted vocab size: 2533
Fitted vocab size: 2209


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 2237


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 2390


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 2339


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 2410


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 2209
Fitted vocab size: 2237
Fitted vocab size: 2390
Fitted vocab size: 2339
Fitted vocab size: 2410
Fitted vocab size: 2209
Fitted vocab size: 2237
Fitted vocab size: 2390
Fitted vocab size: 2339
Fitted vocab size: 2410
10 0.7515060240963856
Fitted vocab size: 5721


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 5721
Fitted vocab size: 5721
Fitted vocab size: 5346


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 5387


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 5305


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 5211


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 5034


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 5346
Fitted vocab size: 5387
Fitted vocab size: 5305
Fitted vocab size: 5211
Fitted vocab size: 5034
Fitted vocab size: 5346
Fitted vocab size: 5387
Fitted vocab size: 5305
Fitted vocab size: 5211
Fitted vocab size: 5034
100 0.7643651529193698
Fitted vocab size: 11599
