In [1]:
import os

# Switch the working directory to ``modules`` — presumably so the project-local
# imports used in the following cells (``model``, ``get_data``) resolve.
# The guard makes the cell idempotent: re-running it in a live kernel no longer
# tries to enter ``modules/modules`` and fail with FileNotFoundError.
if os.path.basename(os.getcwd()) != 'modules':
    os.chdir('modules')

In [2]:

from model import LogisticRegression, BernoulliNB, ComplementNB, LogisticRegressionPytorch,OnehotTransformer
from get_data import get_data
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
import time
import pandas as pd
from random import shuffle


# Experiment: rank-based pruning of GPT-generated training examples.
#
# For each base-set size and each pruning fraction `prob_remove`:
#   1. fit a LogisticRegression pipeline on the last `size` examples (the base set),
#   2. score every remaining (GPT) example with that base model,
#   3. per label, drop the `prob_remove` fraction of examples whose label the
#      base model agrees with least,
#   4. train LogisticRegressionPytorch on base + kept examples and score it on dev.
# One row of `df` is filled per (size, prob_remove) pair.

X_dev, Y_dev = get_data("dev", cleanText=True)
probs_remove = [0, 0.05, 0.1, 0.15, 0.2, 0.25]
sizes = [10, 50, 100, 500, 2000]

rows = len(probs_remove) * len(sizes)

# Columns that later receive floats ("percentage kept", "score", "time taken")
# are initialised with 0.0 so the `df.at` assignments below write into float
# columns instead of forcing an integer column to change dtype.
data = {"percentage kept": [0.0] * rows,
        "size": [0] * rows,
        "score": [0.0] * rows,
        "vocab size base": [0] * rows,
        "vocab size final": [0] * rows,
        "time taken": [0.0] * rows}
df = pd.DataFrame(data)

z = 0  # index of the result row currently being filled
for size in sizes:
    for prob_remove in probs_remove:
        df.at[z, "size"] = size
        df.at[z, "percentage kept"] = 1 - prob_remove

        # NOTE(review): data loading and the base-model fit do not depend on
        # `prob_remove`, but they are kept inside the timed region so that
        # "time taken" stays comparable with previously recorded runs.
        tic = time.perf_counter()
        X_gpt_all, Y_gpt_all = get_data("clean_gpt_" + str(size))
        # The trailing `size` examples form the base set; everything before
        # them is the pool of GPT examples that gets pruned.
        X_gpt, Y_gpt = X_gpt_all[:-size], Y_gpt_all[:-size]
        X_base, Y_base = X_gpt_all[-size:], Y_gpt_all[-size:]

        model = LogisticRegression(max_iter=100)
        model.fit(X_base, Y_base)
        # model[0] is the first pipeline step; it exposes the fitted vocabulary.
        df.at[z, "vocab size base"] = len(model[0].vocab)

        # (first predict_proba column, index into X_gpt) for every GPT example.
        # The first column is treated as the probability of label 0.
        probs = [(row_probs[0], i)
                 for i, row_probs in enumerate(model.predict_proba(X_gpt))]

        # Label-0 examples: highest label-0 probability first.
        # Label-1 examples: lowest label-0 probability first.
        # In both lists the examples the base model agrees with come first,
        # so truncating the tail removes the least-agreed-with examples.
        probs_neg = sorted([j for j in probs if Y_gpt[j[1]] == 0],
                           key=lambda tup: tup[0], reverse=True)
        probs_pos = sorted([j for j in probs if Y_gpt[j[1]] == 1],
                           key=lambda tup: tup[0])

        keep_neg = int((1 - prob_remove) * len(probs_neg))
        keep_pos = int((1 - prob_remove) * len(probs_pos))

        probs_neg = probs_neg[:keep_neg]
        probs_pos = probs_pos[:keep_pos]

        # Mix the two label groups again before training.
        final_probs = probs_neg + probs_pos
        shuffle(final_probs)
        gpt_indices = [i[1] for i in final_probs]
        X_gpt_pruned = [X_gpt[i] for i in gpt_indices]
        Y_gpt_pruned = [Y_gpt[i] for i in gpt_indices]
        X_all = X_base + X_gpt_pruned
        Y_all = Y_base + Y_gpt_pruned

        transformer = OnehotTransformer(ngram_range=(1, 1), min_df=0.001, max_df=0.5, verbose_vocab=True)
        transformer.fit(X_all, Y_all)
        X_all = transformer.transform(X_all)

        # Width of one transformed example = size of the final vocabulary.
        df.at[z, "vocab size final"] = len(X_all[0])
        model = LogisticRegressionPytorch(input_dim=len(X_all[0]), epochs=30, progress_bar=True)
        model.train(X_all, Y_all, batch_size=64)

        acc = model.score(transformer.transform(X_dev), Y_dev)
        # Previously the score was only printed, leaving the "score" column
        # (and therefore the exported CSV) at 0.0 for every row.
        df.at[z, "score"] = float(acc)
        print("size", size, "prob_remove", prob_remove, "acc", acc)
        toc = time.perf_counter()
        print("Time taken:", toc - tic)
        df.at[z, "time taken"] = toc - tic
        z += 1
    print(f"Finished size {size}")


Fitted vocab size: 3885


  0%|          | 0/30 [00:00<?, ?it/s]

size 10 prob_remove 0 acc 0.853
Time taken: 5.2468021
Fitted vocab size: 3645


  0%|          | 0/30 [00:00<?, ?it/s]

size 10 prob_remove 0.05 acc 0.846
Time taken: 3.6430374000000008
Fitted vocab size: 3524


  0%|          | 0/30 [00:00<?, ?it/s]

size 10 prob_remove 0.1 acc 0.835
Time taken: 3.4877731999999995
Fitted vocab size: 3337


  0%|          | 0/30 [00:00<?, ?it/s]

size 10 prob_remove 0.15 acc 0.817
Time taken: 2.9065099999999973
Fitted vocab size: 3205


  0%|          | 0/30 [00:00<?, ?it/s]

size 10 prob_remove 0.2 acc 0.803
Time taken: 2.7112513000000007
Fitted vocab size: 3087


  0%|          | 0/30 [00:00<?, ?it/s]

size 10 prob_remove 0.25 acc 0.786
Time taken: 2.6301860999999995


In [3]:
# Persist the experiment results table as CSV (path is relative to the current
# working directory; the DataFrame index is written as the first column).
results_csv_path = "classifier_results.csv"
df.to_csv(results_csv_path)

In [4]:
import os
#os.chdir('modules')
from model import LogisticRegression, BernoulliNB, ComplementNB
from get_data import get_data
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np

# Baseline: train each classifier on the "n_<size>" training set alone and
# report its accuracy on the dev set.

# The dev set does not depend on `size`, so it is loaded once instead of on
# every loop iteration.
X_dev, Y_dev = get_data("dev")

for size in [10, 50, 100, 500, 2000]:
    X_base, Y_base = get_data("n_" + str(size), early_return=False)
    # Fresh, unfitted models for every training-set size. They are built inline
    # rather than bound to names such as `LR`, which is also used as an alias
    # for sklearn's LogisticRegression class elsewhere in this notebook.
    models = [LogisticRegression(max_iter=100), BernoulliNB(), ComplementNB()]
    for model in models:
        model.fit(X_base, Y_base)
        # Fraction of dev examples whose predicted label equals the gold label.
        acc = (model.predict(X_dev) == np.array(Y_dev)).mean()
        # Include the size so each printed accuracy can be attributed to a run.
        print(size, model, acc)

Pipeline(steps=[('onehot',
                 OnehotTransformer(max_df=1.0, min_df=1, ngram_range=(1, 1),
                                   verbose_vocab=False)),
                ('clf', LogisticRegression())]) 0.5634847080630213
Pipeline(steps=[('onehot',
                 OnehotTransformer(max_df=1.0, min_df=1, ngram_range=(1, 1),
                                   verbose_vocab=False)),
                ('clf', BernoulliNB())]) 0.5405468025949953
Pipeline(steps=[('onehot',
                 OnehotTransformer(max_df=1.0, min_df=1, ngram_range=(1, 1),
                                   verbose_vocab=False)),
                ('clf', ComplementNB())]) 0.5602409638554217
Pipeline(steps=[('onehot',
                 OnehotTransformer(max_df=1.0, min_df=1, ngram_range=(1, 1),
                                   verbose_vocab=False)),
                ('clf', LogisticRegression())]) 0.705746061167748
Pipeline(steps=[('onehot',
                 OnehotTransformer(max_df=1.0, min_df=1, ngram_range=(1

In [None]:
import os
#os.chdir('modules')
from model import LogisticRegression, BernoulliNB, ComplementNB
from get_data import get_data
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as LR
import time

# Experiment: threshold-based pruning of GPT examples followed by a stacked
# ensemble (LogisticRegression + BernoulliNB + ComplementNB, combined by an
# sklearn LogisticRegression) evaluated on the dev set.

# A GPT example is dropped when the base model is confident about one label
# (first predict_proba column above CONFIDENT_NEG or below CONFIDENT_POS) but
# the example carries the other label.
CONFIDENT_NEG = 0.8
CONFIDENT_POS = 0.2

# The dev set does not depend on `size`, so it is loaded once up front.
X_dev, Y_dev = get_data("dev")

for size in [10, 100, 500, 2000]:
    X_gpt_all, Y_gpt_all = get_data("gpt_" + str(size))
    # The trailing `size` examples form the base set; the rest is the GPT pool.
    X_gpt, Y_gpt = X_gpt_all[:-size], Y_gpt_all[:-size]
    X_base, Y_base = X_gpt_all[-size:], Y_gpt_all[-size:]

    model = LogisticRegression(max_iter=100)
    model.fit(X_base, Y_base)

    pruned_probs = []
    for idx, class_probs in enumerate(model.predict_proba(X_gpt)):
        # First column is treated as the probability of label 0.
        n_prob = class_probs[0]
        if n_prob > CONFIDENT_NEG:
            # Confidently label 0: keep only if the example is labelled 0.
            if Y_gpt[idx] == 0:
                pruned_probs.append((n_prob, idx))
        elif n_prob < CONFIDENT_POS:
            # Confidently label 1: keep only if the example is labelled 1.
            if Y_gpt[idx] == 1:
                pruned_probs.append((n_prob, idx))
        else:
            # Base model is unsure: always keep.
            pruned_probs.append((n_prob, idx))

    # Kept examples are ordered by ascending label-0 probability. The order is
    # preserved from the original code because it determines the order of the
    # training data handed to the stacking classifier.
    final_probs = sorted(pruned_probs, key=lambda x: x[0])
    gpt_indices = [i[1] for i in final_probs]
    X_gpt_pruned = [X_gpt[i] for i in gpt_indices]
    Y_gpt_pruned = [Y_gpt[i] for i in gpt_indices]
    X_all = X_base + X_gpt_pruned
    Y_all = Y_base + Y_gpt_pruned

    estimators = [
        ('lr', LogisticRegression(max_iter=100, ngram_range=(1, 1), min_df=1, max_df=1., verbose_vocab=True)),
        ('bnb', BernoulliNB(ngram_range=(1, 1), min_df=1, max_df=1., verbose_vocab=True)),
        ('cnb', ComplementNB(ngram_range=(1, 1), min_df=1, max_df=1., verbose_vocab=True))
    ]
    # `LR` is sklearn's plain LogisticRegression (imported under that alias),
    # used as the meta-learner on top of the three project pipelines.
    clf = StackingClassifier(
        estimators=estimators, final_estimator=LR()
    )

    clf.fit(X_all, Y_all)
    # Fraction of dev examples whose predicted label equals the gold label.
    acc = (clf.predict(X_dev) == np.array(Y_dev)).mean()
    print(size, acc)


Fitted vocab size: 2533


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 2533
Fitted vocab size: 2533
Fitted vocab size: 2209


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 2237


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 2390


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 2339


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 2410


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 2209
Fitted vocab size: 2237
Fitted vocab size: 2390
Fitted vocab size: 2339
Fitted vocab size: 2410
Fitted vocab size: 2209
Fitted vocab size: 2237
Fitted vocab size: 2390
Fitted vocab size: 2339
Fitted vocab size: 2410
10 0.7515060240963856
Fitted vocab size: 5721


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 5721
Fitted vocab size: 5721
Fitted vocab size: 5346


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 5387


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 5305


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 5211


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 5034


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitted vocab size: 5346
Fitted vocab size: 5387
Fitted vocab size: 5305
Fitted vocab size: 5211
Fitted vocab size: 5034
Fitted vocab size: 5346
Fitted vocab size: 5387
Fitted vocab size: 5305
Fitted vocab size: 5211
Fitted vocab size: 5034
100 0.7643651529193698
Fitted vocab size: 11599
