In [1]:
import pickle
from itertools import chain

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

import json_io
import ml
import nlp
from dvs import DictVectorizerPartial

### Process comments

In [2]:
# Settings shared by the preprocessing / loading cells below.
source = '-twitter-'          # dataset tag handed to the json_io helpers
features_path = 'features/'   # directory handed to the json_io helpers as `features_path`
n = 1                         # forwarded as `n` to the json_io helpers (presumably n-gram order -- TODO confirm)

# Raw JSON locations, one sub-directory per class.
path = ml.JSON_DIR + "twitter/"
sarcastic_path = path + "sarcastic/"
serious_path = path + "serious/"

In [14]:
# Run the JSON preprocessing once per class, sarcastic first and serious second.
# The recorded output shows these two calls taking roughly 5 and 15 minutes.
for is_sarcastic, class_json_path in ((True, sarcastic_path),
                                      (False, serious_path)):
    json_io.processRandomizeJson(sarcastic=is_sarcastic,
                                 json_path=class_json_path,
                                 features_path=features_path,
                                 source=source,
                                 n=n,
                                 cleanTokens=nlp.cleanTokensTwitter)

File unique.json	time:	0:05:26.405835
Processed 52679 json lines
File unique.json	time:	0:14:49.132615
Processed 163298 json lines


### Load set of features

In [3]:
# Arguments that both loadProcessedFeatures calls have in common.
shared_load_kwargs = {'n': n, 'random': False}

sarcasticFeats = json_io.loadProcessedFeatures(
    features_path, source, sarcastic=True, **shared_load_kwargs)
# Only the serious set passes reduce=0 explicitly; the sarcastic call relies on
# the helper's default for `reduce`.
seriousFeats = json_io.loadProcessedFeatures(
    features_path, source, sarcastic=False, reduce=0, **shared_load_kwargs)

# itertools.chain yields the sarcastic items first, then the serious ones.
# It is a one-shot iterator: once consumed, this cell must be re-run to rebuild it.
features = chain(sarcasticFeats, seriousFeats)

### Vectorize features and cache them to disk

In [4]:
# Vectorizer instance; it is fitted further down via partial_fit_transform and
# pickled so the same instance can be passed to ml.predict later.
dvp = DictVectorizerPartial()

In [5]:
# Separate the chained feature stream into X and y (used as the training
# inputs and targets in the cells below).
# NOTE(review): the meaning of the second argument (2) is defined by ml.split_feat -- confirm.
X, y = ml.split_feat(features, 2)

In [6]:
# Rebinds X and y to the flattened versions returned by ml.flatten, so this cell
# is not idempotent: re-run the previous cell before running it again.
X, y = ml.flatten(X, y)

In [7]:
# Fit the vectorizer on X while transforming it, and materialise y as a NumPy array.
# Both values are computed before either name is rebound, in the same order as a
# single tuple assignment would evaluate them.
X_vectorized = dvp.partial_fit_transform(X)
y_array = np.array(list(y))
X, y = X_vectorized, y_array

In [8]:
# Cache the fitted vectorizer and the vectorized data on disk so later sessions
# can reload them instead of repeating the preprocessing.
# `with` guarantees each file is flushed and closed even if pickling fails.
with open('pickled/-twitter-dvp.pickle', 'wb') as dvp_file:
    pickle.dump(dvp, dvp_file)
with open('pickled/-twitter-y.pickle', 'wb') as y_file:
    pickle.dump(y, y_file)
with open('pickled/-twitter-X.pickle', 'wb') as X_file:
    pickle.dump(X, X_file)

In [2]:
# Shortcut: reload the cached X / y written by the dump cell above instead of
# recomputing them.
# SECURITY: pickle.load can execute arbitrary code -- only load files you produced yourself.
with open('pickled/-twitter-X.pickle', 'rb') as X_file:
    X = pickle.load(X_file)
with open('pickled/-twitter-y.pickle', 'rb') as y_file:
    y = pickle.load(y_file)

### Train and test, report results

In [9]:
# Grid sweep for Multinomial Naive Bayes: feature-reduction level x training fraction.
# MultinomialNB is imported in the notebook's top import cell.
# NOTE(review): reduce=0 appears to mean "no feature reduction" (the recorded output
# prints no "Features before/after reduction" lines for it) -- confirm in ml.trainTest.
reduce_amounts = [0, 1000000, 500000, 100000, 50000, 25000, 10000, 5000, 2500,
                  1000, 500, 250, 100, 50, 25, 10, 5, 2, 1]
train_sizes = [0.01, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8]

# Each entry: (reduceamount, trainsize, value returned by ml.trainTest).
# The downstream cells read index 2 of every item in that returned value as the score.
results = []
for reduceamount in reduce_amounts:
    print("\n\t\tReduction: "+str(reduceamount))
    for trainsize in train_sizes:
        print("\n\t\tTraining size: "+str(trainsize))
        runs = ml.trainTest(X,
                            y,
                            classifiers=[MultinomialNB()],
                            reduce=reduceamount,
                            splits=5,
                            trainsize=trainsize,
                            testsize=0.2)
        results.append((reduceamount, trainsize, runs))

# Persist the sweep; `with` makes sure the file is flushed and closed.
with open('pickled/-twitter-trained.pickle', 'wb') as results_file:
    pickle.dump(results, results_file)
print(results)


		Reduction: 0

		Training size: 0.01
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.756089
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.756089
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.756089
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.756089
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.756089

		Training size: 0.05
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.757454
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.757709
Starting to train <class 'sk

  f = msb / msw


Features after reduction: (2159, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.756089
Features before reduction: (2159, 3145772)
Features after reduction: (2159, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.756089
Features before reduction: (2159, 3145772)
Features after reduction: (2159, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.756089
Features before reduction: (2159, 3145772)
Features after reduction: (2159, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.757478
Features before reduction: (2159, 3145772)
Features after reduction: (2159, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.Multinom

Features after reduction: (2159, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.758149
Features before reduction: (2159, 3145772)
Features after reduction: (2159, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.758126
Features before reduction: (2159, 3145772)
Features after reduction: (2159, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.757640

		Training size: 0.05
Features before reduction: (10798, 3145772)
Features after reduction: (10798, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.759793
Features before reduction: (10798, 3145772)
Features after reduction: (10798, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklear

Features before reduction: (10798, 3145772)
Features after reduction: (10798, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.763450
Features before reduction: (10798, 3145772)
Features after reduction: (10798, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.763682
Features before reduction: (10798, 3145772)
Features after reduction: (10798, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.764353
Features before reduction: (10798, 3145772)
Features after reduction: (10798, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.762594
Features before reduction: (10798, 3145772)
Features after reduction: (10798, 100000)
Starting to train <class 'sklearn.naive_bayes.Mult

Features after reduction: (10798, 50000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.765788
Features before reduction: (10798, 3145772)
Features after reduction: (10798, 50000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.766321
Features before reduction: (10798, 3145772)
Features after reduction: (10798, 50000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.765650

		Training size: 0.1
Features before reduction: (21597, 3145772)
Features after reduction: (21597, 50000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.798708
Features before reduction: (21597, 3145772)
Features after reduction: (21597, 50000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn

Features before reduction: (21597, 3145772)
Features after reduction: (21597, 25000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.805144
Features before reduction: (21597, 3145772)
Features after reduction: (21597, 25000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.803686
Features before reduction: (21597, 3145772)
Features after reduction: (21597, 25000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.797782
Features before reduction: (21597, 3145772)
Features after reduction: (21597, 25000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.805700
Features before reduction: (21597, 3145772)
Features after reduction: (21597, 25000)
Starting to train <class 'sklearn.naive_bayes.Multinomi

<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.778799
Features before reduction: (21597, 3145772)
Features after reduction: (21597, 10000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.781762
Features before reduction: (21597, 3145772)
Features after reduction: (21597, 10000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.778220

		Training size: 0.2
Features before reduction: (43195, 3145772)
Features after reduction: (43195, 10000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.764284
Features before reduction: (43195, 3145772)
Features after reduction: (43195, 10000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.762385
Features before reduction: (43195, 3145772)
Featur

Features before reduction: (43195, 3145772)
Features after reduction: (43195, 5000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.749282
Features before reduction: (43195, 3145772)
Features after reduction: (43195, 5000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.751644
Features before reduction: (43195, 3145772)
Features after reduction: (43195, 5000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.750486
Features before reduction: (43195, 3145772)
Features after reduction: (43195, 5000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.751968
Features before reduction: (43195, 3145772)
Features after reduction: (43195, 5000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'

<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.742268
Features before reduction: (43195, 3145772)
Features after reduction: (43195, 2500)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.747500
Features before reduction: (43195, 3145772)
Features after reduction: (43195, 2500)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.747893

		Training size: 0.4
Features before reduction: (86390, 3145772)
Features after reduction: (86390, 2500)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.736665
Features before reduction: (86390, 3145772)
Features after reduction: (86390, 2500)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.737429
Features before reduction: (86390, 3145772)
Features a

Features before reduction: (86390, 3145772)
Features after reduction: (86390, 1000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.731480
Features before reduction: (86390, 3145772)
Features after reduction: (86390, 1000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.734281
Features before reduction: (86390, 3145772)
Features after reduction: (86390, 1000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.733586
Features before reduction: (86390, 3145772)
Features after reduction: (86390, 1000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.737985
Features before reduction: (86390, 3145772)
Features after reduction: (86390, 1000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'

<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.752547
Features before reduction: (86390, 3145772)
Features after reduction: (86390, 500)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.747153
Features before reduction: (86390, 3145772)
Features after reduction: (86390, 500)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.748912

		Training size: 0.6
Features before reduction: (129586, 3145772)
Features after reduction: (129586, 500)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.748565
Features before reduction: (129586, 3145772)
Features after reduction: (129586, 500)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.758126
Features before reduction: (129586, 3145772)
Features 

Features after reduction: (129586, 250)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.770002
Features before reduction: (129586, 3145772)
Features after reduction: (129586, 250)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.771483
Features before reduction: (129586, 3145772)
Features after reduction: (129586, 250)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.773289
Features before reduction: (129586, 3145772)
Features after reduction: (129586, 250)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.769701
Features before reduction: (129586, 3145772)
Features after reduction: (129586, 250)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.Multinomia

Features before reduction: (129586, 3145772)
Features after reduction: (129586, 100)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.765256
Features before reduction: (129586, 3145772)
Features after reduction: (129586, 100)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.764724

		Training size: 0.8
Features before reduction: (172781, 3145772)
Features after reduction: (172781, 100)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.759330
Features before reduction: (172781, 3145772)
Features after reduction: (172781, 100)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.758033
Features before reduction: (172781, 3145772)
Features after reduction: (172781, 100)
Starting to train <class 'sklearn

<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.751713
Features before reduction: (172781, 3145772)
Features after reduction: (172781, 50)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.757269
Features before reduction: (172781, 3145772)
Features after reduction: (172781, 50)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.754445
Features before reduction: (172781, 3145772)
Features after reduction: (172781, 50)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.752338
Features before reduction: (172781, 3145772)
Features after reduction: (172781, 50)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.755950

		Reduction: 25

		Training size: 0.01
Features before reduction: (2159, 31

Features before reduction: (172781, 3145772)
Features after reduction: (172781, 25)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.758195

		Reduction: 10

		Training size: 0.01
Features before reduction: (2159, 3145772)
Features after reduction: (2159, 10)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.764052
Features before reduction: (2159, 3145772)
Features after reduction: (2159, 10)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.780975
Features before reduction: (2159, 3145772)
Features after reduction: (2159, 10)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.768844
Features before reduction: (2159, 3145772)
Features after reduction: (2159, 10)
Starting to train <class 'sklearn.na

Features before reduction: (2159, 3145772)
Features after reduction: (2159, 5)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.765140
Features before reduction: (2159, 3145772)
Features after reduction: (2159, 5)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.785374
Features before reduction: (2159, 3145772)
Features after reduction: (2159, 5)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.788499

		Training size: 0.05
Features before reduction: (10798, 3145772)
Features after reduction: (10798, 5)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.788059
Features before reduction: (10798, 3145772)
Features after reduction: (10798, 5)
Starting to train <class 'sklearn.naive_bayes.MultinomialN

Features before reduction: (10798, 3145772)
Features after reduction: (10798, 2)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.756089
Features before reduction: (10798, 3145772)
Features after reduction: (10798, 2)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.756089
Features before reduction: (10798, 3145772)
Features after reduction: (10798, 2)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.756089
Features before reduction: (10798, 3145772)
Features after reduction: (10798, 2)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.756089
Features before reduction: (10798, 3145772)
Features after reduction: (10798, 2)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklea

Features before reduction: (10798, 3145772)
Features after reduction: (10798, 1)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.756089
Features before reduction: (10798, 3145772)
Features after reduction: (10798, 1)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.756089

		Training size: 0.1
Features before reduction: (21597, 3145772)
Features after reduction: (21597, 1)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.756089
Features before reduction: (21597, 3145772)
Features after reduction: (21597, 1)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.756089
Features before reduction: (21597, 3145772)
Features after reduction: (21597, 1)
Starting to train <class 'sklearn.naive_bayes.Multino

In [10]:
# Reduce every sweep entry to (reduction, training fraction, mean score over its runs)
# and export the triples as JSON.
xyz = []
for reduction, train_fraction, runs in results:
    scores = [run[2] for run in runs]   # index 2 of each run is its score
    mean_score = sum(scores) / len(scores)
    xyz.append((reduction, train_fraction, mean_score))
json_io.list_to_json(xyz, "-twitter-reduction-trainsize-accuracy-mnbayes.json", old_format=True)

In [11]:
# Display the (reduction, training fraction, mean score) triples computed above.
xyz

[(0, 0.01, 0.75608852671543658),
 (0, 0.05, 0.75782016853412348),
 (0, 0.1, 0.75826002407630333),
 (0, 0.2, 0.76025557922029807),
 (0, 0.4, 0.76826558014630986),
 (0, 0.6, 0.79662005741272335),
 (0, 0.8, 0.82396518196129276),
 (1000000, 0.01, 0.75670432447448843),
 (1000000, 0.05, 0.75793591999259191),
 (1000000, 0.1, 0.76092693767941477),
 (1000000, 0.2, 0.77343272525233808),
 (1000000, 0.4, 0.79623576257060835),
 (1000000, 0.6, 0.81157514584683776),
 (1000000, 0.8, 0.82016390406519124),
 (500000, 0.01, 0.75781553847578476),
 (500000, 0.05, 0.75981572367811834),
 (500000, 0.1, 0.77026576534864344),
 (500000, 0.2, 0.78172978979535146),
 (500000, 0.4, 0.79771275118066476),
 (500000, 0.6, 0.80751921474210575),
 (500000, 0.8, 0.82546069080470408),
 (100000, 0.01, 0.75829706454301316),
 (100000, 0.05, 0.76327900731549225),
 (100000, 0.1, 0.77177979442540978),
 (100000, 0.2, 0.81099638855449585),
 (100000, 0.4, 0.81494119825909817),
 (100000, 0.6, 0.80456523752199272),
 (100000, 0.8, 0.7982

### Train logistic regression

In [12]:
# Logistic regression on a single configuration: no feature reduction, 80% training data.
# LogisticRegression is imported in the notebook's top import cell.
# The grid is deliberately tiny compared with the Naive Bayes sweep: the recorded
# output shows each fit taking about 415 seconds.
reduce_amounts = [0]
train_sizes = [0.8]

# NOTE(review): the recorded repr shows solver='liblinear'; scikit-learn documents
# n_jobs as having no effect with that solver -- confirm whether n_jobs=-1 is needed.
results = []
for reduceamount in reduce_amounts:
    print("\n\t\tReduction: "+str(reduceamount))
    for trainsize in train_sizes:
        print("\n\t\tTraining size: "+str(trainsize))
        runs = ml.trainTest(X,
                            y,
                            classifiers=[LogisticRegression(n_jobs=-1)],
                            reduce=reduceamount,
                            splits=2,
                            trainsize=trainsize,
                            testsize=0.2)
        results.append((reduceamount, trainsize, runs))

# Persist the trained classifiers and scores; `with` flushes and closes the file.
with open('pickled/-twitter-trained-log.pickle', 'wb') as results_file:
    pickle.dump(results, results_file)
print(results)


		Reduction: 0

		Training size: 0.8
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 415	Score:	0.869502
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 417	Score:	0.873229
[(0, 0.8, [(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 415.732046, 0.86950180572275215), (LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 417.592044, 0.8732290026854338)])]


In [18]:
# Reduce every logistic-regression entry to (reduction, training fraction, mean score
# over its runs), export the triples as JSON, then display them.
xyz = []
for reduction, train_fraction, runs in results:
    scores = [run[2] for run in runs]   # index 2 of each run is its score
    mean_score = sum(scores) / len(scores)
    xyz.append((reduction, train_fraction, mean_score))
json_io.list_to_json(xyz, "-twitter-reduction-trainsize-accuracy-log.json", old_format=True)
xyz

[(0, 0.8, 0.87136540420409303)]

### Test with a saved classifier on a list of strings

In [13]:
# Reload the logistic-regression results saved by the training cell.
# SECURITY: pickle.load can execute arbitrary code -- only load files you produced yourself.
with open('pickled/-twitter-trained-log.pickle', 'rb') as results_file:
    results = pickle.load(results_file)
print(results)

[(0, 0.8, [(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 415.732046, 0.86950180572275215), (LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 417.592044, 0.8732290026854338)])]


In [14]:
# Reload the fitted vectorizer so new text goes through the same vectorizer
# instance that was fitted on the training data.
# SECURITY: pickle.load can execute arbitrary code -- only load files you produced yourself.
with open('pickled/-twitter-dvp.pickle', 'rb') as dvp_file:
    dvp = pickle.load(dvp_file)

In [17]:
# Best logistic model: each run in results[0][2] is (classifier, seconds, score),
# so pick the run with the highest score rather than a hard-coded position.
# For the saved results this is the same model the fixed index [1] selected.
classifier = max(results[0][2], key=lambda run: run[2])[0]

In [16]:
# Texts to classify with the saved model.
# NOTE(review): the recorded output (13) cannot be reproduced from this single empty
# string if predictions are 0/1 -- the original inputs were presumably removed; confirm.
sample_texts = [""]
a = ml.predict(sample_texts, classifier, dvp, nlp.cleanTokensTwitter)
# Sum of the 'prediction' values (presumably the number of texts flagged sarcastic -- TODO confirm).
sum(a['prediction'])

13