In [1]:
import ml
import nlp
import json_io
import pickle
from itertools import chain
from dvs import DictVectorizerPartial
import numpy as np

### Process tweets, if n is ommitted it processes all of them, sets lable and saves the processed tweets

In [2]:
path = ml.JSON_DIR+"reddit/"
sarcastic_path = path+"sarcastic/"
serious_path = path+"serious/"
source = '-reddit-'
features_path = 'features/'
n=10

In [None]:
json_io.processRandomizeJson(sarcastic=True,
                     json_path=sarcastic_path,
                     features_path=features_path,
                     source=source,
                     n=n,
                     cleanTokens=nlp.cleanTokensReddit)
json_io.processRandomizeJson(sarcastic=False,
                     json_path=serious_path,
                     features_path=features_path,
                     source=source,
                     n=n,
                     cleanTokens=nlp.cleanTokensReddit)

### Load random set of features

In [3]:
sarcasticFeats = json_io.loadProcessedFeatures(features_path,
                                       source,
                                       sarcastic=True,
                                       n=5,
                                       random=False)
seriousFeats = json_io.loadProcessedFeatures(features_path,
                                     source,
                                     sarcastic=False,
                                     n=3,
                                     random=False)
features = chain(sarcasticFeats, seriousFeats)

### Flatten feature dictionaries, if leaveout is a feature that feature is ommitted

In [4]:
dvp = DictVectorizerPartial()

In [5]:
(X,y) = ml.split_feat(features, 2)

In [6]:
(X,y) = ml.flatten(X,y)

In [7]:
(X,y) = (dvp.partial_fit_transform(X), np.array(list(y)))

In [12]:
pickle.dump(dvp, open('pickled/-reddit-dvp.pickle', 'wb'))
pickle.dump(y, open('pickled/-reddit-y.pickle', 'wb'))
pickle.dump(X, open('pickled/-reddit-X.pickle', 'wb'))

In [2]:
X = pickle.load(open('pickled/-reddit-X.pickle', 'rb'))
y = pickle.load(open('pickled/-reddit-y.pickle', 'rb'))

### Train and test, reports results

In [22]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
classifiers = [
    #LogisticRegression(n_jobs=-1),
    #SGDClassifier(loss='log'),
    BernoulliNB(),
    MultinomialNB(),
]

In [23]:
results = []
for reduceamount in [0, 1000000, 500000, 100000, 50000, 25000, 10000, 7500, 5000, 2500, 1500, 1000, 750, 500, 250, 100, 50, 10, 5]:
    print("\n\t\tReduction: "+str(reduceamount))
    for trainsize in [0.01, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8]:
        print("\n\t\tTraining size: "+str(trainsize))
        results.append(
            ml.trainTest(X,
                         y,
                         classifiers=classifiers,
                         reduce=0,
                         splits=2,
                         trainsize=0.1,
                         testsize=0.2))
print(results)    


		Reduction: 0

		Training size: 0.01
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 0	Score:	0.713005
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.704158
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.711724
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.701897

		Training size: 0.05
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 0	Score:	0.710530
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.700971
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 0	Score:	0.712194
Starting to train <class 'sklearn.naive_baye

<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.701501
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.714642
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.704620

		Training size: 0.1
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 0	Score:	0.707250
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.698550
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 0	Score:	0.709906
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.699623

		Training size: 0.2
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.7

<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.702273

		Training size: 0.2
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.712623
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.703795
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.709343
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.702246

		Training size: 0.4
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.709511
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.699757
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.7

<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.710014
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.700415
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.711530
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.702192

		Training size: 0.6
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.708317
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.700455
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 0	Score:	0.713857
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.702266

		Training si

Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.708035
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.700267

		Training size: 0.8
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.708122
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.698395
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.713374
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.703212

		Reduction: 2500

		Training size: 0.01
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.711637
Starting to train <class 'sklearn.naive_ba

<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.704721

		Reduction: 1000

		Training size: 0.01
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.711530
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.702018
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.710564
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.702145

		Training size: 0.05
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.710946
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.701595
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB

<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.715447
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.706257
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 0	Score:	0.710624
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.702266

		Training size: 0.1
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.711094
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.702031
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.716118
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.705593

		Training si

Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.709973
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.702259

		Training size: 0.2
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.713234
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.703453
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.713978
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.704352

		Training size: 0.4
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.714267
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>


<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.699616

		Training size: 0.4
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 0	Score:	0.709866
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.700140
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.707572
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.697148

		Training size: 0.6
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 1	Score:	0.713475
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.704352
Starting to train <class 'sklearn.naive_bayes.BernoulliNB'>
<class 'sklearn.naive_bayes.BernoulliNB'>	Time: 0	Score:	0.7