In [1]:
from basic_nlp import *
from json_io import *
from ml import *
import datetime
import pickle
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report



## Function to run on each tweet

In [None]:
def keyToStr(d):
    """Return a copy of ``d`` whose keys are space-joined into single strings.

    Each key must be an iterable of strings (e.g. an n-gram tuple); it is
    replaced by ``" ".join(key)``. Values are carried over untouched. If two
    keys join to the same string, the one iterated last wins.
    """
    return {" ".join(parts): count for parts, count in d.items()}

from re import sub

# Patterns for the Twitter-specific tokens that feature() rewrites before
# tokenizing. Raw strings keep "\w" from being read as an (invalid) string
# escape sequence, and the dot in the host name is escaped so that it only
# matches a literal "t.co" rather than any character.
TWEET_LINK_RE = r"https://t\.co/(\w)+"
TWEET_HANDLE_RE = r"@(\w)+"
HASHTAG_RE = r"#(\w)+"

def feature(tweet):
    """Build the feature dictionary for a single tweet.

    The tweet text is normalised (handles -> "NameTOK", t.co links ->
    "LinkTOK", hashtags removed) and then run through the helper pipeline
    (tokenize, upperLowerLen, wordCases, pos, chunk, tokNoNE, ...). The
    result merges, into one flat dict:

    * token uni/bi/tri-gram frequencies,
    * POS-tag uni/bi/tri-gram frequencies,
    * punctuation features,
    * token-suffix frequencies (raw and two normalisations),
    * Vader and LiuHu compound sentiment (shifted by +1) for the whole
      token list and for its halves, thirds and quarters,
    * "capFreq" and "allCapsFreq".

    Args:
        tweet: the tweet text as a string.

    Returns:
        dict mapping feature-name strings to feature values.
    """
    # Replace Twitter-specific tokens before tokenizing.
    tweet = sub(TWEET_HANDLE_RE, "NameTOK", tweet)
    tweet = sub(TWEET_LINK_RE, "LinkTOK", tweet)
    tweet = sub(HASHTAG_RE, "", tweet)

    # Case information is taken from the raw tokens; `tokens` is then
    # replaced by the output of tokNoNE together with the matching POS tags.
    tokens = tokenize(tweet)
    ull = upperLowerLen(tokens)
    cases = wordCases(ull)
    tagged = pos(tokens)
    chunked = chunk(tagged)
    (tokens, postags) = tokNoNE(chunked)
    punctuationFreqDict = punctuationFeatures(tweet)

    # Suffix frequencies: raw counts, normalised by the total suffix count,
    # and normalised by the number of tokens. Computed once and reused below
    # (assumes freq/tokenSuffixes are deterministic for the same tokens).
    numTokens = len(tokens)
    suffreq = dict(freq(tokenSuffixes(tokens)))
    sumSuf = sum(suffreq.values())
    normSuffFreq = {key: val/sumSuf for key, val in suffreq.items()}
    norm2SuffFreq = {key: val/numTokens for key, val in suffreq.items()}

    # Sentiment over the full token list and over equal-sized segments.
    # Integer division gives the same cut points as int(len(tokens)/k).
    half = numTokens // 2
    third = numTokens // 3
    quarter = numTokens // 4
    segments = [
        ('fullSent', tokens),
        ('halfSent1', tokens[:half]),
        ('halfSent2', tokens[half:]),
        ('thirdSent1', tokens[:third]),
        ('thirdSent2', tokens[third:2*third]),
        ('thirdSent3', tokens[2*third:]),
        ('quartSent1', tokens[:quarter]),
        ('quartSent2', tokens[quarter:2*quarter]),
        ('quartSent3', tokens[2*quarter:3*quarter]),
        ('quartSent4', tokens[3*quarter:]),
    ]
    sentCompound = {}
    for name, segment in segments:
        scores = sentimentGrams([segment])
        # +1 shifts the compound score; presumably to keep the feature
        # non-negative for the MultinomialNB classifier used later - confirm.
        sentCompound[name+"Vader"] = scores[0]['Vader']['compound'] + 1
        sentCompound[name+"LiuHu"] = scores[0]['LiuHu']['compound'] + 1

    capFreq = capLetterFreq(ull)
    # A tweet that yields no tokens (e.g. only hashtags) would otherwise
    # divide by zero here.
    numCases = len(cases)
    allCapsFreq = cases.count('AC')/numCases if numCases else 0.0

    feat = {}
    # Token n-grams.
    for n in (1, 2, 3):
        feat.update(keyToStr(dict(freq(grams(tokens, n)))))
    # POS-tag n-grams.
    for n in (1, 2, 3):
        feat.update(keyToStr(dict(freq(grams(postags, n)))))
    feat.update(punctuationFreqDict)
    # NOTE(review): the three suffix dicts below share the same keys, so each
    # update overwrites the previous one and only the norm2SuffFreq values
    # survive in `feat`; suffix keys can likewise overwrite n-gram keys that
    # join to the same string. Left unchanged so feature names stay stable
    # for already-pickled features - consider prefixing the keys.
    feat.update(keyToStr(suffreq))
    feat.update(keyToStr(normSuffFreq))
    feat.update(keyToStr(norm2SuffFreq))
    feat.update(sentCompound)
    feat.update({"capFreq":capFreq, "allCapsFreq":allCapsFreq})
    return feat

## Run features on tweets

In [None]:
# Number of tweets to featurize from each class.
total = 30000

# Sarcastic tweets (label True). Each tweet text is passed through repr()
# before feature extraction, exactly as the serious tweets are below.
start1 = datetime.datetime.now()
sarcasticTweets = tweet_iterate("../json/sarcastic/unique.json", key="text")
sarcasticFeats = [
    (feature(repr(next(sarcasticTweets))), True)
    for _ in range(total)
]
start2 = datetime.datetime.now()
sarcasticElapsed = start2 - start1
print(sarcasticElapsed.total_seconds())

# Non-sarcastic tweets (label False).
seriousTweets = tweet_iterate("../json/non_sarcastic/unique.json", key="text")
seriousFeats = [
    (feature(repr(next(seriousTweets))), False)
    for _ in range(total)
]
seriousElapsed = datetime.datetime.now() - start2
print(seriousElapsed.total_seconds())

## Initialize timing and accuracy storage lists

In [2]:
# t: one dict of per-classifier training times (seconds) appended per training run.
# a: one accuracy result appended per evaluation run.
t, a = [], []

## Save features from tweets

In [None]:
# Combine both classes and cache the (features, label) pairs on disk so the
# slow feature-extraction cell does not have to be re-run.
featTup = sarcasticFeats + seriousFeats
# `with` guarantees the file is flushed and closed once the dump finishes.
with open('pickledfeatures/feats.pickle', 'wb') as featFile:
    pickle.dump(featTup, featFile)
# Split the pairs into parallel tuples: feature dicts and boolean labels.
(feats, bools) = list(zip(*featTup))

## Load features from tweets

In [3]:
# Reload the cached (features, label) pairs written by the save cell.
# NOTE: pickle.load can execute arbitrary code - only load files you created.
with open('pickledfeatures/feats.pickle', 'rb') as featFile:
    pk = pickle.load(featFile)
# Split the pairs into parallel tuples: feature dicts and boolean labels.
(feats, bools) = list(zip(*pk))

## Randomly split a subset of the data into training and test sets

In [18]:
# Draw 20% of the examples for training and another 20% for testing; the
# remaining 60% is left unused. random_state pins the shuffle.
X_trainDict, X_testDict, y_train, y_test = train_test_split(
    feats,
    bools,
    train_size=0.2,
    test_size=0.2,
    random_state=0,
)

# Learn the feature-name -> column mapping from the training dicts only and
# reuse it for the test dicts.
# NOTE(review): DictVectorizer is not imported explicitly in this notebook;
# presumably it comes from one of the star imports - confirm.
dv = DictVectorizer()
X_train = dv.fit_transform(X_trainDict)
X_test = dv.transform(X_testDict)

## Hyperparameter search (RandomizedSearchCV)

In [10]:
# Candidates for the hyperparameter search cell below: each entry is an
# (estimator, parameter-dict) pair. Every candidate is currently commented
# out, so `cl` is an empty list and the search loop does nothing.
cl = [
    #(RandomForestClassifier(n_jobs=-1), {'n_estimators': list(np.arange(10,100,20)),'min_samples_split':[2,5,10],'min_samples_leaf':[1,2,8],'random_state':[None,0]}),
    #(DecisionTreeClassifier(), {'min_samples_split':[2,5,10],'min_samples_leaf':[1,2,8],'random_state':[None,0]}),
    #(SGDClassifier(), {'loss':['hinge','log','perceptron'], 'penalty':['l2','l1','elasticnet']})
]

In [None]:
# For every (estimator, parameter-dict) candidate: run a 4-fold randomized
# search with a single sampled parameter setting, time the fit, and report
# test-set metrics plus the chosen parameters.
# NOTE(review): the sparse matrices are converted to dense arrays on every
# iteration, which can be very memory-hungry for large vocabularies.
for c in cl:
    estimator, paramDist = c[0], c[1]
    clf = RandomizedSearchCV(estimator, paramDist, cv=4, n_iter=1)

    start = datetime.datetime.now()
    clf.fit(X_train.toarray(), y_train)
    elapsed = datetime.datetime.now() - start
    print(elapsed.total_seconds())

    y_true = y_test
    y_pred = clf.predict(X_test.toarray())
    print(type(estimator))
    print(classification_report(y_true, y_pred))
    print(clf.best_params_)

## Select Classifiers

In [20]:
# Estimators to train and compare; commented-out entries are disabled.
classifier = [
    LogisticRegression(n_jobs=-1),
    SGDClassifier(loss='log', penalty='elasticnet'),
    ##SVC(),
    DecisionTreeClassifier(min_samples_leaf=1, min_samples_split=2),
    RandomForestClassifier(
        n_jobs=-1,
        min_samples_leaf=2,
        min_samples_split=10,
        n_estimators=70,
        random_state=0,
    ),
    ##MLPClassifier(alpha=1),
    ###AdaBoostClassifier(),
    BernoulliNB(alpha=0.5, binarize=0.2),
    MultinomialNB(alpha=1.25),
]

# Key each estimator by its class repr plus its list index (so two estimators
# of the same class get distinct keys), echo the index/class table, and hand
# the mapping to the `ml` wrapper.
d = {}
for i, c in enumerate(classifier):
    kind = type(c)
    d["{}{}".format(kind, i)] = c
    print("{}\t{}".format(i, kind))
m = ml(d)

0	<class 'sklearn.linear_model.logistic.LogisticRegression'>
1	<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>
2	<class 'sklearn.tree.tree.DecisionTreeClassifier'>
3	<class 'sklearn.ensemble.forest.RandomForestClassifier'>
4	<class 'sklearn.naive_bayes.BernoulliNB'>
5	<class 'sklearn.naive_bayes.MultinomialNB'>


## Train Classifiers

In [None]:
# Train every classifier; element [1] of each trainOutput value is read as a
# timedelta and stored as seconds, keyed by classifier name.
trainOutput = m.trainVectorizedFeatures(X_train, y_train)
t.append({k: v[1].total_seconds() for k, v in trainOutput.items()})

# Report the slowest classifiers first, then the overall training time.
# The accumulator is deliberately NOT called `total`: that name holds the
# per-class tweet count used by the feature-extraction cell, and overwriting
# it with a float would break that cell on re-run.
trainSeconds = t[-1]
totalSeconds = 0
for w in sorted(trainSeconds, key=trainSeconds.get, reverse=True):
    totalSeconds += trainSeconds[w]
    print(w, trainSeconds[w])
print("Total: ", totalSeconds)

## Test Classifiers

In [None]:
# Score every trained classifier on the held-out set, keep the result for
# this run, and list the classifiers from highest to lowest score.
a.append(m.accuracyVectorizedFeatures(X_test, y_test))
ranked = sorted(a[-1].items(), key=lambda pair: pair[1], reverse=True)
for w, score in ranked:
    print(w, score)

In [None]:
# Persist via the ml wrapper into the "pickledclassifiers" directory
# (presumably one pickle per trained classifier - confirm in ml.save).
m.save(pickledir="pickledclassifiers")