In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from itertools import islice
import pprint
from hashtag_removal import removeHashtags
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd

from basic_nlp import punctuationFeatures
from json_io import tweet_iterate, tweet_map, replaceLinksMentions, list_to_json

# Pretty-printer configured with 4-space indentation (not used in the cells shown here).
pp = pprint.PrettyPrinter(indent=4)
# Shared DictVectorizer; it is fitted later, when the feature DataFrame is built.
vec = DictVectorizer()

## Datasets

In [None]:
# Paths of the sarcastic / non-sarcastic tweet files read by the loading code below.
SARCASTIC_PATH = "../json/sarcastic/unique.json"
NON_SARCASTIC_PATH = "../json/non_sarcastic/unique.json"

# Number of tweets to take from each file.
NUM_TWEETS = 10000
# Rough per-file tweet count; it is only used to bound the random window start.
ESTIMATED_TOTAL_TWEETS = 30000
# Seed for the window start, so that Restart & Run All selects the same tweets every time.
RANDOM_SEED = 10
# Start offset of the NUM_TWEETS-long window, drawn from [0, ESTIMATED_TOTAL_TWEETS - NUM_TWEETS).
# A private RandomState is used so NumPy's global random state is left untouched.
RAND_START = np.random.RandomState(RANDOM_SEED).randint(0, ESTIMATED_TOTAL_TWEETS - NUM_TWEETS)

def _with_label(tweet, is_sarcastic):
    """Store ``is_sarcastic`` under the tweet's "sarcastic" key and return that same tweet."""
    tweet["sarcastic"] = is_sarcastic
    return tweet


def sarcasm_map(tweet):
    """Mark ``tweet`` as sarcastic (in place) and return it."""
    return _with_label(tweet, True)


def non_sarcasm_map(tweet):
    """Mark ``tweet`` as non-sarcastic (in place) and return it."""
    return _with_label(tweet, False)

def _load_labelled_window(path, label_fn):
    """Take items RAND_START .. RAND_START + NUM_TWEETS of tweet_iterate(path) and pass them,
    as a list, to tweet_map together with ``label_fn``."""
    window = islice(tweet_iterate(path), RAND_START, RAND_START + NUM_TWEETS)
    return tweet_map(list(window), label_fn)


# Both files are sliced with the same window offsets.
sarcastic_dataset = _load_labelled_window(SARCASTIC_PATH, sarcasm_map)
non_sarcastic_dataset = _load_labelled_window(NON_SARCASTIC_PATH, non_sarcasm_map)

In [None]:
# Print the number of tweets loaded for each class, sarcastic first.
for dataset in (sarcastic_dataset, non_sarcastic_dataset):
    print(len(dataset))

## Compute features

In [None]:
def process_tweet(tweet, features):
    """Clean one tweet in place and attach the requested feature groups.

    Steps, in order:
      1. keep the incoming text under ``"original_text"`` (only if that key
         is not already present),
      2. replace ``"text"`` with the ``"text"`` of ``removeHashtags(tweet)``,
      3. call ``replaceLinksMentions(tweet)``,
      4. if ``"punctuation_features"`` is requested, store
         ``punctuationFeatures(tweet["ner_text"])`` under that key.

    The tweet does not need to carry a ``"sarcastic"`` label; this function
    never uses it.

    Args:
        tweet: Mutable mapping with at least a ``"text"`` entry. It is
            modified in place.
        features: Container of feature-group names; only
            ``"punctuation_features"`` is recognised here.

    Returns:
        The same ``tweet`` object that was passed in.
    """
    # Guarded so that re-running the pipeline on an already processed tweet
    # does not overwrite the raw text with the hashtag-stripped version.
    if "original_text" not in tweet:
        tweet["original_text"] = tweet["text"]

    tweet["text"] = removeHashtags(tweet)["text"]
    # NOTE(review): "ner_text" is read below but never written in this function;
    # presumably replaceLinksMentions adds it to the tweet -- confirm in json_io.
    replaceLinksMentions(tweet)

    if "punctuation_features" in features:
        tweet["punctuation_features"] = punctuationFeatures(tweet["ner_text"])

    return tweet

In [None]:
def _checkpoint_batch(batch, tweets_done, save, save_path):
    """Write ``batch`` to ``<save_path>temp_<tweets_done>.json`` when ``save`` is true."""
    if save:
        list_to_json(batch, "{}temp_{}.json".format(save_path, tweets_done))


def process_tweets(tweets, features, step_size=1000, save=False, save_path="../json/"):
    """Run ``process_tweet`` over ``tweets`` in batches, optionally checkpointing each batch.

    Args:
        tweets: Iterable of tweets; each one is passed to ``process_tweet``.
        features: Feature-group names forwarded unchanged to ``process_tweet``.
        step_size: Number of tweets per batch.
        save: When true, every completed batch (including the final, possibly
            shorter one) is written with ``list_to_json``.
        save_path: Prefix of the checkpoint files. It is concatenated as-is,
            so it should end with a path separator.

    Returns:
        A flat list with the result of ``process_tweet`` for every input
        tweet, in input order.

    Checkpoint files are named ``temp_<n>.json`` where ``n`` is the total
    number of tweets processed when the file is written. The final batch
    follows the same rule, so its name can never collide with (and overwrite)
    an earlier checkpoint.
    """
    processed = []
    batch = []
    tweets_done = 0
    for tweets_done, tweet in enumerate(tweets, start=1):
        batch.append(process_tweet(tweet, features))
        # A full batch is flushed as soon as it is complete.
        if tweets_done % step_size == 0:
            _checkpoint_batch(batch, tweets_done, save, save_path)
            processed.extend(batch)
            batch = []

    # Leftover tweets that did not fill a whole batch.
    if batch:
        _checkpoint_batch(batch, tweets_done, save, save_path)
        processed.extend(batch)

    return processed

In [None]:
def feature_extract(processed_tweets):
    """Build one flat feature dict per processed tweet.

    Each dict holds the entries of the tweet's ``"punctuation_features"``
    plus a ``"sarcastic"`` entry carrying the tweet's label. The input
    tweets are not modified.
    """
    return [
        dict(tweet["punctuation_features"], sarcastic=tweet["sarcastic"])
        for tweet in processed_tweets
    ]

In [None]:
# Process both classes together: punctuation features only, batches of 250, no checkpoint files.
processed_tweets = process_tweets(sarcastic_dataset+non_sarcastic_dataset, ["punctuation_features"], step_size=250, save=False)

# Fit the vectorizer first; its column names are only available once it has been fitted.
feature_matrix = vec.fit_transform(feature_extract(processed_tweets))

# scikit-learn 1.0 deprecated DictVectorizer.get_feature_names() and 1.2 removed it
# in favour of get_feature_names_out(); use whichever the installed version provides.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

dataframe = pd.DataFrame(data=feature_matrix.toarray(), columns=feature_names)
dataframe.head()

## Test train split

In [None]:
# Separate the label column from the remaining feature columns.
y = dataframe["sarcastic"]
X = dataframe.drop("sarcastic", axis=1)

# Hold out 40% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=10)

# One histogram of the labels per split, stacked vertically. Both titles report the
# number of labels in the split (the second one used to call that number "shape").
fig, (train_ax, test_ax) = plt.subplots(2, 1, figsize=(10, 7))
for ax, split_name, labels in ((train_ax, "Training", y_train), (test_ax, "Testing", y_test)):
    ax.set_title("{} dataset (size={})".format(split_name, labels.size))
    labels.hist(ax=ax, grid=False)


In [None]:
# Logistic-regression classifier created with scikit-learn's default hyperparameters.
clf = LogisticRegression()

In [None]:
# Train on the training split, then display the classifier's score on the held-out split.
clf.fit(X_train, y_train)
test_score = clf.score(X_test, y_test)
test_score