# Imports

In [2]:
import sqlite3, time, csv, re, random, string
import time
import codecs
import numpy as np
import sqlite3
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from langdetect import detect, DetectorFactory
from scipy.sparse import csr_matrix
DetectorFactory.seed = 0
import warnings
warnings.filterwarnings("ignore") 

# 1. Data Reading and Cleaning

Connecting to the SQLite database and loading the posts.
Posts are split into two dataframes: sarcastic and not sarcastic.

In [2]:
# Path to the SQLite dump of the May 2015 Reddit comments (machine-specific absolute path).
DB_PATH = '/home/IT/Documenti/ML/reddit-comments-may-2015/database.sqlite'
# Single connection reused by every query in this notebook.
sql_conn = sqlite3.connect(DB_PATH)

In [3]:
# Sarcastic sample: comments of 21-199 characters whose body ends with the " /s" marker.
# The adjacent string literals concatenate into a single SQL statement.
sarcastic_query = (
    "SELECT ups, name, subreddit, id, gilded, downs, score, body, controversiality, parent_id "
    "FROM May2015 "
    "WHERE LENGTH(body) > 20 AND LENGTH(body) < 200 AND body LIKE '% /s' "
    "LIMIT 50000"
)
df_sarc = pd.read_sql(sarcastic_query, sql_conn)

In [5]:
# Non-sarcastic sample: same length window, but the body must NOT end with " /s".
# The adjacent string literals concatenate into a single SQL statement.
serious_query = (
    "SELECT ups, name, subreddit, id, gilded, downs, score, body, controversiality, parent_id "
    "FROM May2015 "
    "WHERE LENGTH(body) > 20 AND LENGTH(body) < 200 AND body NOT LIKE '% /s' "
    "LIMIT 25000"
)
df_ser = pd.read_sql(serious_query, sql_conn)

Removing the "/s" labels from sarcastic posts.

In [6]:
# Strip only the trailing " /s" marker that the SQL filter selected on.
# The pattern is anchored with `$` so that text such as " /something" in the
# middle of a comment is left untouched (the unanchored pattern removed every
# " /s" occurrence).
# The result is assigned back to the column instead of using `inplace=True` on
# a column selection, which is a chained assignment and does not update the
# frame under pandas Copy-on-Write.
df_sarc['body'] = df_sarc['body'].str.replace(r' /s$', '', regex=True)

Labeling dataframes.

In [7]:
# Target label: 1 for sarcastic posts, 0 for serious ones.
# A scalar is broadcast to every row, so there is no need to build a Python
# list as long as the frame.
df_sarc['is_sarc'] = 1
df_ser['is_sarc'] = 0

Joining the two dataframes, in order to clean them at once. 

In [9]:
# Stack the two labelled frames, shuffle the rows and renumber them 0..n-1.
# `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
# The shuffle is seeded because the train/test split further down is purely
# positional: without a fixed seed every run trains and tests on different rows.
data = shuffle(pd.concat([df_ser, df_sarc]), random_state = 0).reset_index(drop = True)

Applying the cleaner function, which removes non-ASCII symbols and digits, replaces URLs with the placeholder `URL`, and turns newlines and carriage returns into spaces.

In [56]:
def cleaner(text):
    """Normalize the raw body of a post.

    Steps, in order:
      1. drop every non-ASCII character and every digit;
      2. replace URLs (``www.…`` or ``http(s)://…``) with the token ``URL``;
      3. replace newlines and carriage returns with a single space each.

    Parameters
    ----------
    text : str
        Raw post body.

    Returns
    -------
    str
        The cleaned text.
    """
    # The original filter tested `i.isdigit()` while iterating over `word`,
    # so it depended on an unrelated global `i`; test the character itself.
    text = ''.join(ch for ch in text if ord(ch) < 128 and not ch.isdigit())
    # Raw strings keep `\.` and `\s` as regex escapes instead of (invalid)
    # Python string escapes.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)
    # Each line break character becomes one space.
    text = re.sub(r'[\n\r]', ' ', text)
    return text

In [57]:
# Run every post body through `cleaner`, element by element, and store the result back.
data['body'] = data['body'].map(cleaner)

Removing non-English posts.

In [11]:
def _is_english(text):
    """Return True when langdetect classifies `text` as English.

    Texts on which detection fails are treated as non-English, so they are
    discarded exactly like posts detected as another language.
    """
    try:
        return detect(text) == "en"
    except Exception:
        # Best effort: any detection failure means "drop this post".
        # `Exception` (not a bare `except`) lets KeyboardInterrupt through,
        # so this long-running cell can still be interrupted.
        return False

# Build a boolean mask in one pass and filter once, instead of dropping rows
# one by one in place. Renumbering the index keeps the frame consistent with
# positional access and makes the cell safe to re-run.
data = data[data["body"].map(_is_english)].reset_index(drop = True)

Dropping the columns I don't need for classification.

In [12]:
# Metadata columns that are not used as features. `body`, `subreddit` and
# `is_sarc` are kept.
unused_columns = ["name", "id", "parent_id", "ups", "downs", "score",
                  "gilded", "distinguished", "controversiality"]
# "distinguished" is not part of the SELECT statements above, so a strict drop
# raises KeyError; `errors="ignore"` skips labels that are absent. This also
# makes the cell safe to re-run. The duplicated "parent_id" entry was removed.
data = data.drop(columns = unused_columns, errors = "ignore")

# Model Building and Training

Splitting the data into training and test sets.

In [6]:
# Row position that separates the first 4/5 of the rows from the rest.
train_fraction = 4/5
margin = int(train_fraction * len(data))

In [7]:
# Positional split: rows before `margin` are used for training, the rest for testing.
train_part, test_part = data.iloc[:margin], data.iloc[margin:]
# Targets are taken out first ...
y_train, y_test = train_part["is_sarc"], test_part["is_sarc"]
# ... and then removed from the feature frames.
text_train = train_part.drop(columns = "is_sarc")
text_test = test_part.drop(columns = "is_sarc")

Defining classification models which will be tested one against the other.

In [9]:
# Candidate classifiers, keyed by the short label used throughout the notebook.
models = dict(LR = LogisticRegression(), NB = MultinomialNB())

Making a Bag of Words with unigrams and bigrams out of the body of the training set. Deleting stopwords.

In [10]:
# Bag of words over unigrams and bigrams, English stop words removed,
# keeping only terms that appear in at least 5 training posts.
vect = CountVectorizer(min_df = 5, stop_words = "english", ngram_range = (1, 2))
vect.fit(text_train["body"])
# Sparse document-term count matrix for the training posts.
X_train = vect.transform(text_train["body"])

For each model, finding the best parameters. For each best-parameter estimator, choosing the most informative ngrams. Adding to the text of the post the most informative ngrams with the subreddit concatenated. As an example, "hello i am italian" in the subreddit "greetings", supposing "hello" and "italian" are informative, becomes "hello i am italian hello_greetings italian_greetings hello_italian_greetings". The rationale behind this is to consider each word in its context.

In [8]:
def make_concat(label) :
    """Return a one-argument function that applies `concat_subreddit` for `label`.

    The returned callable takes a single row, which makes it usable with
    `DataFrame.apply(..., axis = 1)`.
    """
    def concat_for_label(row) :
        return concat_subreddit(row, label)
    return concat_for_label

def concat_subreddit(row, label) :
    """Tag the informative words of a post with the post's subreddit.

    The body is split on single spaces. Every word found in the module-level
    `useful_ngrams[label]` becomes ``word_subreddit``; every other word becomes
    an empty string. The pieces are joined back with single spaces, so skipped
    words leave consecutive spaces in the result.

    Parameters
    ----------
    row : mapping with the keys 'body' and 'subreddit'
    label : key into `useful_ngrams` selecting the model's word list

    Returns
    -------
    str
    """
    informative = useful_ngrams[label]
    suffix = '_' + row['subreddit']
    tagged = [
        (token + suffix) if token in informative else ''
        for token in row['body'].split(' ')
    ]
    return ' '.join(tagged)

In [11]:
# Per-model outputs, keyed by model label.
useful_ngrams = {}   # label -> list of the n-grams kept by feature selection
dataframes = {}      # label -> copy of `data` with the augmented text columns
# Regularisation / smoothing strengths explored for each model.
param_grid = {'LR' : {'C' : [0.001, 0.01, 0.1, 1, 10]}, 'NB' : {'alpha' : [0.001, 0.01, 0.1, 1, 10]}}

# `vect.vocabulary_` maps term -> column index, and the order of its keys is
# NOT the column order. Indexing `list(vocabulary_.keys())` with a column index
# therefore returns unrelated terms. Invert the mapping once so the selected
# column indices translate back to the right n-grams.
index_to_term = {index : term for term, index in vect.vocabulary_.items()}

for label, model in models.items() :
    # 5-fold grid search for the best hyper-parameter of this model.
    grid = GridSearchCV(model, param_grid[label], cv = 5)
    grid.fit(X_train, y_train)
    # Keep the features whose importance in the refitted best estimator
    # reaches the 0.1 threshold.
    words_selector = SelectFromModel(grid.best_estimator_, threshold = 10**(-1)).fit(X_train, y_train)
    useful_ngrams[label] = [index_to_term[i] for i in words_selector.get_support(indices = True)]
    # Build the augmented text: original body + informative words tagged with the subreddit.
    dataframes[label] = data.copy()
    dataframes[label]['body_subreddit'] = dataframes[label][['body', 'subreddit']].apply(make_concat(label), axis = 1)
    dataframes[label]['text'] = dataframes[label]['body'] + ' ' + dataframes[label]['body_subreddit']

# Testing

In [12]:
# Augmented text column of each model's frame, keyed by model label.
X_text = dict((name, dataframes[name]['text']) for name in models)
# Target labels, shared by all models.
y = data['is_sarc']

In [13]:
# One document-term matrix per model, each built by a fresh vectorizer
# (same settings as the training vectorizer) fitted on that model's augmented text.
X = {
    label : CountVectorizer(min_df = 5, stop_words = "english", ngram_range = (1, 2)).fit_transform(X_text[label])
    for label in models.keys()
}

# Plots

In [None]:
from sklearn.model_selection import learning_curve
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Fresh, unfitted estimators for the learning-curve experiment.
models = {'LR' : LogisticRegression(), 'NB' : MultinomialNB()}
scor_functs = ['accuracy', 'precision', 'recall']
# Results are stored as result[scoring][model label].
train_sizes = {scor : {} for scor in scor_functs}
train_scores = {scor : {} for scor in scor_functs}
valid_scores = {scor : {} for scor in scor_functs}
for label, estimator in models.items() :
    for scor in scor_functs :
        # Each point of the curve runs its own grid search over `param_grid[label]`.
        sizes, fit_scores, cv_scores = learning_curve(
            GridSearchCV(estimator, param_grid[label]), X[label], y,
            train_sizes = [ 0.3, 0.7, 0.8, 0.9, 1.], scoring = scor
        )
        train_sizes[scor][label] = sizes
        train_scores[scor][label] = fit_scores
        valid_scores[scor][label] = cv_scores

In [None]:
# One stacked panel per scoring function, all sharing the train-size axis.
fig, axes = plt.subplots(3, 1, figsize = (10, 20), sharex = True)
fig.suptitle('Model Comparison')
for ax, scor in zip(axes, scor_functs) :
    for label in models.keys() :
        sizes = train_sizes[scor][label]
        # Average over axis 1 to get a single score per train size.
        ax.plot(sizes, np.mean(train_scores[scor][label], 1), label = 'Train ' + label)
        ax.plot(sizes, np.mean(valid_scores[scor][label], 1), label = 'Test ' + label)
        ax.legend()
    ax.set_ylabel(scor)
    ax.set_xlabel('train size')
#plt.savefig('evaluation_graphs.png')