In [0]:
subreddit_train = "coursework_subreddit_train.json"
subreddit_test = "coursework_subreddit_test.json"

Copying gs://textasdata/coursework/coursework_subreddit_train.json...
/ [1 files][ 10.1 MiB/ 10.1 MiB]                                                
Operation completed over 1 objects/10.1 MiB.                                     
Copying gs://textasdata/coursework/coursework_subreddit_test.json...
/ [1 files][  2.7 MiB/  2.7 MiB]                                                
Operation completed over 1 objects/2.7 MiB.                                      


# Subreddit prediction

This section attempts to predict the type of subreddit a post is being located.

In [0]:
import json
import eli5
import collections
import itertools
import pandas as pd
import spacy
import nltk
import numpy as np
import scipy.sparse
import gensim
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.dummy import DummyClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.preprocessing import FunctionTransformer
from scipy.sparse import coo_matrix, hstack
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import precision_recall_fscore_support as score
import warnings

warnings.filterwarnings('ignore')

In [0]:
test_threads = pd.read_json(path_or_buf=subreddit_test, lines=True)
train_threads = pd.read_json(path_or_buf=subreddit_train, lines=True)

In [0]:
nlp = spacy.load('en_core_web_sm', disable=['ner'])
nlp.remove_pipe('tagger')
nlp.remove_pipe('parser')

('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f6f5b237408>)

In [0]:
def frames(local_file):
    
    posts_tmp = list()
    
    with open(local_file) as jsonfile:
        for i, line in enumerate(jsonfile):
            thread = json.loads(line)
            subs = ''
            titles = ''
            url = ''
            tid = ''
            author = ''
            body = ''
      
        for post in thread['posts']: 
            subs = thread['subreddit']
            titles = thread['title']
            url = thread['url']
            tid += " - "+ post['id']
            author += " - "+ post.get('author', "")
            body += " - "+ post.get('body', "")
        
    posts_tmp.append((subs, titles, url,tid, author, body))
    labels = ['subreddit', 'title', 'url', 'id', 'author', 'body']
    post_frame = pd.DataFrame(posts_tmp, columns=labels)

    return post_frame

In [0]:
train_frame = frames(subreddit_train)
test_frame = frames(subreddit_test)

# Text preprocessing

In [0]:
def spacy_tokenize(string):
    tokens = list()
    doc = nlp(string)
    for token in doc:
        tokens.append(token)
    return tokens

def normalize(tokens):
    normalized_tokens = list()
    for token in tokens:
        normalized = token.text.lower().strip()
        if ((token.is_alpha or token.is_digit)):
            normalized_tokens.append(normalized)
    return normalized_tokens

def tokenize_normalize(string):
    return normalize(spacy_tokenize(string))

In [0]:
class ItemSelector(BaseEstimator, TransformerMixin):

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

# One hot encoding

In [0]:
prediction_pipeline = Pipeline([
        ('union', FeatureUnion(
          transformer_list=[

              ('author', Pipeline([
              ('selector', ItemSelector(key='author')),
              ('one-hot', CountVectorizer(tokenizer=tokenize_normalize, binary=True)), 
              ])),
            ('body', Pipeline([
              ('selector', ItemSelector(key='body')),
              ('one-hot', CountVectorizer(tokenizer=tokenize_normalize, binary=True)), 
              ])),
           ('title', Pipeline([
              ('selector', ItemSelector(key='title')),
              ('one-hot', CountVectorizer(tokenizer=tokenize_normalize, binary=True)), 
              ])),
        ])
        )
    ])

In [0]:
one_hot_train_features = prediction_pipeline.fit_transform(train_frame)
one_hot_test_features = prediction_pipeline.transform(test_frame)

# Models

In [0]:
cb = BernoulliNB()
cb.fit(one_hot_train_features, train_frame['subreddit'])
y_pred = cb.predict(one_hot_test_features)

precision, recall, fscore, support = score(test_frame['subreddit'], y_pred, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.06893084622383985
recall: 0.08476512226512227
fscore: 0.06338820800836763


In [0]:
lr2 = LogisticRegression()
lr2.fit(one_hot_train_features,train_frame['subreddit'])
y_pred = lr2.predict(one_hot_test_features)

precision, recall, fscore, support = score(test_frame['subreddit'], y_pred, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.7107111768287452
recall: 0.5585096881883966
fscore: 0.5913528192003266


In [0]:
clf3 = svm.SVC()
clf3.fit(one_hot_train_features,train_frame['subreddit'])
y_pred =  clf3.predict(one_hot_test_features)

precision, recall, fscore, support = score(test_frame['subreddit'], y_pred, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.23031464461710555
recall: 0.1836572492822493
fscore: 0.1709624676825674


As seen on the table below the best performing model with one hot encoding is __Logistic Regression__ .

| Model | f1score |       
| :- |-------------: |
|BernouliNB| 0.0633| 
|Logistic Regression| 0.5913| 
|SVC| 0.1709| 

# Tfidf

In [0]:
tfIdf_prediction_pipeline = Pipeline([
        ('union', FeatureUnion(
          transformer_list=[
            ('author', Pipeline([
              ('selector', ItemSelector(key='author')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
            ('body', Pipeline([
              ('selector', ItemSelector(key='body')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
           ('title', Pipeline([
              ('selector', ItemSelector(key='title')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
        ])
        )
    ])

In [0]:
tf_id_train_features = tfIdf_prediction_pipeline.fit_transform(train_frame)
tf_idf_test_features = tfIdf_prediction_pipeline.transform(test_frame)

In [0]:
cbtf = BernoulliNB()
cbtf.fit(tf_id_train_features, train_frame['subreddit'])
y_pred = cbtf.predict(tf_idf_test_features)

precision, recall, fscore, support = score(test_frame['subreddit'], y_pred, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.06893084622383985
recall: 0.08476512226512227
fscore: 0.06338820800836763


In [0]:
lr = LogisticRegression()
lr.fit(tf_id_train_features,train_frame['subreddit'])
y_pred = lr.predict(tf_idf_test_features)

precision, recall, fscore, support = score(test_frame['subreddit'], y_pred, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.6556506213911698
recall: 0.39359614937779897
fscore: 0.42667097632307094


In [0]:
sv1 = svm.SVC()
sv1.fit(tf_id_train_features,train_frame['subreddit'])
y_pred = sv1.predict(tf_idf_test_features)
precision, recall, fscore, support = score(test_frame['subreddit'], y_pred, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.32662791467938523
recall: 0.22652035138448184
fscore: 0.21126724275499886


As seen on the table below the best performing model with tfidf is __Logistic Regression__ .

| Model | f1score |       
| :- |-------------: |
|BernouliNB| 0.0633| 
|Logistic Regression| 0.4266| 
|SVC| 0.2112| 

# Best params - RandomizedSearchCV

In [0]:
improved_pipeline = Pipeline([
        ('union', FeatureUnion(
          transformer_list=[
            ('title', Pipeline([
              ('selector', ItemSelector(key='title')),
              ('TfIdf', TfidfVectorizer(tokenizer=tokenize_normalize, 
                                        max_features=10000, ngram_range=(1,2), 
                                        sublinear_tf=False)),
              ])),
            ('body', Pipeline([
              ('selector', ItemSelector(key='body')),
              ('TfIdf', TfidfVectorizer(tokenizer=tokenize_normalize, 
                                        max_features=10000, ngram_range=(1,3), 
                                        sublinear_tf=False)),
              ])),
          ('author', Pipeline([
              ('selector', ItemSelector(key='author')),
              ('TfIdf', TfidfVectorizer(tokenizer=tokenize_normalize, 
                                        max_features=10000, ngram_range=(1,2), 
                                        sublinear_tf=True)),
              ])),
        ])
        ),('logreg', LogisticRegression(C=1000, multi_class='auto', solver = 'sag'))
    ])

In [0]:
params = {
    'logreg__C':[0.001, 0.01, 0.1, 1, 10 , 100, 1000, 10000],
    'logreg__multi_class' : ['ovr', 'multinomial','auto'],
    'logreg__solver' : ['saga' , 'sag' , 'lbfgs', 'newton-cg'],
    'union__author__TfIdf__max_features' :  (10000, 50000),
    'union__body__TfIdf__max_features' :  (10000, 50000),
    'union__title__TfIdf__max_features' :  (10000, 50000),  
    'union__body__TfIdf__ngram_range' :  ((1,2), (1,3)),
    'union__title__TfIdf__ngram_range' :  ((1,2), (1,3)),
    'union__author__TfIdf__ngram_range' :  ((1,2), (1,3)),
    'union__author__TfIdf__sublinear_tf' :  (False, True),
    'union__body__TfIdf__sublinear_tf' :  (False, True),
    'union__title__TfIdf__sublinear_tf' :  (False, True)
}

grid_search = RandomizedSearchCV(improved_pipeline, param_distributions=params, n_jobs=1, verbose=1, scoring='f1_macro', cv=2)
print("Performing grid search...")
print("pipeline:", [name for name, _ in improved_pipeline.steps])
print("parameters:")
print(params)
grid_search.fit(train_frame, train_frame['subreddit'])

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(params.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['union', 'logreg']
parameters:
{'logreg__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 'logreg__multi_class': ['ovr', 'multinomial', 'auto'], 'logreg__solver': ['saga', 'sag', 'lbfgs', 'newton-cg'], 'union__author__TfIdf__max_features': (10000, 50000), 'union__body__TfIdf__max_features': (10000, 50000), 'union__title__TfIdf__max_features': (10000, 50000), 'union__body__TfIdf__ngram_range': ((1, 2), (1, 3)), 'union__title__TfIdf__ngram_range': ((1, 2), (1, 3)), 'union__author__TfIdf__ngram_range': ((1, 2), (1, 3)), 'union__author__TfIdf__sublinear_tf': (False, True), 'union__body__TfIdf__sublinear_tf': (False, True), 'union__title__TfIdf__sublinear_tf': (False, True)}
Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  5.2min finished


Best score: 0.478
Best parameters set:
	logreg__C: 1000
	logreg__multi_class: 'auto'
	logreg__solver: 'lbfgs'
	union__author__TfIdf__max_features: 50000
	union__author__TfIdf__ngram_range: (1, 2)
	union__author__TfIdf__sublinear_tf: False
	union__body__TfIdf__max_features: 10000
	union__body__TfIdf__ngram_range: (1, 3)
	union__body__TfIdf__sublinear_tf: False
	union__title__TfIdf__max_features: 10000
	union__title__TfIdf__ngram_range: (1, 2)
	union__title__TfIdf__sublinear_tf: True


# Improved model

In [0]:
improved_pipeline.fit(train_frame, train_frame['subreddit'])
y_predict = improved_pipeline.predict(test_frame)

precision, recall, fscore, support = score(test_frame['subreddit'], y_pred, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.23031464461710555
recall: 0.1836572492822493
fscore: 0.1709624676825674


The __improved model__ with the acquired parameters scored the following:

| Model | f1score |
| :- |-------------: |
|Improved Logistic Regression| 0.1709|

# Feature development - post depth / average length

Two features were developped in order to attempt enhancing the model's performance. The features are the following;

1. __post depth__ : The nested length of a post in a subreddit.

2. __average length__: The average length of a post.

In [0]:
def features2(local_file):
    
    posts_tmp = list()
    
    with open(local_file) as jsonfile:
        for i, line in enumerate(jsonfile):
            thread = json.loads(line)
            bodies = ""
            subs = ''
            titles = ''
            url = ''
            tid = ''
            author = ''
            body = ''
            
        for post in thread['posts']:
            subs = thread['subreddit']
            titles = thread['title']
            url = thread['url']
            tid += " " + post['id']
            author += " " + post.get('author', "")
            body += " " + post.get('body', "")
            bodies = bodies +"" + post.get('body', "")
            thread_length = len(bodies)
            post_depth = post.get('post_depth', 0)
        
    posts_tmp.append((subs, titles, author, body, post_depth, thread_length))
    labels = ['subreddit', 'title', 'author', 'body','post_depth', 'thread_length']
    post_frame = pd.DataFrame(posts_tmp, columns=labels)
  
    return post_frame

In [0]:
imp_train = features2(subreddit_train)
imp_test = features2(subreddit_test)

In [0]:
improved_pipeline_new_feature = Pipeline([
        ('union', FeatureUnion(
          transformer_list=[
            ('title', Pipeline([
              ('selector', ItemSelector(key='title')),
              ('TfIdf', TfidfVectorizer(tokenizer=tokenize_normalize, 
                                        max_features=10000, ngram_range=(1,2), 
                                        sublinear_tf=False)),
              ])),
            ('body', Pipeline([
              ('selector', ItemSelector(key='body')),
              ('TfIdf', TfidfVectorizer(tokenizer=tokenize_normalize, 
                                        max_features=10000, ngram_range=(1,3), 
                                        sublinear_tf=False)),
              ])),
          ('author', Pipeline([
              ('selector', ItemSelector(key='author')),
              ('TfIdf', TfidfVectorizer(tokenizer=tokenize_normalize, 
                                        max_features=10000, ngram_range=(1,2), 
                                        sublinear_tf=True)),
              ])),
        ])
        ),
#     ('logreg', LogisticRegression(C=1000, multi_class='auto', solver = 'saga'))
    ])

In [0]:
x_new = improved_pipeline_new_feature.fit_transform(imp_train, imp_train['subreddit'])
x_test =improved_pipeline_new_feature.transform(imp_test)
x_train_new = hstack([x_new, imp_train.post_depth.values.reshape(1,1456).T.astype(float)])
x_test_new = hstack([x_test, imp_test.post_depth.values.reshape(1,365).T.astype(float)])

clf = LogisticRegression(C=1000, multi_class='auto', solver = 'saga')
clf.fit(x_train_new,imp_train['subreddit'])
predicts = clf.predict(x_test_new)

precision, recall, fscore, support = score(imp_test['subreddit'], y_pred, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.23031464461710555
recall: 0.1836572492822493
fscore: 0.1709624676825674


In [0]:
x_new = improved_pipeline_new_feature.fit_transform(imp_train, imp_train['subreddit'])
x_test =improved_pipeline_new_feature.transform(imp_test)
x_train_new = hstack([x_new, imp_train.thread_length.values.reshape(1,1456).T.astype(float)])
x_test_new = hstack([x_test, imp_test.thread_length.values.reshape(1,365).T.astype(float)])

clf = LogisticRegression(C=1000, multi_class='auto', solver = 'saga')
clf.fit(x_train_new,imp_train['subreddit'])
predicts = clf.predict(x_test_new)

precision, recall, fscore, support = score(imp_test['subreddit'], y_pred, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.23031464461710555
recall: 0.1836572492822493
fscore: 0.1709624676825674


# Reddit discourse prediction

This section attempts to predict the type of a discourse occuring within a reddit post. The type good be anything between agreeement, humohr, answer, negative reaction etc.

In [0]:
discourse_train = "coursework_discourse_train.json"
discourse_test = "coursework_discourse_test.json"

Copying gs://textasdata/coursework/coursework_discourse_train.json...
| [1 files][ 60.2 MiB/ 60.2 MiB]                                                
Operation completed over 1 objects/60.2 MiB.                                     
Copying gs://textasdata/coursework/coursework_discourse_test.json...
- [1 files][ 15.1 MiB/ 15.1 MiB]                                                
Operation completed over 1 objects/15.1 MiB.                                     


In [0]:
def load_posts(file):
    
    posts_tmp = list()
    
    with open(file) as jsonfile:
        
        for i, line in enumerate(jsonfile):
            thread = json.loads(line)

    for post in thread['posts']:
    posts_tmp.append((thread['subreddit'], thread['title'], thread['url'],
                    post['id'], post.get('author', ""), post.get('body', ""), post.get("majority_link", ""), 
                    post.get('post_depth', 0), post.get('majority_type', ""),
                    post.get('in_reply_to', "") ))

    labels = ['subreddit', 'title', 'url', 'id', 'author', 'body', 'majority_link', 'post_depth', 'discourse_type', 'in_reply_to']

    return pd.DataFrame(posts_tmp, columns=labels)

In [0]:
train_posts = load_posts(discourse_train)
test_posts = load_posts(discourse_test)
train_posts = train_posts[train_posts['discourse_type'] != ""]
test_posts = test_posts[test_posts['discourse_type'] != ""]

# Discourse model

In [0]:
discourse_pipeline = Pipeline([
        ('union', FeatureUnion(
          transformer_list=[
            ('body', Pipeline([
              ('selector', ItemSelector(key='body')),
              ('TfIdf', TfidfVectorizer(tokenizer=tokenize_normalize, 
                                        max_features=None, ngram_range=(1,1), 
                                        sublinear_tf=True)),
              ])),
            ('subreddit', Pipeline([
              ('selector', ItemSelector(key='subreddit')),
              ('TfIdf', TfidfVectorizer(tokenizer=tokenize_normalize, 
                                        max_features=None, ngram_range=(1,1), 
                                        sublinear_tf=True)),
              ])),
          ('author', Pipeline([
              ('selector', ItemSelector(key='author')),
              ('TfIdf', TfidfVectorizer(tokenizer=tokenize_normalize, 
                                        max_features=None, ngram_range=(1,1), 
                                        sublinear_tf=True)),
              ])),
        ])
        ),('logreg', LogisticRegression(C=10000, multi_class='multinomial', solver = 'lbfgs')),
    ])

In [0]:
X_train = train_posts[['author','body','subreddit']]
X_test = test_posts[['author','body','subreddit']]

In [0]:
discourse_pipeline.fit(X_train, train_posts['discourse_type'])
discourse_pipeline.predict(X_test)
y_pred = discourse_pipeline.predict(X_test)

precision, recall, fscore, support = score( test_posts['discourse_type'],y_pred, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.33480985697809784
recall: 0.2743430065918857
fscore: 0.29182960553710713


The __Logistic Regression__ model used for the discourse prediction scored the following:

| Model | f1score |
| :- |-------------: |
|Logistic Regression| 0.2918|

## Discourse feature engineering

A number of new features will be created from the existis data in order to attempt enhancing th epredicting power power of the model. The features wll be tested individually and combined. The following feautres have been created.

1. __Total comments__ : Number of total comments within a post.

2. __Type of subreddit__ : Type of subreddit the post belongs to.

3. __Post depth__ : Nested length of post.

4. __Body length__ : Number of characters in the post.

5. __Same author as top level__ : If its a self post.

6. __Punctuation tokenization__ : Tokenize punction etc.

In [0]:
def tokenize_normalize_punct(string):
    return normalize2(spacy_tokenize(string))

def normalize2(tokens):
    normalized_tokens = list()
    
    for token in tokens:
        normalized = token.text.lower().strip()
        
    if ((token.is_alpha or token.is_digit or token.is_punct )):
        normalized_tokens.append(normalized)
        
    return normalized_tokens

In [0]:
def comments(df):
    return df['total_comments'].values.reshape(len(df),1)

call_total_comments = FunctionTransformer(comments, validate=False)

def selfauthor(df):
    return df['same_auth'].values.reshape(len(df),1)

call_self_author = FunctionTransformer(selfauthor, validate=False)

In [0]:
def postz(file):
    posts_tmp = list()
    
    with open(file) as jsonfile:
        
        for i, line in enumerate(jsonfile):
            thread = json.loads(line)
            c = 0
            post_length =0
            body_length=0
            total_comments=0   
        
        for post in thread['posts']:          
            body = post.get('body', "")
            post_length += len(body)
            total_comments +=1
            c+=1
            
        firstPost = True      
        for post in thread['posts']:       
            body = post.get('body', "")
            body_length = len(body)
            post_depth =''        
                  
        same_auth = '0'
        
        if (thread['is_self_post'] == None):
            thread['is_self_post'] = 0
            
        thread['top_author'] = thread['posts'][0].get('author',"")     
        
        if(thread['top_author']==post.get('author',"")):
            same_auth='1'
          
        if post.get('post_depth',"") == 1:
            post_depth='A'
        elif post.get('post_depth',"")==2:
            post_depth='B'
        elif post.get('post_depth',"")==3:
            post_depth='C'
        elif post.get('post_depth',"")==4:
            post_depth='D'
        elif post.get('post_depth',"")==5:
            post_depth='E'
        else:
            post_depth='F'                    
        if post.get('is_self_post',"")==1.0:
            self_post= 'True'
        else:
            self_post = 'False'             
        firstPost=False
        
        length_group=''
        if(body_length < 100):
            length_group = 'Small'
        elif(body_length > 100 & body_length< 500):
            length_group = 'Average'
        elif(body_length > 500 & body_length< 1000):
            length_group = 'Long'
        else:
            length_group = 'Too Long'

          
    total_comments = len(thread['posts'])
    sub_auth = thread['subreddit'] + "," +  post.get('author', "")
    posts_tmp.append((self_post,thread['subreddit'], thread['title'], thread['url'],
                    post['id'], post.get('author', ""),body, post.get("majority_link", ""), 
                    post.get('post_depth', 0), post.get('majority_type', ""),  
                    post.get('in_reply_to', ""), post_depth, total_comments, length_group, sub_auth, same_auth))

    labels = ['selfpost','subreddit', 'title', 'url', 'id', 'author', 'body', 'majority_link', 
          'post_depth', 'discourse_type', 'in_reply_to', 'post_depthcat', 'total_comments','body_length','sub_auth','same_auth']
    return pd.DataFrame(posts_tmp, columns=labels)

In [0]:
train_posts_4 = postz(discourse_train)
test_posts_4 = postz(discourse_test)

train_labels_2 = train_posts_4['discourse_type']
test_labels_2 = test_posts_4['discourse_type']

# Feature 1 - Total comments

In [0]:
feature1 = Pipeline([
        ('union', FeatureUnion(
          transformer_list=[              
            ('body', Pipeline([
              ('selector', ItemSelector(key='body')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
              
            ('author', Pipeline([
              ('selector', ItemSelector(key='author')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
              
            ('title', Pipeline([
              ('selector', ItemSelector(key='title')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
            ('Total_comments', call_total_comments),             
        ])
        ),
       ('classifier', LogisticRegression(C=10000, solver = 'saga', multi_class = 'multinomial'))
    ])

In [0]:
feature1.fit(train_posts_4, train_labels_2)
f1_pred = feature1.predict(test_posts_4)

precision, recall, fscore, support = score(f1_pred, test_labels_2, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.20485760471269288
recall: 0.29769542354788087
fscore: 0.20987265667041302


# Feature 2 - Subreddit

In [0]:
feature2 = Pipeline([
        ('union', FeatureUnion(
          transformer_list=[
              
            ('body', Pipeline([
              ('selector', ItemSelector(key='body')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
              
            ('author', Pipeline([
              ('selector', ItemSelector(key='author')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
              
            ('title', Pipeline([
              ('selector', ItemSelector(key='title')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
            ('subreddit', Pipeline([
              ('selector', ItemSelector(key='subreddit')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),            
        ])
        ),
       ('classifier', LogisticRegression(C=10000, solver = 'saga', multi_class = 'multinomial'))
    ])

In [0]:
feature2.fit(train_posts_4, train_labels_2)
f2_pred = feature1.predict(test_posts_4)

precision, recall, fscore, support = score(f2_pred, test_labels_2, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.20485760471269288
recall: 0.29769542354788087
fscore: 0.20987265667041302


# Feature 3 - Post depth

In [0]:
feature3 = Pipeline([
        ('union', FeatureUnion(
          transformer_list=[
              
            ('body', Pipeline([
              ('selector', ItemSelector(key='body')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
              
            ('author', Pipeline([
              ('selector', ItemSelector(key='author')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
              
            ('title', Pipeline([
              ('selector', ItemSelector(key='title')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
            ('post_depthcat', Pipeline([
              ('selector', ItemSelector(key='post_depthcat')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)),
              ])),            
        ])
        ),
       ('classifier', LogisticRegression(C=10000, solver = 'saga', multi_class = 'multinomial'))
    ])

In [0]:
feature3.fit(train_posts_4, train_labels_2)
f3_pred = feature3.predict(test_posts_4)

precision, recall, fscore, support = score(f3_pred, test_labels_2, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.2887597008373968
recall: 0.33282390985210586
fscore: 0.29922508937478526


# Feature 4 - Body length

In [0]:
feature4 = Pipeline([
        ('union', FeatureUnion(
          transformer_list=[
              
            ('body', Pipeline([
              ('selector', ItemSelector(key='body')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
              
            ('author', Pipeline([
              ('selector', ItemSelector(key='author')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
              
            ('title', Pipeline([
              ('selector', ItemSelector(key='title')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
            ('body_length', Pipeline([
              ('selector', ItemSelector(key='body_length')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),            
        ])
        ),
       ('classifier', LogisticRegression(C=10000, solver = 'saga', multi_class = 'multinomial'))
    ])

In [0]:
feature4.fit(train_posts_4, train_labels_2)
f4_pred = feature4.predict(test_posts_4)

precision, recall, fscore, support = score(f4_pred, test_labels_2, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.2134718990937028
recall: 0.26014870220896325
fscore: 0.22459078097529203


# Feature 5 - Same author as top level

In [0]:
feature5 = Pipeline([
        ('union', FeatureUnion(
          transformer_list=[
              
            ('body', Pipeline([
              ('selector', ItemSelector(key='body')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
              
            ('author', Pipeline([
              ('selector', ItemSelector(key='author')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
              
            ('title', Pipeline([
              ('selector', ItemSelector(key='title')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
            ('same_auth', Pipeline([
              ('selector', ItemSelector(key='sub_auth')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),            
        ])
        ),
       ('classifier', LogisticRegression(C=10000, solver = 'saga', multi_class = 'multinomial'))
    ])

In [0]:
feature5.fit(train_posts_4, train_labels_2)
f5_pred = feature5.predict(test_posts_4)

precision, recall, fscore, support = score(f5_pred, test_labels_2, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.21531899052410378
recall: 0.25819551692072545
fscore: 0.22601696678350655


# Feature 6 - Punctuation tokenization

In [0]:
feature6 = Pipeline([
        ('union', FeatureUnion(
          transformer_list=[
              
            ('body', Pipeline([
              ('selector', ItemSelector(key='body')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
              
            ('author', Pipeline([
              ('selector', ItemSelector(key='author')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
              
            ('title', Pipeline([
              ('selector', ItemSelector(key='title')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
            ('body_punc', Pipeline([
              ('selector', ItemSelector(key='body')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize_punct)), 
              ])),            
        ])
        ),
       ('classifier', LogisticRegression(C=10000, solver = 'saga', multi_class = 'multinomial'))
    ])

In [0]:
feature6.fit(train_posts_4, train_labels_2)
f6_pred = feature6.predict(test_posts_4)

precision, recall, fscore, support = score(f6_pred, test_labels_2, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.2504415186557159
recall: 0.29583926188301146
fscore: 0.2625115587745551


# Feature combination

In [0]:
def loader(file):
    
    posts_tmp = list()

    with open(file) as jsonfile:
        for i, line in enumerate(jsonfile):
            thread = json.loads(line)    
            c = 0
            avg_body_length=0.0
            post_length = 0
            body_length=0
            total_comments=0
            
        for post in thread['posts']:        
            body = post.get('body', "")
            post_length += len(body)
            total_comments +=1
            c+=1  
        
        for post in thread['posts']:
            body = post.get('body', "")
            body_length = len(body)
            post_depth =''
            length_group=''
            
        if ((post.get('post_depth',"") == 1) or (post.get('post_depth',"") == 2)):
            post_depth= 'small'
        elif ((post.get('post_depth',"") == 3) or (post.get('post_depth',"") == 4)):
            post_depth='medium'
        else:
            post_depth='large'   
        if post.get('is_self_post',"") == 1.0:
            self_post= 'True'
        else:
            self_post = 'False' 
            
        same_auth=0
        
        if( thread['is_self_post'] == None):
            thread['is_self_post'] = 0
            
        thread['top_author']=thread['posts'][0].get('author',"")
        
        if(thread['top_author']==post.get('author',"")):
            same_auth=1  
            
    total_comments = len(thread['posts'])
    sub_auth = thread['subreddit'] + "," +  post.get('author', "")
    posts_tmp.append((self_post,thread['subreddit'], thread['title'], thread['url'],
                    post['id'], post.get('author', ""),body, 
                    post.get('post_depth', 0), post.get('majority_type', ""),  
                    post_depth, total_comments, length_group, sub_auth, same_auth))

    labels = ['selfpost','subreddit', 'title', 'url', 'id', 'author', 'body',
            'discourse_type', 'post_depth', 'post_size', 'total_comments','body_length','sub_auth','same_auth']
  
    return pd.DataFrame(posts_tmp, columns=labels)

In [0]:
train_posts = loader(discourse_train)
test_posts = loader(discourse_test)
train_labels = train_posts['discourse_type']
test_labels = test_posts['discourse_type']

In [0]:
combined_pipeline = Pipeline([
        ('union', FeatureUnion(
          transformer_list=[              
            ('body', Pipeline([
              ('selector', ItemSelector(key='body')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
              
            ('author', Pipeline([
              ('selector', ItemSelector(key='author')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),              
            ('title', Pipeline([
              ('selector', ItemSelector(key='title')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),                          
              ('subreddit', Pipeline([
              ('selector', ItemSelector(key='subreddit')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),
              
            ('body_length', Pipeline([
              ('selector', ItemSelector(key='sub_auth')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize_punct)), 
              ])),
            ('Total_comments', call_total_comments),                        
            ('post_size', Pipeline([
              ('selector', ItemSelector(key='post_size')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize)), 
              ])),             
            ('body_punc', Pipeline([
              ('selector', ItemSelector(key='body')),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_normalize_punct)), 
              ])),
             
            ('self_author', call_self_author),
             
        ])
        ),
       #('classifier', LogisticRegression(C=10000, solver = 'saga', multi_class = 'multinomial'))
    ])

In [0]:
train_features = combined_pipeline.fit_transform(train_posts)
test_features = combined_pipeline.transform(test_posts)

Lr = LogisticRegression(C=10000, solver = 'saga', multi_class = 'multinomial')
Lr_model = Lr.fit(train_features,train_labels)
y_pred = Lr.predict(test_features)

precision, recall, fscore, support = score(y_pred, test_labels, average = 'macro')

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: 0.40338121278136785
recall: 0.37622441288268155
fscore: 0.3829147668713699


The __Logistic Regression__ model with the combination of the 8 features scored the following:

| Model | f1score |       
| :- |-------------: |
|Logistic Regression| 0.3829| 

# Eli5 feature weight per discourse type

__Eli5__ is a Python packahe for machine learning that allows to further explain the prediction of a model by explaining weights and showing feature importance.

In [0]:
eli5.show_weights(Lr_model, top=10)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10
+0.967,x45958,,,,,,,,,
+0.935,x66229,,,,,,,,,
+0.301,x38467,,,,,,,,,
+0.299,x39188,,,,,,,,,
+0.298,x43711,,,,,,,,,
+0.258,x45757,,,,,,,,,
+0.111,x37616,,,,,,,,,
+0.110,x41107,,,,,,,,,
… 387 more positive …,… 387 more positive …,,,,,,,,,
… 65834 more negative …,… 65834 more negative …,,,,,,,,,

Weight?,Feature
+0.967,x45958
+0.935,x66229
+0.301,x38467
+0.299,x39188
+0.298,x43711
+0.258,x45757
+0.111,x37616
+0.110,x41107
… 387 more positive …,… 387 more positive …
… 65834 more negative …,… 65834 more negative …

Weight?,Feature
+6.309,x45960
+0.529,x37615
+0.527,x39215
+0.407,x30041
+0.384,x41106
+0.381,x19929
+0.375,x36858
… 40428 more positive …,… 40428 more positive …
… 25793 more negative …,… 25793 more negative …
-1.515,x45959

Weight?,Feature
+4.480,x45960
+1.506,x66229
+0.350,x45962
+0.349,x17807
+0.343,x17798
+0.336,x63817
… 15857 more positive …,… 15857 more positive …
… 50364 more negative …,… 50364 more negative …
-0.347,x30041
-0.417,x37615

Weight?,Feature
+5.274,x45959
+0.335,x45961
+0.220,x12289
… 13907 more positive …,… 13907 more positive …
… 52314 more negative …,… 52314 more negative …
-0.222,x39171
-0.233,x44507
-0.255,x38412
-0.304,x38725
-1.399,x45958

Weight?,Feature
+3.605,x45959
+1.422,x66229
+0.328,x38725
+0.309,x38412
+0.252,x44507
+0.235,x43566
+0.227,x39171
… 6866 more positive …,… 6866 more positive …
… 59355 more negative …,… 59355 more negative …
-0.254,x45961

Weight?,Feature
+2.798,x45958
+0.171,x39215
+0.169,x45984
+0.156,x45961
+0.149,x38454
+0.133,x26640
… 4744 more positive …,… 4744 more positive …
… 61477 more negative …,… 61477 more negative …
-0.173,x45972
-1.096,x45959

Weight?,Feature
+1.650,x45958
+0.857,x66229
+0.169,x37047
+0.166,x38882
+0.162,x37567
+0.160,x17798
… 2725 more positive …,… 2725 more positive …
… 63496 more negative …,… 63496 more negative …
-0.192,x37230
-0.238,x39215

Weight?,Feature
+1.232,x45958
+0.182,x19926
+0.169,x65945
+0.146,x37230
+0.127,x12019
+0.123,x58038
+0.119,x19448
… 1849 more positive …,… 1849 more positive …
… 64372 more negative …,… 64372 more negative …
-0.838,x66229

Weight?,Feature
+0.889,x66229
+0.511,x45958
+0.267,x37768
+0.159,x46345
+0.133,x41418
… 1121 more positive …,… 1121 more positive …
… 65100 more negative …,… 65100 more negative …
-0.126,x19926
-0.132,x38055
-0.251,x39215

Weight?,Feature
+0.117,x37340
+0.117,x22696
+0.117,x39141
+0.117,x20834
… 644 more positive …,… 644 more positive …
… 65577 more negative …,… 65577 more negative …
-0.123,x54710
-0.128,x8691
-0.172,x39215
-0.578,x66229

Weight?,Feature
+0.532,x66229
+0.229,x45958
… 1054 more positive …,… 1054 more positive …
… 65167 more negative …,… 65167 more negative …
-0.112,x19926
-0.119,x45984
-0.147,x37047
-0.164,x45961
-0.169,x54710
-0.173,x8691
