# Automatic Classification for Cognitive Engagement in Reddit Discussion Forums using Natural Language Toolkit

The project aims to change the traditional framework of education system and make improvement to teaching methods for better learning. We can distinguish four levels of cognitive engagement:
        
                            Passive, Active, Constructive, Interactive

## Data Loading

In [1]:
import pandas as pd

pd.set_option("display.max_columns", None)
comments_df = pd.read_csv('Submissions.csv')
comments_df.head(5)

Unnamed: 0,id,kind,category,created_utc,author,author_fullname,name,subreddit_id,subreddit_subscriber,subreddit,title,selftext,upvote_ratio,url,num_comments,ups,downs,total_awards_received,score,created,num_crossposts
0,s2npif,t3,,1642040007,rogmexico,t2_16qf7m,t3_s2npif,t5_2sptq,659490,datascience,"How much of your workload is ""assigned"" to you...",**TLDR my questions:**\n\n* **How much of your...,1.0,https://www.reddit.com/r/datascience/comments/...,1,2,0,0,2,1642040007,0
1,s2nlf6,t3,,1642039703,bikeskata,t2_c7y1n44w,t3_s2nlf6,t5_2sptq,659490,datascience,An approachable introduction to the Bayesian o...,,1.0,https://solomonkurz.netlify.app/post/2021-12-2...,0,2,0,0,2,1642039703,0
2,s2m7g3,t3,,1642035730,i_am_baldilocks,t2_aewcc,t3_s2m7g3,t5_2sptq,659490,datascience,Finding Part-Time DS Work,"Hey guys,\n\nDoes anyone know how to find part...",0.83,https://www.reddit.com/r/datascience/comments/...,8,11,0,0,11,1642035730,0
3,s2l23y,t3,,1642032331,DoctorQuinlan,t2_y5s32,t3_s2l23y,t5_2sptq,659490,datascience,Data Science vs Data Engineer jobs/salary/expe...,"So I currently work as a DB programmer, which ...",0.83,https://www.reddit.com/r/datascience/comments/...,11,8,0,0,8,1642032331,0
4,s2i9br,t3,,1642024923,toomaime,t2_o4jqp,t3_s2i9br,t5_2sptq,659490,datascience,Sports Analytics company Hudl is looking for a...,,0.67,https://sportekjobs.com/data-scientist-applied...,0,1,0,0,1,1642024923,0


In [9]:
comments_df = pd.read_csv('CommentStreams.csv')
comments_df.head(5)

Unnamed: 0,comment_id,author,subreddit,retrieved_on,comment_text,link_id,parent_id,permalink,send_replies
0,"('h91cqc4',)","('Street-Spot5011',)","('AskReddit',)","(1629105358,)","('Personally, I’d talk to him abt it in a CIVI...","('t3_p4u4i2',)","('t3_p4u4i2',)",('/r/AskReddit/comments/p4u4i2/if_you_saw_some...,True
1,"('h91cufe',)","('Puechamp',)","('AskReddit',)","(1629105416,)",('Slap his face real hard. Never burn a book i...,"('t3_p4u4i2',)","('t3_p4u4i2',)",('/r/AskReddit/comments/p4u4i2/if_you_saw_some...,True
2,"('h91cvff',)","('4AcidRayne',)","('AskReddit',)","(1629105429,)","('Add that person to my mental ""let\'s keep an...","('t3_p4u4i2',)","('t3_p4u4i2',)",('/r/AskReddit/comments/p4u4i2/if_you_saw_some...,True
3,"('h91cwch',)","('hate_most_of_you',)","('AskReddit',)","(1629105442,)","(""do notn'tn't"",)","('t3_p4u4i2',)","('t1_h91caod',)",('/r/AskReddit/comments/p4u4i2/if_you_saw_some...,True
4,"('h91cxbj',)","('JacksonBoyd12',)","('AskReddit',)","(1629105455,)",('Cheer them on and give them another Bible to...,"('t3_p4u4i2',)","('t3_p4u4i2',)",('/r/AskReddit/comments/p4u4i2/if_you_saw_some...,True


# Exploratory Data Analysis

In [6]:
import pandas as pd

left = pd.DataFrame({
   'id':[1,2,3,4,5],
   'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
   'subject_id':['sub1','sub2','sub4','sub6','sub5']})
right = pd.DataFrame({
   'id':[1,2,3,4,5],
   'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
   'subject_id':['sub2','sub2','sub2','sub6','sub5']})
print(pd.merge(left, right, on='subject_id', how='left'))

   id_x  Name_x subject_id  id_y Name_y
0     1    Alex       sub1   NaN    NaN
1     2     Amy       sub2   1.0  Billy
2     2     Amy       sub2   2.0  Brian
3     2     Amy       sub2   3.0   Bran
4     3   Allen       sub4   NaN    NaN
5     4   Alice       sub6   4.0  Bryce
6     5  Ayoung       sub5   5.0  Betty


# Feature Engineering

## Discussion Context Features

In [None]:
from textblob import TextBlob



## Doc2Vec

The main objective of doc2vec is to convert sentence or paragraph to vector (numeric) form. In Natural Language Processing Doc2Vec is used to find related sentences for a given sentence (instead of word in Word2Vec).

In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

## Exapmple document (list of sentences)
doc = ["I love data science",
        "I love coding in python",
        "I love building NLP tool",
        "This is a good phone",
        "This is a good TV",
        "This is a good laptop"]

# Tokenization of each document
tokenized_doc = []
for d in doc:
    tokenized_doc.append(word_tokenize(d.lower()))
tokenized_doc

[['i', 'love', 'data', 'science'],
 ['i', 'love', 'coding', 'in', 'python'],
 ['i', 'love', 'building', 'nlp', 'tool'],
 ['this', 'is', 'a', 'good', 'phone'],
 ['this', 'is', 'a', 'good', 'tv'],
 ['this', 'is', 'a', 'good', 'laptop']]

In [2]:
# Convert tokenized document into gensim formated tagged data
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]
tagged_data

[TaggedDocument(words=['i', 'love', 'data', 'science'], tags=[0]),
 TaggedDocument(words=['i', 'love', 'coding', 'in', 'python'], tags=[1]),
 TaggedDocument(words=['i', 'love', 'building', 'nlp', 'tool'], tags=[2]),
 TaggedDocument(words=['this', 'is', 'a', 'good', 'phone'], tags=[3]),
 TaggedDocument(words=['this', 'is', 'a', 'good', 'tv'], tags=[4]),
 TaggedDocument(words=['this', 'is', 'a', 'good', 'laptop'], tags=[5])]

In [12]:
## Train doc2vec model
model = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, workers=4, epochs = 100)
# Save trained doc2vec model
model.save("test_doc2vec.model")
## Load saved doc2vec model
model= Doc2Vec.load("test_doc2vec.model")
## Print model vocabulary
model.wv.index_to_key

['i',
 'love',
 'good',
 'a',
 'is',
 'this',
 'in',
 'data',
 'science',
 'coding',
 'laptop',
 'python',
 'building',
 'tv',
 'tool',
 'phone',
 'nlp']

In [18]:
# find most similar doc 
test_doc = word_tokenize("That is a good computer".lower())
model.dv.most_similar(positive=[model.infer_vector(test_doc)],topn=5)

[(2, 0.5019209980964661),
 (4, 0.47509756684303284),
 (1, 0.42233988642692566),
 (3, 0.2952742278575897),
 (0, 0.18391263484954834)]

## Data Labelling

Natural language processing requires you to first manually identify important sections of text or tag the text with specific labels to generate your training dataset. For example, you may want to identify the sentiment or intent of a text blurb, identify parts of speech, classify proper nouns like places and people, and identify text in images, PDFs, or other files. To do this, you can draw bounding boxes around text and then manually transcribe the text in your training dataset. Natural language processing models are used for sentiment analysis, entity name recognition, and optical character recognition.

## Data Preprocessing

In [19]:
comments_df['comment_id'] = comments_df['comment_id'].str.strip('(\'\,)').astype(str)
comments_df['author'] = comments_df['author'].str.strip('(\'\,)').astype(str)
comments_df['subreddit'] = comments_df['subreddit'].str.strip('(\'\,)').astype(str)
comments_df['retrieved_on'] = comments_df['retrieved_on'].str.strip('(\'\,)').astype(str)
comments_df['comment_text'] = comments_df['comment_text'].str.strip('(\'\,)').astype(str)
comments_df['link_id'] = comments_df['link_id'].str.strip('(\'\,)').astype(str)
comments_df['parent_id'] = comments_df['parent_id'].str.strip('(\'\,)').astype(str)
comments_df['permalink'] = comments_df['permalink'].str.strip('(\'\,)').astype(str)

comments_df.head()

Unnamed: 0,comment_id,author,subreddit,retrieved_on,comment_text,link_id,parent_id,permalink,send_replies
0,h91cqc4,Street-Spot5011,AskReddit,1629105358,"Personally, I’d talk to him abt it in a CIVIL ...",t3_p4u4i2,t3_p4u4i2,/r/AskReddit/comments/p4u4i2/if_you_saw_someon...,True
1,h91cufe,Puechamp,AskReddit,1629105416,Slap his face real hard. Never burn a book in ...,t3_p4u4i2,t3_p4u4i2,/r/AskReddit/comments/p4u4i2/if_you_saw_someon...,True
2,h91cvff,4AcidRayne,AskReddit,1629105429,"Add that person to my mental ""let\'s keep an e...",t3_p4u4i2,t3_p4u4i2,/r/AskReddit/comments/p4u4i2/if_you_saw_someon...,True
3,h91cwch,hate_most_of_you,AskReddit,1629105442,"""do notn'tn't""",t3_p4u4i2,t1_h91caod,/r/AskReddit/comments/p4u4i2/if_you_saw_someon...,True
4,h91cxbj,JacksonBoyd12,AskReddit,1629105455,Cheer them on and give them another Bible to burn,t3_p4u4i2,t3_p4u4i2,/r/AskReddit/comments/p4u4i2/if_you_saw_someon...,True


In [None]:
df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min())

In [1]:
class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        
        flipped = {}
        
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences
    
    def sentences_perm(self):
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

In [2]:
sources = {'TE.txt':'TE', 'cs.txt':'EX'}

sentences = LabeledLineSentence(sources)

In [10]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

# numpy
import numpy

# classifier
from sklearn.linear_model import LogisticRegression

# random
import random

In [11]:
model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)

model.build_vocab(sentences.to_array())

TypeError: __init__() got an unexpected keyword argument 'size'