In [1]:
import pandas as pd
import spacy
from string import punctuation
from spacy.lang.en import English
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
# Load the annotated comment dataset from a CSV in the current working directory.
# Later cells read the columns 'is_non_argumentative', 'is_against',
# 'argu_part' and 'Text Content' from this frame.
df = pd.read_csv('Preprocessed_produced_new.csv')

In [3]:
# Full spaCy English model.
# NOTE(review): `eng` is not referenced in any of the visible cells - confirm
# it is still needed before paying the model-loading cost.
eng = spacy.load('en')

# Bare English language object; word_tokenize calls it to split text into tokens.
parser = English()

# WordNet lemmatizer applied by word_tokenize to every token it keeps.
lemmatizer = WordNetLemmatizer()

# Tokens that word_tokenize discards: single punctuation characters,
# contraction fragments / dashes / ellipses, and NLTK's English stop words.
stop_words = (
    list(punctuation)
    + ["'s","'m","n't","'re","-","'ll",'...']
    + stopwords.words('english')
)

In [4]:
def word_tokenize(line):
    """Split ``line`` into lower-cased, lemmatized tokens with stop words removed.

    The text is tokenized with the module-level spaCy ``parser``. Tokens that
    consist only of whitespace, and tokens whose lower-cased form appears in
    the module-level ``stop_words`` collection, are dropped. Every remaining
    token is lower-cased and passed through ``lemmatizer.lemmatize``.

    Parameters
    ----------
    line : str
        Raw text to tokenize.

    Returns
    -------
    list
        The surviving tokens, in their original order.
    """
    line_tokens = []
    for token in parser(line):
        # Whitespace-only tokens carry no lexical content.
        if token.orth_.isspace():
            continue
        lowered = token.lower_
        # Compare the lower-cased form: the stop-word entries are lower-case,
        # so checking the original casing would let "The" / "And" slip through
        # whenever this function is called on text that is not already
        # lower-cased.
        if lowered in stop_words:
            continue
        line_tokens.append(lemmatizer.lemmatize(lowered))
    return line_tokens

In [5]:
# First-level split on the 'is_non_argumentative' flag. Rows whose flag is
# neither True nor False end up in neither frame.
df_non_argumentative = df[df['is_non_argumentative'].eq(True)]
df_argumentative = df[df['is_non_argumentative'].eq(False)]

# Stance split, restricted to the argumentative rows.
df_against = df_argumentative[df_argumentative['is_against'].eq(True)]
df_support = df_argumentative[df_argumentative['is_against'].eq(False)]

# Argument-component split, restricted to the argumentative rows:
# 'argu_part' code 1 -> claim, 2 -> warrant, 3 -> ground.
df_claim = df_argumentative[df_argumentative['argu_part'].eq(1)]
df_warrant = df_argumentative[df_argumentative['argu_part'].eq(2)]
df_ground = df_argumentative[df_argumentative['argu_part'].eq(3)]

In [6]:
def _join_text(frame):
    """Concatenate the "Text Content" column of ``frame`` into one space-separated string."""
    return frame["Text Content"].str.cat(sep=' ')


# One large text blob per category; each blob is later fed to the TF-IDF
# vectorizer as a single document.
non_argumentative_str = _join_text(df_non_argumentative)
argumentative_str = _join_text(df_argumentative)
against_str = _join_text(df_against)
support_str = _join_text(df_support)
claim_str = _join_text(df_claim)
warrant_str = _join_text(df_warrant)
ground_str = _join_text(df_ground)

In [7]:
# Character length of each category blob, shown one value per output in the
# same order as before: non-argumentative, argumentative, against, support,
# claim, warrant, ground.
display(
    len(non_argumentative_str),
    len(argumentative_str),
    len(against_str),
    len(support_str),
    len(claim_str),
    len(warrant_str),
    len(ground_str),
)

175213

289775

72877

216897

57443

182322

50008

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over unigrams and bigrams, tokenized with word_tokenize
# and L2-normalised per document.
tfidfer = TfidfVectorizer(
    tokenizer=word_tokenize,
    ngram_range=(1, 2),
    analyzer='word',
    norm='l2',
)

# Each category blob is treated as one document. The row order of the result
# follows this list: 0 non-argumentative, 1 argumentative, 2 against,
# 3 support, 4 claim, 5 warrant, 6 ground.
tfidf_result = tfidfer.fit_transform([
    non_argumentative_str,
    argumentative_str,
    against_str,
    support_str,
    claim_str,
    warrant_str,
    ground_str,
])

In [9]:
# Densify the sparse TF-IDF matrix once and unpack its rows, instead of
# calling .toarray() separately for every category (each call rebuilds the
# whole dense matrix). Row order matches the document order passed to
# fit_transform; unpacking also fails loudly if the matrix does not have
# exactly seven rows.
(
    non_argumentative_tfidf,
    argumentative_tfidf,
    against_tfidf,
    support_tfidf,
    claim_tfidf,
    warrant_tfidf,
    ground_tfidf,
) = tfidf_result.toarray()

In [10]:
# Feature indices of the 20 highest TF-IDF scores per category, best first:
# sort ascending, flip to descending, keep the leading 20.
nlarge_idx_non_argumentative = non_argumentative_tfidf.argsort()[::-1][:20]
nlarge_idx_argumentative = argumentative_tfidf.argsort()[::-1][:20]
nlarge_idx_against = against_tfidf.argsort()[::-1][:20]
nlarge_idx_support = support_tfidf.argsort()[::-1][:20]
nlarge_idx_claim = claim_tfidf.argsort()[::-1][:20]
nlarge_idx_warrant = warrant_tfidf.argsort()[::-1][:20]
nlarge_idx_ground = ground_tfidf.argsort()[::-1][:20]

In [11]:
# Vocabulary terms, positioned so that feature_names[i] labels column i of the
# TF-IDF matrix. get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() when it exists and fall
# back to the old method on older installations. The result is kept as a
# plain list in both cases.
if hasattr(tfidfer, 'get_feature_names_out'):
    feature_names = list(tfidfer.get_feature_names_out())
else:
    feature_names = tfidfer.get_feature_names()

In [12]:
# Size of the fitted vocabulary (unigram and bigram features combined).
len(feature_names)

38448

# 1. Non-argumentative

In [13]:
# Top-20 non-argumentative terms as "term: score", highest TF-IDF first.
for idx in nlarge_idx_non_argumentative:
    print(": ".join((feature_names[idx], str(non_argumentative_tfidf[idx]))))

screen_name: 0.346717800899
tab: 0.226493671795
plus_one: 0.200766437866
file: 0.174969045036
would: 0.159941028898
issue: 0.152427020829
feature: 0.15028016138
thanks: 0.149904831072
plus_one plus_one: 0.148568119272
code_segment: 0.140619293863
folder: 0.135252145242
editor: 0.130958426346
work: 0.130958426346
open: 0.128811566897
like: 0.115930410208
one: 0.107342972415
comment: 0.0987555346214
workspace: 0.0987555346214
vscode: 0.0987555346214
code: 0.0955352454489


# 2. Argumentative

In [14]:
# Top-20 argumentative terms as "term: score", highest TF-IDF first.
for idx in nlarge_idx_argumentative:
    print(": ".join((feature_names[idx], str(argumentative_tfidf[idx]))))

file: 0.394989105252
tab: 0.351768412884
folder: 0.24311639457
would: 0.20409771396
project: 0.198695127414
open: 0.184888517352
editor: 0.163278171168
like: 0.152472998076
code: 0.148270986318
one: 0.140467250196
working: 0.129662077104
use: 0.1200574788
vscode: 0.11105316789
want: 0.110452880496
code_segment: 0.104450006556
feature: 0.104450006556
window: 0.101448569586
workspace: 0.0966462704339
work: 0.0948454082519
setting: 0.091243683888


# 3. Claim

In [15]:
# Top-20 claim terms as "term: score", highest TF-IDF first.
for idx in nlarge_idx_claim:
    print(": ".join((feature_names[idx], str(claim_tfidf[idx]))))

tab: 0.364088821694
would: 0.319506516997
file: 0.292261775237
folder: 0.245202675835
like: 0.193189987021
project: 0.168422039967
please: 0.15603806644
open: 0.141177298208
feature: 0.121362940565
code_segment: 0.116409351154
one: 0.116409351154
screen_name: 0.111455761743
root: 0.108978967038
add: 0.106502172332
setting: 0.106502172332
workspace: 0.101548582921
option: 0.099071788216
make: 0.0916414040998
window: 0.0916414040998
think: 0.0891646093944


# 4. Warrant

In [16]:
# Top-20 warrant terms as "term: score", highest TF-IDF first.
for idx in nlarge_idx_warrant:
    print(": ".join((feature_names[idx], str(warrant_tfidf[idx]))))

file: 0.371186438869
tab: 0.354536915225
folder: 0.198814899974
would: 0.18706229505
project: 0.172371538894
editor: 0.169433387663
code: 0.167474620176
like: 0.156701398995
open: 0.149845712789
one: 0.139072491608
want: 0.138093107864
use: 0.136134340377
working: 0.134175572889
vscode: 0.121443584221
feature: 0.111649746784
work: 0.10283529309
think: 0.100876525603
v: 0.0989177581154
need: 0.0979383743717
people: 0.0959796068842


# 5. Ground

In [17]:
# Top-20 ground terms as "term: score", highest TF-IDF first.
for idx in nlarge_idx_ground:
    print(": ".join((feature_names[idx], str(ground_tfidf[idx]))))

file: 0.439988281291
folder: 0.281483186167
open: 0.267818953829
project: 0.237757642685
tab: 0.210429178009
editor: 0.177635020397
code: 0.163970788059
sublime: 0.15030655572
working: 0.133909476915
one: 0.122978091044
use: 0.114779551641
atom: 0.112046705173
window: 0.101115319303
vscode: 0.0983824728352
v: 0.0929167798999
code_segment: 0.0901839334322
work: 0.0874510869646
multiple: 0.0847182404969
git: 0.0819853940293
time: 0.0792525475617


# 6. Support

In [18]:
# Top-20 support terms as "term: score", highest TF-IDF first.
for idx in nlarge_idx_support:
    print(": ".join((feature_names[idx], str(support_tfidf[idx]))))
    

file: 0.343190192665
tab: 0.339190306969
folder: 0.25199279881
project: 0.224793576081
would: 0.220793690386
open: 0.175994970597
like: 0.167195222067
editor: 0.166395244928
code: 0.156795519259
one: 0.143195907895
vscode: 0.127996342253
use: 0.125596410835
feature: 0.123996456557
window: 0.115196708027
want: 0.110396845193
working: 0.108796890915
work: 0.0999971423848
v: 0.0951972795503
need: 0.0935973252722
multiple: 0.091997370994


# 7. Against

In [19]:
# Top-20 against terms as "term: score", highest TF-IDF first.
for idx in nlarge_idx_against:
    print(": ".join((feature_names[idx], str(against_tfidf[idx]))))

file: 0.488761317859
tab: 0.345761281629
folder: 0.192089600905
open: 0.187820943107
working: 0.170746311916
code_segment: 0.149403022926
would: 0.136597049533
editor: 0.136597049533
workspace: 0.132328391735
working file: 0.125925405038
setting: 0.123791076139
one: 0.117388089442
code: 0.108850773846
project: 0.106716444947
want: 0.0981791293515
like: 0.0960448004525
use: 0.0917761426547
make: 0.0853731559578
user: 0.0853731559578
thing: 0.0811044981599
