In [1]:
from stop_words import get_stop_words
from nltk.corpus import stopwords
import pandas as pd
import string
import re

In [2]:
stop_words = list(get_stop_words('en'))         #About 900 stopwords
nltk_words = list(stopwords.words('english'))   #About 150 stopwords
stop_words.extend(nltk_words)

## Prepare arguments for domain 1

In [3]:
df_domain_1 = pd.read_csv('domain_1_arguments_raw.csv', sep='\t', encoding='utf-8', usecols=['sentence', 'annotation'])
# shuffle df
df_domain_1 = df_domain_1.sample(frac=1).reset_index(drop=True)
df_domain_1 = df_domain_1[:6050]

In [4]:
df_domain_1

Unnamed: 0,sentence,annotation
0,First setting up a research reactor adjacent t...,argument
1,"January 8 , 2008 : "" The Three mile Island and...",argument
2,"In other words , people experiment with what i...",argument
3,The second side would counter with gun control...,argument
4,Reverting back to nuclear energy is a grave an...,argument
...,...,...
6045,"Furthermore , post-conviction appeals in death...",argument
6046,Yet the American Psychological Association fou...,argument
6047,But whilst the Irish anti-abortion lobbyists b...,argument
6048,"There are tons of arguments against uniforms ,...",argument


## Prepare arguments for domain 2

In [5]:
df_domain_2 = pd.read_csv('domain_2_arguments_raw.csv', sep='\t', encoding='utf-8', usecols=['sentence', 'annotation'])
# shuffle df
df_domain_2 = df_domain_2.sample(frac=1).reset_index(drop=True)

In [6]:
df_domain_2

Unnamed: 0,sentence,annotation
0,it promotes the communication and interaction ...,argument
1,schools and adults should put a sense of coope...,argument
2,traditional education has some merits like fee...,argument
3,people are now able to overcome long distances...,argument
4,"only under specific circumstance, when student...",argument
...,...,...
6045,We all know that Einstein developed the theory...,argument
6046,it is estimated that China has the largest dep...,argument
6047,they will not have money to pay for everything,argument
6048,the tourism has created threatening pressure o...,argument


## Prepare non_arguments from questions

In [7]:
df_questions = pd.read_csv('non_arguments_questions_raw.csv', sep='\t', encoding='utf-8', usecols=['sentence', 'annotation'])
# shuffle df
df_questions = df_questions.sample(frac=1).reset_index(drop=True)
df_questions_1 = df_questions[:1228]
df_questions_2 = df_questions[1228:]

In [8]:
df_questions_1

Unnamed: 0,sentence,annotation
0,What year were the first xylophone artifacts f...,non_argument
1,Were the treaties signed in 1947 and 1948 with...,non_argument
2,How many municipalities are within Oberland?,non_argument
3,For how many years did Charles-Augustin de Cou...,non_argument
4,Where was Isaac Newton born?,non_argument
...,...,...
1223,Give an example of the largest stadiums anywhe...,non_argument
1224,Who did Millard Fillmore fall in love with?,non_argument
1225,What religion do most Uruguayans profess?,non_argument
1226,How many arms does an octopus have?,non_argument


In [9]:
df_questions_2

Unnamed: 0,sentence,annotation
1228,Can turtles take many years to reach breeding ...,non_argument
1229,"When did Baldassare Castiglione, author of Il ...",non_argument
1230,Where do leopards often hide their kills?,non_argument
1231,"In 1820, Avogadro became a professor of physic...",non_argument
1232,Did the Department of the Interior not charge ...,non_argument
...,...,...
2451,When did Lincoln first serve as President?,non_argument
2452,Who helped to fund Roosevelt's African safari?,non_argument
2453,What did The Legal Tender Act of 1862 establish?,non_argument
2454,What does the PRC government classify as liter...,non_argument


## Prepare non_arguments from random sentences

In [10]:
df_random_sents = pd.read_csv('non_arguments_random_raw.csv', sep='\t', encoding='utf-8', usecols=['sentence', 'annotation'])
# shuffle df
df_random_sents = df_random_sents.sample(frac=1).reset_index(drop=True)
df_random_sents_1 = df_random_sents[:4822]
df_random_sents_2 = df_random_sents[4823:9645]

In [11]:
df_random_sents_1

Unnamed: 0,sentence,annotation
0,"Citizens express Gravetye Manor, the home of W...",non_argument
1,Certain words advanced their own country. The ...,non_argument
2,"Unification process, from Judaea via Egypt bet...",non_argument
3,"To puberty, this belief, lasted as late as the...",non_argument
4,"1935 alone, Fossil remains of more moderate co...",non_argument
...,...,...
4817,"One entity rose. In 1935, the regime to such g...",non_argument
4818,Yellowstone offers these taxes are assessed in...,non_argument
4819,Wealthy. \n With tensions mounting between Ger...,non_argument
4820,"Bushmen in rate, controlled inflation and publ...",non_argument


In [12]:
df_random_sents_2

Unnamed: 0,sentence,annotation
4823,Those experiments all candidates in the late 1...,non_argument
4824,Japan \n disproportionately affected by the oc...,non_argument
4825,Violations occur Spanish missionaries began se...,non_argument
4826,With issues exceptions. There is varying and s...,non_argument
4827,(OWL). \n platforms have been situations where...,non_argument
...,...,...
9640,Store water. properties which determine climat...,non_argument
9641,Satellite TV self-governing Dominion to the gr...,non_argument
9642,"Previous scholars, including Lugdunum (present...",non_argument
9643,"Peru, and predict how long it will occur less ...",non_argument


## Concatenate arguments with non_arguments

### Create raw domain 1 dataframe

In [13]:
df_domain_1 = pd.concat([df_domain_1, df_questions_1, df_random_sents_1], ignore_index=True)
# shuffle df
df_domain_1 = df_domain_1.sample(frac=1).reset_index(drop=True)

In [14]:
df_domain_1

Unnamed: 0,sentence,annotation
0,"The model monthly Notizie scritte, which cost ...",non_argument
1,Are trumpets constructed of brass?,non_argument
2,"According to proponents of these policies , un...",argument
3,Cantons are The Abrahamic religions of Judaism...,non_argument
4,"Historians sympathized the Millau Viaduct, the...",non_argument
...,...,...
12095,Cooling those operations often requires a whol...,argument
12096,"Connect with on cheap, off-white paper known a...",non_argument
12097,The continent ) is equal to zero the chemical ...,non_argument
12098,Are meant view Penrose discusses in his long c...,non_argument


In [15]:
df_domain_1['annotation'].value_counts()

non_argument    6050
argument        6050
Name: annotation, dtype: int64

### Create raw domain 2 dataframe

In [16]:
df_domain_2 = pd.concat([df_domain_2, df_questions_2, df_random_sents_2], ignore_index=True)
# shuffle df
df_domain_2 = df_domain_2.sample(frac=1).reset_index(drop=True)

In [17]:
df_domain_2

Unnamed: 0,sentence,annotation
0,Teachers will not find it difficult to make st...,argument
1,Were present of systematic nursing and hospita...,non_argument
2,The distance fact. A form of data being transf...,non_argument
3,"when students finished their high schools, the...",argument
4,Strategic bombing sailors sometimes also other...,non_argument
...,...,...
12095,For all of them we have to ask for a permissio...,argument
12096,children should learn to manage their own mone...,argument
12097,if a low-level employee desire to enhance his ...,argument
12098,Are among his most famous works the proto-Cubi...,non_argument


In [18]:
df_domain_2['annotation'].value_counts()

argument        6050
non_argument    6050
Name: annotation, dtype: int64

## Preprocessing
- delete numbers
- handle apostrophes like in don't -> do not
- delete punctuation
- delete single characters
- delete multiple spaces
- convert to lower case
- tokenization
- stop word removal

### preprocess domain 1 dataframe

In [19]:
sentences = df_domain_1['sentence']
preprocessend_sents = []

for sent in sentences:
    # remove numbers
    sent = ''.join(i for i in sent if not i.isdigit())
    
    # handle apostrophes like in don't -> do not
    sent = sent.replace("n't", " not")
    
    # remove punctuation
    sent = sent.translate(str.maketrans('', '', string.punctuation))
    
    # delete singele characters and 2 character words
    sent = ' '.join([w for w in sent.split() if len(w)>2])
    
    # delete multiple spaces
    sent = re.sub(' +', ' ', sent)
    
    # convert to lower case
    sent = sent.lower()
    
    # tokenization
    word_list = sent.split()
    
    # stop word removal
    word_list = [w for w in word_list if not w in stop_words]
    
    # concatenate list back to string
    sent = ' '.join(word_list)
    
    preprocessend_sents.append(sent)
    
df_domain_1['sentence'] = preprocessend_sents

In [20]:
df_domain_1

Unnamed: 0,sentence,annotation
0,model monthly notizie scritte cost around bill...,non_argument
1,trumpets constructed brass,non_argument
2,according proponents policies uniforms improve...,argument
3,cantons abrahamic religions judaism christiani...,non_argument
4,historians sympathized millau viaduct worlds p...,non_argument
...,...,...
12095,cooling operations often requires whole lot wa...,argument
12096,connect cheap offwhite paper known rhône valle...,non_argument
12097,continent equal zero chemical formula often,non_argument
12098,meant view penrose discusses long career also ...,non_argument


In [21]:
df_domain_1['annotation'].value_counts()

non_argument    6050
argument        6050
Name: annotation, dtype: int64

In [22]:
df_domain_1.to_csv("domain_1_set_clean.csv", sep='\t', encoding='utf-8')

In [23]:
len(df_domain_1)

12100

In [24]:
df_domain_1 = df_domain_1[df_domain_1['sentence'].notna()]

In [25]:
len(df_domain_1)

12100

### preprocess domain 2 dataframe

In [26]:
sentences = df_domain_2['sentence']
preprocessend_sents = []

for sent in sentences:
    # remove numbers
    sent = ''.join(i for i in sent if not i.isdigit())
    
    # handle apostrophes like in don't -> do not
    sent = sent.replace("n't", " not")
    
    # remove punctuation
    sent = sent.translate(str.maketrans('', '', string.punctuation))
    
    # delete singele characters and 2 character words
    sent = ' '.join([w for w in sent.split() if len(w)>2])
    
    # delete multiple spaces
    sent = re.sub(' +', ' ', sent)
    
    # convert to lower case
    sent = sent.lower()
    
    # tokenization
    word_list = sent.split()
    
    # stop word removal
    word_list = [w for w in word_list if not w in stop_words]
    
    # concatenate list back to string
    sent = ' '.join(word_list)
    
    preprocessend_sents.append(sent)
    
df_domain_2['sentence'] = preprocessend_sents

In [27]:
df_domain_2

Unnamed: 0,sentence,annotation
0,teachers find difficult make students learn ne...,argument
1,present systematic nursing hospitals primary p...,non_argument
2,distance fact form data transferred computer,non_argument
3,students finished high schools confused subjec...,argument
4,strategic bombing sailors sometimes also symbo...,non_argument
...,...,...
12095,ask permission time allowed,argument
12096,children learn manage money young age,argument
12097,lowlevel employee desire enhance position fair...,argument
12098,among famous works protocubist les demoiselles...,non_argument


In [28]:
df_domain_2['annotation'].value_counts()

argument        6050
non_argument    6050
Name: annotation, dtype: int64

In [29]:
df_domain_2.to_csv("domain_2_set_clean.csv", sep='\t', encoding='utf-8')

In [30]:
len(df_domain_2)

12100

In [31]:
df_domain_2 = df_domain_2[df_domain_2['sentence'].notna()]

In [32]:
len(df_domain_2)

12100