In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
# from sklearn.naive_bayes import GaussianNB
# from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

### Handle nan sentences

In [2]:
domain_1 = pd.read_csv('domain_1_set_clean.csv', sep='\t', encoding='utf-8', usecols=['sentence', 'annotation'])
domain_2 = pd.read_csv('domain_2_set_clean.csv', sep='\t', encoding='utf-8', usecols=['sentence', 'annotation'])

In [3]:
domain_1

Unnamed: 0,sentence,annotation
0,model monthly notizie scritte cost around bill...,non_argument
1,trumpets constructed brass,non_argument
2,according proponents policies uniforms improve...,argument
3,cantons abrahamic religions judaism christiani...,non_argument
4,historians sympathized millau viaduct worlds p...,non_argument
...,...,...
12095,cooling operations often requires whole lot wa...,argument
12096,connect cheap offwhite paper known rhône valle...,non_argument
12097,continent equal zero chemical formula often,non_argument
12098,meant view penrose discusses long career also ...,non_argument


In [4]:
# drop nan sentences
domain_1 = domain_1[domain_1['sentence'].notna()]

In [5]:
# drop two arguments and two non-arguments
domain_1 = domain_1.drop(domain_1.index[[0,1,2,12095]])

In [6]:
domain_1["annotation"].value_counts()

non_argument    6048
argument        6048
Name: annotation, dtype: int64

In [7]:
len(domain_1)

12096

In [8]:
# drop nan sentences
domain_2 = domain_2[domain_2['sentence'].notna()]

In [9]:
# drop two arguments
domain_2 = domain_2.drop(domain_2.index[[0,3]])

In [10]:
domain_2["annotation"].value_counts()

non_argument    6048
argument        6048
Name: annotation, dtype: int64

In [11]:
len(domain_2)

12096

### Baseline

In [12]:
X_1 = domain_1['sentence']
y_1 = domain_1['annotation']

X_2 = domain_2['sentence']
y_2 = domain_2['annotation']

In [13]:
# Vectorize text reviews to numbers
vec = CountVectorizer(stop_words='english')
X_1 = vec.fit_transform(X_1).toarray()
X_2 = vec.fit_transform(X_2).toarray()

In [14]:
print(X_1.shape)
print(X_2.shape)

print(y_1.shape)
print(y_2.shape)

(12096, 18638)
(12096, 15663)
(12096,)
(12096,)


In [15]:
length_1 = len(X_1[0])
length_2 = len(X_2[0])

In [16]:
X_2 = np.pad(X_2,((0, 0), (0, abs(length_1 - length_2))), mode = 'constant', constant_values=9)
# mode='constant', constant_values=0

In [17]:
print(X_1.shape)
print(X_2.shape)

print(y_1.shape)
print(y_2.shape)

(12096, 18638)
(12096, 18638)
(12096,)
(12096,)


### Train on Domain 1 and test on Domain 2

In [18]:
# training the model on training set from domain 1
model = MultinomialNB()
model.fit(X_1, y_1)

MultinomialNB()

In [19]:
# making predictions on the testing set from domain 2
y_pred = model.predict(X_2)

In [20]:
# compare predictions with true labels from domain 2
print("Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_2, y_pred)*100)

Naive Bayes model accuracy(in %): 50.0


In [21]:
print(confusion_matrix(y_2, y_pred))

[[   0 6048]
 [   0 6048]]


In [22]:
print(classification_report(y_2, y_pred))

              precision    recall  f1-score   support

    argument       0.00      0.00      0.00      6048
non_argument       0.50      1.00      0.67      6048

    accuracy                           0.50     12096
   macro avg       0.25      0.50      0.33     12096
weighted avg       0.25      0.50      0.33     12096



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Train on Domain 2 and test on Domain 1

In [23]:
# training the model on training set from domain 2
model = MultinomialNB()
model.fit(X_2, y_2)

MultinomialNB()

In [24]:
# making predictions on the testing set from domain 2
y_pred = model.predict(X_1)

In [25]:
# compare predictions with true labels from domain 1
print("Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_1, y_pred)*100)

Naive Bayes model accuracy(in %): 46.676587301587304


In [26]:
print(confusion_matrix(y_1, y_pred))

[[1064 4984]
 [1466 4582]]


In [27]:
print(classification_report(y_1, y_pred))

              precision    recall  f1-score   support

    argument       0.42      0.18      0.25      6048
non_argument       0.48      0.76      0.59      6048

    accuracy                           0.47     12096
   macro avg       0.45      0.47      0.42     12096
weighted avg       0.45      0.47      0.42     12096

