In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import numpy as np
import nltk
import pandas as pd

data = pd.read_csv('/content/gdrive/My Drive/NLP/stance-data.csv')

In [0]:
target_names = ['pro', 'con']
k = 5

In [0]:
topic = "gay rights"
data = data[data.topic=="gay rights"][["post_text","label"]]
np.random.seed(10)
shuffled_data = np.random.permutation(data)

In [0]:
np.shape(shuffled_data)

(1359, 2)

In [0]:
test_size = len(shuffled_data)//k
for i in range(k):
  print('fold: %d' %i)
  test_data = shuffled_data[i*test_size:(i+1)*test_size]
  train_data = np.vstack((shuffled_data[:i*test_size],shuffled_data[(i+1)*test_size:]))
  print(np.shape(test_data), np.shape(train_data))


fold: 0
0 271 (271, 2) (1088, 2)
fold: 1
271 542 (271, 2) (1088, 2)
fold: 2
542 813 (271, 2) (1088, 2)
fold: 3
813 1084 (271, 2) (1088, 2)
fold: 4
1084 1355 (271, 2) (1088, 2)


In [0]:
# Extract features:
from sklearn.feature_extraction.text import CountVectorizer
#t0 = time()
num_features = 100
cv = CountVectorizer(stop_words='english', ngram_range=(1,3), 
                     max_features=num_features)
X_train = cv.fit_transform(train_data[:,0])
X_test = cv.fit_transform(test_data[:,0])

y_train = train_data[:,1]
y_test = test_data[:,1]

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.svm import LinearSVC
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
from sklearn.model_selection import GridSearchCV
parameters = {
    'vect__ngram_range':[(1,1),(1,2),(1,3),(1,4)],
    'tfidf__use_idf':(True,False),
    'clf__alpha':(1e-2,1e-3),
}

In [0]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1)

In [0]:
gs_clf = gs_clf.fit(train_data[:,0],train_data[:,1])

In [0]:
gs_clf.best_score_

0.5789593334526382

In [0]:
for param_name in sorted(parameters.keys()):
  print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.01
tfidf__use_idf: False
vect__ngram_range: (1, 1)


In [0]:
num_features = 10000
def evaluate(clf, train, test):
  cv = CountVectorizer(stop_words='english', ngram_range=(1,3),max_features=num_features)
  trainX = cv.fit_transform(train[:,0])
  testX = cv.fit_transform(test[:,0])
  trainY = train[:,1]
  testY = test[:,1]
  print("X train shape:", np.shape(trainX),"Y train shape", np.shape(trainY),
       "X test shape:", np.shape(testX),"Y test shape", np.shape(testY))
  
  clf.fit(trainX, trainY)
  pred = clf.predict(testX)
  score = metrics.accuracy_score(testY, pred)
  print("accuracy:   %0.3f" % score)
  print("classification report:")
  print(metrics.classification_report(testY, pred,target_names=target_names))
  return score

In [0]:
clf = BernoulliNB(alpha=.01)
test_size = len(shuffled_data)//k
score = 0
for i in range(k):
  print('fold: %d' %i)
  test_data = shuffled_data[i*test_size:(i+1)*test_size]
  train_data = np.vstack((shuffled_data[:i*test_size],shuffled_data[(i+1)*test_size:]))
  print(np.shape(test_data), np.shape(train_data))
  score += evaluate(clf, train_data, test_data)
print(score/5)

fold: 0
(271, 2) (1088, 2)
X train shape: (1088, 10000) Y train shape (1088,) X test shape: (271, 10000) Y test shape (271,)
accuracy:   0.635
classification report:
              precision    recall  f1-score   support

         pro       0.45      0.21      0.29        95
         con       0.67      0.86      0.75       176

    accuracy                           0.63       271
   macro avg       0.56      0.54      0.52       271
weighted avg       0.59      0.63      0.59       271

fold: 1
(271, 2) (1088, 2)
X train shape: (1088, 10000) Y train shape (1088,) X test shape: (271, 10000) Y test shape (271,)
accuracy:   0.590
classification report:
              precision    recall  f1-score   support

         pro       0.38      0.38      0.38        90
         con       0.69      0.70      0.69       181

    accuracy                           0.59       271
   macro avg       0.54      0.54      0.54       271
weighted avg       0.59      0.59      0.59       271

fold: 2
(271, 

In [0]:
clf = MultinomialNB(alpha=.01)
test_size = len(shuffled_data)//k
score = 0
for i in range(k):
  print('fold: %d' %i)
  test_data = shuffled_data[i*test_size:(i+1)*test_size]
  train_data = np.vstack((shuffled_data[:i*test_size],shuffled_data[(i+1)*test_size:]))
  print(np.shape(test_data), np.shape(train_data))
  score += evaluate(clf, train_data, test_data)
print(score/5)

fold: 0
(271, 2) (1088, 2)
X train shape: (1088, 10000) Y train shape (1088,) X test shape: (271, 10000) Y test shape (271,)
accuracy:   0.638
classification report:
              precision    recall  f1-score   support

         pro       0.46      0.20      0.28        95
         con       0.67      0.88      0.76       176

    accuracy                           0.64       271
   macro avg       0.57      0.54      0.52       271
weighted avg       0.60      0.64      0.59       271

fold: 1
(271, 2) (1088, 2)
X train shape: (1088, 10000) Y train shape (1088,) X test shape: (271, 10000) Y test shape (271,)
accuracy:   0.587
classification report:
              precision    recall  f1-score   support

         pro       0.39      0.41      0.40        90
         con       0.70      0.67      0.69       181

    accuracy                           0.59       271
   macro avg       0.54      0.54      0.54       271
weighted avg       0.59      0.59      0.59       271

fold: 2
(271, 

In [0]:
clf = BernoulliNB(alpha=.01)
test_size = len(shuffled_data)//k
score = 0
for i in range(k):
  print('fold: %d' %i)
  test_data = shuffled_data[i*test_size:(i+1)*test_size]
  train_data = np.vstack((shuffled_data[:i*test_size],shuffled_data[(i+1)*test_size:]))
  print(np.shape(test_data), np.shape(train_data))
  score += evaluate(clf, train_data, test_data)
print(score/5)

# Test method2

In [0]:
topic = "gay rights"
data = pd.read_csv('/content/gdrive/My Drive/NLP/stance-data.csv')
data = data[data.topic==topic][["post_text","label","word_count","words_pronom",
                                      "words_per_sen","words_over_6","pos_emo","neg_emo"]]
np.random.seed(10)
shuffled_data = np.random.permutation(data)
np.shape(shuffled_data)

(1359, 8)

In [0]:
num_features = 10000
from scipy.sparse import hstack
def evaluate_2(clf, train, test, method2 = False):
  cv = CountVectorizer(stop_words='english', ngram_range=(1,3),max_features=num_features)
  if method2:
    trainX = hstack((cv.fit_transform(train[:,0]), train[:,2:].astype(float)))
    testX = hstack((cv.fit_transform(test[:,0]), test[:,2:].astype(float)))
  else:
    trainX = cv.fit_transform(train[:,0])
    testX = cv.fit_transform(test[:,0])
  trainY = train[:,1]
  testY = test[:,1]
  print("X train shape:", np.shape(trainX),"Y train shape", np.shape(trainY),
       "X test shape:", np.shape(testX),"Y test shape", np.shape(testY))
  
  clf.fit(trainX, trainY)
  pred = clf.predict(testX)
  score = metrics.accuracy_score(testY, pred)
  print("accuracy:   %0.3f" % score)
  print("classification report:")
  print(metrics.classification_report(testY, pred,target_names=target_names))
  return score

In [0]:
clf = BernoulliNB(alpha=.01)
test_size = len(shuffled_data)//k
score_BNB, score_MNB, score_LSV = 0,0,0
for i in range(k):
  print('fold: %d' %i)
  test_data = shuffled_data[i*test_size:(i+1)*test_size]
  train_data = np.vstack((shuffled_data[:i*test_size],shuffled_data[(i+1)*test_size:]))
  print(np.shape(test_data), np.shape(train_data))
  score_BNB += evaluate(BernoulliNB(alpha=.01), train_data, test_data)
  score_MNB += evaluate(MultinomialNB(alpha=.01), train_data, test_data)
  score_LSV += evaluate(LinearSVC(penalty='l1', dual=False,tol=1e-3), train_data, test_data)
print(score_BNB/5, score_MNB/5, score_LSV/5)

fold: 0
(271, 8) (1088, 8)
X train shape: (1088, 10000) Y train shape (1088,) X test shape: (271, 10000) Y test shape (271,)
accuracy:   0.635
classification report:
              precision    recall  f1-score   support

         pro       0.45      0.21      0.29        95
         con       0.67      0.86      0.75       176

    accuracy                           0.63       271
   macro avg       0.56      0.54      0.52       271
weighted avg       0.59      0.63      0.59       271

X train shape: (1088, 10000) Y train shape (1088,) X test shape: (271, 10000) Y test shape (271,)
accuracy:   0.638
classification report:
              precision    recall  f1-score   support

         pro       0.46      0.20      0.28        95
         con       0.67      0.88      0.76       176

    accuracy                           0.64       271
   macro avg       0.57      0.54      0.52       271
weighted avg       0.60      0.64      0.59       271

X train shape: (1088, 10000) Y train shap



accuracy:   0.613
classification report:
              precision    recall  f1-score   support

         pro       0.44      0.35      0.39        96
         con       0.68      0.75      0.72       175

    accuracy                           0.61       271
   macro avg       0.56      0.55      0.55       271
weighted avg       0.60      0.61      0.60       271

fold: 3
(271, 8) (1088, 8)
X train shape: (1088, 10000) Y train shape (1088,) X test shape: (271, 10000) Y test shape (271,)
accuracy:   0.561
classification report:
              precision    recall  f1-score   support

         pro       0.42      0.23      0.30       109
         con       0.60      0.78      0.68       162

    accuracy                           0.56       271
   macro avg       0.51      0.51      0.49       271
weighted avg       0.53      0.56      0.53       271

X train shape: (1088, 10000) Y train shape (1088,) X test shape: (271, 10000) Y test shape (271,)
accuracy:   0.572
classification report:




In [0]:
clf = BernoulliNB(alpha=.01)
test_size = len(shuffled_data)//k
score_BNB, score_MNB, score_LSV = 0,0,0
for i in range(k):
  print('fold: %d' %i)
  test_data = shuffled_data[i*test_size:(i+1)*test_size]
  train_data = np.vstack((shuffled_data[:i*test_size],shuffled_data[(i+1)*test_size:]))
  print(np.shape(test_data), np.shape(train_data))
  score_LSV += evaluate(LinearSVC(penalty='l1', dual=False,tol=1e-3), train_data, test_data)
print(score_BNB/5, score_MNB/5, score_LSV/5)