In [0]:
import numpy as np
import pandas as pd
np.random.seed(4705)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from scipy.sparse import hstack, vstack

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
def load_data(filename, topic):
	raw = pd.read_csv(filename)
	print("Loading %s with %d records" %(filename,len(raw)))
	print("Column names: ", raw.columns)

	selected = raw[raw.topic == topic]
	selected = np.random.permutation(selected)
	return selected

In [91]:
run(data,(500,1), method=1, NB=True, search=False)

The averaged accuracy score of 5-fold cv is 0.6083 and f1 score is 0.61.


In [104]:
run(data,(3000,1e-3), method=1, NB=True, search=False)

The averaged accuracy score of 5-fold cv is 0.5491 and f1 score is 0.55.


In [111]:
run(data,(500,1), method=1, NB=True, search=False)

The averaged accuracy score of 5-fold cv is 0.6106 and f1 score is 0.61.


In [0]:
MySearch(data, NB=False, method=1)

In [113]:
run(data,(50, 'squared_hinge', 1, 3000, None), method=1, NB=False, search=False)



The averaged accuracy score of 5-fold cv is 0.6118 and f1 score is 0.61.




In [117]:
runold(data,(500, 'hinge', 1, 1000, None), method=1, NB=False, search=False)



The averaged accuracy score of 5-fold cv is 0.5930 and f1 score is 0.59.




In [0]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction import stop_words
def run(selected, paras, method=1, NB=True, search=False):
  kfolds = 5
  score,f1 = 0,0

  cv = CountVectorizer(stop_words='english', ngram_range=(1,3))#,token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b')
  exf_k = paras[0]
  kBest = SelectKBest(chi2, k=exf_k)
  if NB:
    clf = MultinomialNB(alpha=paras[1])
  else:
    clf = LinearSVC(loss=paras[1],C=paras[2],max_iter=paras[3],class_weight=paras[4])

  Y = selected[:,3]
  if method==1:
    X = selected[:,0]
  skf = StratifiedKFold(n_splits=kfolds)#random_state=None, shuffle=False
  for train_index, test_index in skf.split(X, Y):
    #print("TRAIN:", train_index, "TEST:", test_index)
    trainX, testX = X[train_index], X[test_index]
    trainY, testY = Y[train_index], Y[test_index]

    trainX = cv.fit_transform(trainX)
    testX = cv.transform(testX)
    trainX = kBest.fit_transform(trainX, trainY)
    testX = kBest.transform(testX)

    clf.fit(trainX, trainY)
    pred = clf.predict(testX)
    score += metrics.accuracy_score(testY, pred)
    #average in f1 could be any of 'macro','micro','weighted'
    f1 += metrics.f1_score(testY, pred, average='micro')
  if not search:
    print("The averaged accuracy score of 5-fold cv is %0.4f and f1 score is %0.2f." %(score/5, f1/5))
  else:
    return score/5

In [0]:
def MySearch(selected, NB=True, method=2):
	params_NB = {
		'exf_k':(20,50,100,500,1000,2000),
		'clf_alpha':(1,0.5,1e-1,1e-2,1e-3),

	}
	comb_NB = []
	for i in range(len(params_NB['exf_k'])):
		for j in range(len(params_NB['clf_alpha'])):
			comb_NB.append((params_NB['exf_k'][i], params_NB['clf_alpha'][j]))
	#print(comb_NB[:6])
	params_SVM = {
    	'exf_k':[50,100,500,1000],
		'clf_loss':['hinge','squared_hinge'],
		'clf_C': [1, 10, 50,100],
		'clf_max_iter':[1000,2000,3000],
		'clf_class_weight':[None,'balanced']
	}
	comb_SVM = []
	for i in params_SVM['exf_k']:
		for j in params_SVM['clf_loss']:
			for l in params_SVM['clf_C']:
				for m in params_SVM['clf_max_iter']:
					for t in params_SVM['clf_class_weight']:
						comb_SVM.append((i,j,l,m,t))

	best_score = 0
	if NB:
		comb_paras = comb_NB
	else:
		comb_paras = comb_SVM
	for ipar in comb_paras:
		tmp = run(selected, ipar, method=method,NB=NB,search=True)
		print("cross validation with paras:", ipar, ", score:", tmp)
		if tmp > best_score:
			best_score = tmp
			best_paras = ipar
	return best_score, best_paras

In [22]:
data = load_data('/content/gdrive/My Drive/NLP/stance-data.csv',"abortion")

Loading /content/gdrive/My Drive/NLP/stance-data.csv with 4649 records
Column names:  Index(['post_text', 'topic', 'author', 'label', 'id', 'word_count',
       'words_pronom', 'words_per_sen', 'words_over_6', 'pos_emo', 'neg_emo',
       'count_noun', 'count_verb', 'count_adj'],
      dtype='object')


In [12]:
tmpX = data[:,0]
print(tmpX[:2])
tmpY = data[:,3]
print(tmpY[:2])

["Why should the government have any say in how people run thier lives ? I might not ever know the answer to that but i know why the government does . Becuz even thought we do n't say it out loud we think like ing some one of the same sex deserves a title . WELL IT DOES N'T ! same sex relationships are the same as different sex relationships . I do n't know first hand but i would think that a same sex relationship would be better then a different sex relationship just becuz the SS relationship would mean they new more about how that body type works . BUT the government does'nt work that why all they see is since it is not the same we do n't like it . and it is so fucked up becuz it is like the the gov . is yelling `` FUCK YOU FAGOTS AMERICA HATES YOU AND YOUR WAYS '' and that pisses me the hell off ! i think that if you wan na fuck some one with the same gear have fun ! just do n't try it on me !"
 "If you 're looking at it religiously then religious attitudes should be modified to ref

In [0]:
cv = CountVectorizer(stop_words='english', ngram_range=(1,3),token_pattern='[A-Za-z]+')
tmpc = cv.fit_transform(tmpX,tmpY)

In [53]:
np.asarray(tmpc)

array(<1359x108645 sparse matrix of type '<class 'numpy.int64'>'
	with 178070 stored elements in Compressed Sparse Row format>,
      dtype=object)

In [0]:
cv.get_feature_names()

In [0]:
def runold(selected, paras, method=1, NB=True, search=False):
	kfolds = 5
	test_size = len(selected)//kfolds
	score,f1 = 0,0

	cv = CountVectorizer(stop_words='english', ngram_range=(1,3))
	exf_k = paras[0]
	kBest = SelectKBest(chi2, k=exf_k)
	if NB:
		clf = MultinomialNB(alpha=paras[1])
	else:
		clf = LinearSVC(loss=paras[1],C=paras[2],max_iter=paras[3],class_weight=paras[4])

	for i in range(kfolds):
		test = selected[i*test_size:(i+1)*test_size]
		train = np.vstack((selected[:i*test_size,:],selected[(i+1)*test_size:,:]))

		trainY = train[:,3]
		testY = test[:,3]
		trainX = cv.fit_transform(train[:,0])
		testX = cv.transform(test[:,0])
		if method == 2:
			#tfidf = TfidfTransformer()
			#trainX = tfidf.fit_transform(trainX)
			trainX = hstack((trainX, train[:,6:9].astype(float)))
			#testX = tfidf.transform(testX)
			testX = hstack((testX, test[:,6:9].astype(float)))
		trainX = kBest.fit_transform(trainX, trainY)
		testX = kBest.transform(testX)

		clf.fit(trainX, trainY)
		pred = clf.predict(testX)
		score += metrics.accuracy_score(testY, pred)
		#average in f1 could be any of 'macro','micro','weighted'
		f1 += metrics.f1_score(testY, pred, average='micro')
	if not search:
		print("The averaged accuracy score of 5-fold cv is %0.4f and f1 score is %0.2f." %(score/5, f1/5))
	else:
		return score/5

In [90]:
run(data,(500,1), method=1, NB=True, search=False)

The averaged accuracy score of 5-fold cv is 0.6083 and f1 score is 0.61.


In [61]:
run(data,(500,1), method=1, NB=True, search=False)

The averaged accuracy score of 5-fold cv is 0.5977 and f1 score is 0.60.


In [41]:
run(data,(500,0.1), method=1, NB=True, search=False)

The averaged accuracy score of 5-fold cv is 0.5918 and f1 score is 0.59.


In [40]:
runold(data,(500,1), method=1, NB=True, search=False)

The averaged accuracy score of 5-fold cv is 0.6076 and f1 score is 0.61.


In [87]:
run(data,(500,1), method=1, NB=True, search=False)

The averaged accuracy score of 5-fold cv is 0.6106 and f1 score is 0.61.


In [86]:
MySearch(data, NB=True, method=1)

cross validation with paras: (20, 1) , score: 0.592509132067706
cross validation with paras: (20, 0.5) , score: 0.592509132067706
cross validation with paras: (20, 0.1) , score: 0.592509132067706
cross validation with paras: (20, 0.01) , score: 0.592509132067706
cross validation with paras: (20, 0.001) , score: 0.592509132067706
cross validation with paras: (50, 1) , score: 0.6012913515460204
cross validation with paras: (50, 0.5) , score: 0.6012913515460204
cross validation with paras: (50, 0.1) , score: 0.600706556224383
cross validation with paras: (50, 0.01) , score: 0.6012913515460204
cross validation with paras: (50, 0.001) , score: 0.6007048412820908
cross validation with paras: (100, 1) , score: 0.6007031263397986
cross validation with paras: (100, 0.5) , score: 0.5983622301109567
cross validation with paras: (100, 0.1) , score: 0.598945310490302
cross validation with paras: (100, 0.01) , score: 0.5977740049047349
cross validation with paras: (100, 0.001) , score: 0.59777571984

(0.6106497916345115, (500, 1))

In [76]:
MySearch(data, NB=True, method=1)#with number delete+stratified

cross validation with paras: (20, 1) , score: 0.5948483133542556
cross validation with paras: (20, 0.5) , score: 0.594261803090326
cross validation with paras: (20, 0.1) , score: 0.5936752928263964
cross validation with paras: (20, 0.01) , score: 0.5936752928263964
cross validation with paras: (20, 0.001) , score: 0.5936752928263964
cross validation with paras: (50, 1) , score: 0.5942720927440791
cross validation with paras: (50, 0.5) , score: 0.5942720927440791
cross validation with paras: (50, 0.1) , score: 0.5942720927440791
cross validation with paras: (50, 0.01) , score: 0.5931007871585121
cross validation with paras: (50, 0.001) , score: 0.5931007871585121
cross validation with paras: (100, 1) , score: 0.6094904906449898
cross validation with paras: (100, 0.5) , score: 0.605390063624359
cross validation with paras: (100, 0.1) , score: 0.6036305328325702
cross validation with paras: (100, 0.01) , score: 0.6007048412820908
cross validation with paras: (100, 0.001) , score: 0.600704

(0.6094904906449898, (100, 1))

In [0]:
from sklearn.model_selection import StratifiedKFold
def run3(selected, paras, method=3, NB=True, search=False):
  kfolds = 5
  score,f1 = 0,0

  cv = CountVectorizer(stop_words='english', ngram_range=(1,3),token_pattern='[A-Za-z]+')
  exf_k = paras[0]
  kBest = SelectKBest(chi2, k=exf_k)
  if NB:
    clf = MultinomialNB(alpha=paras[1])
  else:
    clf = LinearSVC(loss=paras[1],C=paras[2],max_iter=paras[3],class_weight=paras[4])

  Y = selected[:,3]
  if method==3:
    X = selected[:,5:11]
  skf = StratifiedKFold(n_splits=kfolds)#random_state=None, shuffle=False
  for train_index, test_index in skf.split(X, Y):
    #print("TRAIN:", train_index, "TEST:", test_index)
    trainX, testX = X[train_index], X[test_index]
    trainY, testY = Y[train_index], Y[test_index]

    clf.fit(trainX, trainY)
    pred = clf.predict(testX)
    score += metrics.accuracy_score(testY, pred)
    #average in f1 could be any of 'macro','micro','weighted'
    f1 += metrics.f1_score(testY, pred, average='micro')
  if not search:
    print("The averaged accuracy score of 5-fold cv is %0.4f and f1 score is %0.2f." %(score/5, f1/5))
  else:
    return score/5

In [82]:
run3(data, (500,1), method=3, NB=True, search=False)

The averaged accuracy score of 5-fold cv is 0.4953 and f1 score is 0.50.


In [79]:
data[:,1]

array(['abortion', 'abortion', 'abortion', ..., 'abortion', 'abortion',
       'abortion'], dtype=object)

## test downsample

In [110]:
class0 = data[np.where(data[:,3]=='con')]
class1 = data[np.where(data[:,3]=='pro')]
print(np.shape(class0),np.shape(class1))
if len(class0) > len(class1):
  class0 = np.random.choice(class0, size=len(class1), replace=False)
else:
  class1 = np.random.choice(class1, size=len(class0), replace=False)
print(np.shape(class0),np.shape(class1))
selected = np.hstack((class0, class1))
print(np.shape(selected))

(763, 14) (945, 14)


ValueError: ignored

In [0]:
from sklearn.model_selection import KFold
def run_downsample(selected, paras, method=1, NB=True, search=False):
  kfolds = 5
  score,f1 = 0,0

  cv = CountVectorizer(stop_words='english', ngram_range=(1,3))#,token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b')
  exf_k = paras[0]
  kBest = SelectKBest(chi2, k=exf_k)
  if NB:
    clf = MultinomialNB(alpha=paras[1])
  else:
    clf = LinearSVC(loss=paras[1],C=paras[2],max_iter=paras[3],class_weight=paras[4])

  class0 = selected[np.where(selected[:,3]=='con')]
  class1 = selected[np.where(selected[:,3]=='pro')]
  if len(class0) > len(class1):
    class0 = np.random.choice(class0, size=len(class1), replace=False)
  else:
    class1 = np.random.choice(class1, size=len(class0), replace=False)
  selected = np.hstack((class0, class1))
    
  Y = selected[:,3]
  if method==1:
    X = selected[:,0]
  sf = KFold(n_splits=kfolds)#random_state=None, shuffle=False
  for train_index, test_index in skf.split(X, Y):
    #print("TRAIN:", train_index, "TEST:", test_index)
    trainX, testX = X[train_index], X[test_index]
    trainY, testY = Y[train_index], Y[test_index]

    trainX = cv.fit_transform(trainX)
    testX = cv.transform(testX)
    trainX = kBest.fit_transform(trainX, trainY)
    testX = kBest.transform(testX)

    clf.fit(trainX, trainY)
    pred = clf.predict(testX)
    score += metrics.accuracy_score(testY, pred)
    #average in f1 could be any of 'macro','micro','weighted'
    f1 += metrics.f1_score(testY, pred, average='micro')
  if not search:
    print("The averaged accuracy score of 5-fold cv is %0.4f and f1 score is %0.2f." %(score/5, f1/5))
  else:
    return score/5

# find top20

In [0]:
def top20(dataX, dataY):
	cv = CountVectorizer(stop_words='english', ngram_range=(1,3))
	dataX = cv.fit_transform(dataX)
	names = cv.get_feature_names()
	kBest = SelectKBest(chi2, k=20)
	kBest.fit_transform(dataX, dataY)
	k_feature_index = kBest.get_support(indices=True)

	res = []
	for i in k_feature_index:
		res.append(names[i])
	print(len(k_feature_index),len(res))
	return res

In [131]:
top20(data[:,0],data[:,3])

20 20


['abortionist',
 'awareness',
 'begins conception',
 'choice',
 'conception',
 'consciousness',
 'foetus',
 'force',
 'human',
 'illegal',
 'kill',
 'life begins conception',
 'living human',
 'potential',
 'potential life',
 'pregnancy',
 'society',
 'survive outside',
 'unto',
 've']

In [0]:
def run_best(selected, paras, method=1, NB=False, example=False):
  kfolds = 5
  score,f1 = 0,0

  cv = CountVectorizer(stop_words='english', ngram_range=(1,3))
  exf_k = paras[0]
  kBest = SelectKBest(chi2, k=exf_k)
  if NB:
    clf = MultinomialNB(alpha=paras[1])
  else:
    clf = LinearSVC(loss=paras[1],C=paras[2],max_iter=paras[3],class_weight=paras[4])
  X = selected[:,0]
  Y = selected[:,3]
  skf = StratifiedKFold(n_splits=kfolds)
  ifold = 0
  for train_index, test_index in skf.split(X, Y):
    trainX, testX = X[train_index], X[test_index]
    trainY, testY = Y[train_index], Y[test_index]

    trainX = cv.fit_transform(trainX)
    testX = cv.transform(testX)
    if method == 2:
      trainX = hstack((trainX, selected[train_index, 6:9].astype(float)))
      testX = hstack((testX, selected[test_index, 6:9].astype(float)))
    trainX = kBest.fit_transform(trainX, trainY)
    testX = kBest.transform(testX)

    clf.fit(trainX, trainY)
    pred = clf.predict(testX)
    #scorei = metrics.accuracy_score(testY, pred)
    #print(scorei)

    ifold += 1
    score += metrics.accuracy_score(testY, pred)
    #average in f1 could be any of 'macro','micro','weighted'
    f1 += metrics.f1_score(testY, pred, average='micro')
    #if ifold == 2:
  print("The top20 features are: ")
  print(top20(X[train_index], Y[train_index]))
  if example==True:
    print(metrics.classification_report(testY, pred, target_names=['con','pro']))
    print(metrics.confusion_matrix(testY, pred))
    analyze(selected,test_index,testX,clf)
      
      
  print("The averaged accuracy score of 5-fold cv is %0.4f and f1 score is %0.4f." %(score/5, f1/5))


In [0]:
X = data[:,0]
Y = data[:,3]
skf = StratifiedKFold(n_splits=5)
ifold = 0
for train_index, test_index in skf.split(X, Y):
  print(test_index)

In [0]:
def analyze(selected,test_index, testX, clf):
  testY = selected[test_index, 3]
  post = selected[test_index, 0]
  pred = clf.predict(testX)
  nexample = 0
  for i in range(len(testY)):
    if testY[i] != pred[i] and nexample < 5:
      print(pred[i],testY[i],post[i],'\n')
      nexample += 1
    

In [184]:
run_best(data, (500, 'hinge', 1, 1000, None),method=1,NB=False,example=True)



The top20 features are: 
20 20
['abortionist', 'awareness', 'choice', 'consciousness', 'foetus', 'force', 'human', 'illegal', 'kill', 'life human', 'lives', 'living human', 'opinion', 'potential', 'potential life', 'pregnancy', 'question', 'society', 'survive outside', 've']
              precision    recall  f1-score   support

         con       0.52      0.64      0.58       152
         pro       0.65      0.53      0.59       189

    accuracy                           0.58       341
   macro avg       0.59      0.59      0.58       341
weighted avg       0.59      0.58      0.58       341

[[ 97  55]
 [ 88 101]]
pro con I 'm against abortion ! An abortion can result in medical complications later in life . In the future , woman who did abortion may never be a mother , even if she really want it . Another reason is that this is reduce human rights . The unborn child is alive from the moment of conception and has the right to life . 

pro con It 's a degradation when it deviates fr

In [0]:
paras = (500,1)
cv = CountVectorizer(stop_words='english', ngram_range=(1,3))
exf_k = paras[0]
kBest = SelectKBest(chi2, k=20)
clf = MultinomialNB(alpha=paras[1])

Y = data[:,3]
X = data[:,0]
X = cv.fit_transform(X)
X = kBest.fit_transform(X,Y)

In [122]:
names = cv.get_feature_names()
print(np.shape(names),names[:2])

(155154,) ['00', '00 01']


In [124]:
k_feature_index = kBest.get_support(indices=True)
print(np.shape(k_feature_index),k_feature_index[:2])

(20,) [ 4680 13912]


In [125]:
names[k_feature_index[0]]

'abortionist'

In [127]:
for i in k_feature_index:
  print(names[i])

abortionist
awareness
begins conception
choice
conception
consciousness
foetus
force
human
illegal
kill
life begins conception
living human
potential
potential life
pregnancy
society
survive outside
unto
ve
