In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from scipy.stats import expon, randint

# Data preprocessing and cleaning

In [3]:
# Getting the data
data_url1 = 'http://www.cse.chalmers.se/~richajo/dit866/data/a3_first_sample.tsv'
data_url2 = 'http://www.cse.chalmers.se/~richajo/dit866/assignments/a3/data/a3_train_round1.tsv'
data_url3 = 'http://www.cse.chalmers.se/~richajo/dit866/assignments/a3/data/a3_train_final.tsv'
data_url_test = 'http://www.cse.chalmers.se/~richajo/dit866/assignments/a3/data/a3_test.tsv'
dataTrain = pd.read_csv(data_url3, sep='\t', names=['stance', 'text'])
dataTest = pd.read_csv(data_url_test, sep = '\t', names = ['stance', 'text'])

Xtrain = dataTrain['text']
Ytrain = dataTrain['stance']

Xtest = dataTest['text']
Ytest = dataTest['stance']

Lets take alook at the data:

In [None]:
Ytrain

0       0/-1
1        0/0
2       0/-1
3       1/-1
4       0/-1
        ... 
8783     1/1
8784     0/0
8785     0/0
8786     0/0
8787     0/0
Name: stance, Length: 8788, dtype: object

We can see that these label will be diffucilt to use for aclassifactions so we will have to create afuntion that deals with this.

In [4]:
# this function looks at each cell and determines wheter or not the votes are 
# uniform.
# All 0's: return negative (0)
# All 1's: return positive (1)
# Mixed:   return unsure (-1)
def process_opinions(cell):
  if '-' not in cell: # if no -1
    votes = []
    for ch in cell[::2]:
      votes.append(int(ch))
    if sum(votes) == 0: # if all 0's
      return 0
    elif sum(votes) == len(votes): # if all 1's
      return 1
    else:
      return -1
  else:
    return -1 # if mixed 


Now we can take a look at how the data is distributed.

In [None]:
Ytrain.apply(process_opinions).value_counts()

positive    3890
negative    3626
-1          1272
Name: stance, dtype: int64

We will now clean the data by removing the unsure observations.

In [5]:
Ytrain_processed = Ytrain.apply(process_opinions)
ind_certain = (Ytrain_processed != -1)

Xtrain_cleaned = Xtrain[ind_certain].reset_index(drop=True)

Ytrain_cleaned = Ytrain_processed[ind_certain].reset_index(drop=True)

In [None]:
Ytrain_cleaned.value_counts()

1    3890
0    3626
Name: stance, dtype: int64

Here we transform the data using a tf-idf tansformation.

In [6]:
tdif = TfidfVectorizer()
Xtrain_tfidf = tdif.fit_transform(Xtrain_cleaned)
Xtest_tfidf = tdif.transform(Xtest)

# Model selection

Here we will try logistic regression

In [None]:
clf_lr = LogisticRegression()
C_distr = expon(scale=1)
param_grid = {'C': C_distr, 'max_iter': [1000]}
gridsearch = RandomizedSearchCV(clf_lr, param_grid)
gridsearch.fit(Xtrain_tfidf, Ytrain_cleaned)
best = gridsearch.best_params_
best_C = best['C']
print(f'The best value for C: {best_C}')

The best value for C: 4.581459841714967


In [None]:
clf_lr = LogisticRegression(C=best['C'])
cv_lr = cross_val_score(clf_lr, Xtrain_tfidf, Ytrain_cleaned)
print(cv_lr.mean(), cv_lr.var())

0.8304933572571169 0.00010539397634422945


Here will will try a perceptron classifier

In [None]:
clf_per = Perceptron()
cross_val_score(clf_per, Xtrain_tfidf, Ytrain_cleaned).mean()

0.7944354199402612

And finally the support vector machine classifier

In [None]:
clf_svc = SVC()

gamma_distr = expon(scale=1)
C_distr = expon(scale=1)
degree_distr = randint(1,10)
param_grid_random = {'gamma': gamma_distr, 'C': C_distr, 'degree': degree_distr,
                     'kernel': ['linear', 'rbf', 'poly']}

randomsearch = RandomizedSearchCV(clf_svc, param_grid_random, n_iter=5, random_state=42)
randomsearch.fit(Xtrain_tfidf, Ytrain_cleaned)
best_svc = randomsearch.best_params_
best_svc

{'C': 0.9190821536272645,
 'degree': 8,
 'gamma': 1.052363846077553,
 'kernel': 'linear'}

In [7]:
best_C = 0.919082
best_gamma = 1.052364

In [8]:
clf_svc = SVC(C=best_C, kernel='linear')
cvs_svc = cross_val_score(clf_svc, Xtrain_tfidf, Ytrain_cleaned)
print(cvs_svc.mean(), cvs_svc.var())

0.8315574524709446 0.00011379852504303438


# Model evaluation
Since the SVC model performed the best we will now evaluate it on the train data and later take a clooser look at it.

In [9]:
clf_svc.fit(Xtrain_tfidf, Ytrain_cleaned)

SVC(C=0.919082, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Here below we will see the reulst for our model.

In [22]:
Ypred = clf_svc.predict(Xtest_tfidf)
print(accuracy_score(Ytest, Ypred))
confusion_matrix(Ytest,Ypred)

0.8801020408163265


array([[167,  31],
       [ 16, 178]])

# Taking a closer look at the model

Here below we can see all the wrongly classified observations.

In [23]:
test_data = pd.concat([Xtest, Ytest], axis=1)
wrong_classifications = test_data.stance != Ypred
test_data[wrong_classifications]

Unnamed: 0,text,stance
13,"Alright, so now look into the covid protein sp...",0
19,AstraZeneca seems to be more effective and res...,1
21,Biontech and curevac basicly startet the mrna ...,1
31,Can’t wait until my son can get it.,1
34,Covid and the mask isn’t the problem. It’s the...,0
48,Even if u have the vaccine u dont get to go ba...,0
67,Got my 2nd dose of vaccine no symptoms and no ...,1
95,I can't believe anyone is fine with having gen...,0
103,I don't understand why 20 percent of careworke...,1
110,I feel so bad for all the people who are being...,0


The code below gathers the top ten weights with the lowest value and the top ten weights with the highest value and finds the intercept. 

In [14]:
intercept = round(clf_svc.intercept_[0], ndigits=2)
print(f'Intercept: {intercept}')

coef = clf_svc.coef_
highest = coef.A.argsort()[0][-10:][::-1]
lowest = coef.A.argsort()[0][:10]

print('Highest scoring parameters')
for ind in highest:
  word = tdif.get_feature_names()[ind]
  value = round(coef.A[0][ind], ndigits = 2)
  print(f'param = {value} for {word}')

print('Lowest scoring parameters')
for ind in lowest:
  word = tdif.get_feature_names()[ind]
  value = round(coef.A[0][ind], ndigits = 2)
  print(f'param = {value} for {word}')

Intercept: -0.16
Highest scoring parameters
param = 2.63 for yes
param = 2.58 for science
param = 2.54 for vaxxers
param = 2.39 for anti
param = 2.32 for get
param = 2.18 for hope
param = 2.17 for every
param = 2.16 for available
param = 2.07 for vaccinated
param = 2.06 for great
Lowest scoring parameters
param = -4.69 for not
param = -3.66 for never
param = -2.98 for no
param = -2.96 for rushed
param = -2.85 for experimental
param = -2.8 for poison
param = -2.63 for years
param = -2.53 for test
param = -2.44 for term
param = -2.41 for don


This function prints out all the parameter values and does the computation for us. 

In [26]:
def print_calc(word):
  word = tdif.transform(word)
  word_index = np.where(word.A[0]!=0)[0]
  tdif_value = []
  word_weight = []
  word_order = []
  intercept = clf_svc.intercept_
  for ind in word_index:
    word_order.append(tdif.get_feature_names()[ind])
    word_weight.append(round(coef.A[0][ind], ndigits = 2))
    tdif_value.append(round(word[0,ind],ndigits=2))

  print(word_weight)
  print(tdif_value)
  print(word_order)
  print()
  print(np.dot(word_weight, tdif_value) + intercept)
  print(clf_svc.predict(word))
  return None


In [31]:
print_calc(["I think the vaccine is ok"])

[0.49, -0.16, 0.81, -0.89, 0.12]
[0.3, 0.72, 0.19, 0.56, 0.22]
['is', 'ok', 'the', 'think', 'vaccine']

[-0.44703064]
[0]


In [30]:
print_calc(["the vaccines don’t even work"])

[-2.41, -0.05, 0.81, 0.74, 1.44]
[0.43, 0.56, 0.2, 0.38, 0.57]
['don', 'even', 'the', 'vaccines', 'work']

[0.03896936]
[1]


In [32]:
print_calc(["Alright, so now look into the covid protein spike and it's prion capable modification. Make sure you understand what you are getting yourself into when you take those shots, and it will also shed some light on the side effects you may have seen."])

[0.76, 0.26, 1.04, 0.11, -0.38, 0.57, 0.48, 1.23, -0.05, -1.12, -0.0, -0.21, -1.13, -0.75, 0.44, 0.0, 0.36, -0.16, -0.49, 0.44, 0.67, 0.66, 0.87, -0.14, -0.14, -0.86, 0.35, 0.81, 0.65, 1.65, -1.79, 0.49, 0.1, -0.7, 0.45]
[0.25, 0.14, 0.12, 0.09, 0.25, 0.09, 0.12, 0.11, 0.09, 0.29, 0.13, 0.21, 0.18, 0.14, 0.16, 0.27, 0.13, 0.1, 0.2, 0.18, 0.18, 0.12, 0.09, 0.14, 0.22, 0.16, 0.09, 0.1, 0.14, 0.17, 0.12, 0.12, 0.09, 0.33, 0.18]
['alright', 'also', 'and', 'are', 'capable', 'covid', 'effects', 'getting', 'have', 'into', 'it', 'light', 'look', 'make', 'may', 'modification', 'now', 'on', 'protein', 'seen', 'shots', 'side', 'so', 'some', 'spike', 'sure', 'take', 'the', 'those', 'understand', 'what', 'when', 'will', 'you', 'yourself']

[0.02726936]
[1]
