In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import random
import tensorflow as tf

### Exercise 2.1

Import and inspect the data

In [10]:
data = pd.read_pickle("labeled_tweets.p")

In [11]:
data.head()

Unnamed: 0,text_a,text_b,label
0,RT @WaysMeansCmte: Republican Senators need to...,Laid-off workers set up soup kitchens in front...,2
1,Jeff Van Drew sold out his district and his co...,Pitch in to help Amy Kennedy defeat Jeff Van D...,0
2,Speaker Pelosi has failed the American people—...,House Minority Leader McCarthy: Pelosi touts D...,1
3,To learn more about global efforts to #EndPoli...,"Home | End Polio. With your help, we can end p...",1
4,RT @realDailyWire: BREAKING: Hunter Biden Rece...,Hunter Biden Received Millions From Wife Of Ex...,0


In [12]:
print(data['label'].value_counts())

1    1377
0    1275
2     230
Name: label, dtype: int64


(0 = affirmative, 1 = negotiated, 2 =oppositional)

So in our dataset there are 1377 valued negotiated 1275 valued affirmative and 230 valued oppositional 

#### 1    Logistic Regression

#### 1.1.Train a logistic regression classifier

In [14]:
#first we will pick our train and test set in 80:20 proportion
X = list(data.text_a.values)
y = list(data.label.values)# the labels we want to predict --> Y
labels = ['affirmative', 'negotiated','oppositional']

X_train_str, X_test_str, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [16]:
#then we make a vocabulary of the corpus
cv = CountVectorizer() # this initializes the CountVectorizer 

cv.fit(X_train_str) # create the vocabulary (using only train set!! beacause we dont want to use the test set my opinion)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [18]:
#we will now create our arrays
X_train = cv.transform(X_train_str)
X_test = cv.transform(X_test_str)

In [21]:
#print(X_train.toarray()[0]) #if we want to visualize the array 

In [25]:
#if we want the results of the CountVectorizer
vocabulary = cv.get_feature_names()
vectorized_texts = pd.DataFrame(X_train.toarray(), columns=vocabulary)
vectorized_texts.head(5)

Unnamed: 0,000,01dju1d7qc,02,030dnk8aky,040tozx3x9,04knsnijuq,04mohpm9q0,05,05dagzqjuq,06nrbvul11,...,zxqv1atnft,zxxsznkil9,zyafklfryj,zyahjnplbe,zzjwvjrdtd,zzkhhjk8yh,zzm2owgnv3,áñez,über,𝓸𝓾𝓻
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
#now we will train our model 
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
#now we evaluate the performance 
y_pred = lr.predict(X_test)

print(classification_report(y_test, y_pred, 
                          target_names=labels))

              precision    recall  f1-score   support

 affirmative       0.57      0.62      0.60       256
  negotiated       0.55      0.59      0.57       270
oppositional       0.20      0.04      0.07        51

    accuracy                           0.55       577
   macro avg       0.44      0.42      0.41       577
weighted avg       0.53      0.55      0.54       577



Evaluation of the model: we see that we have almost 0.6 f1-score which is the mean of precision and recall for affirmative and negotiated but only 0.07 f1-score for oppositional which is far from good score
Lets try the random relecting

In [35]:
#random selecting
random_preds = [random.randint(0,2) for i in range(len(y_test))]

print(classification_report(y_test, random_preds, 
                          target_names=labels))

              precision    recall  f1-score   support

 affirmative       0.43      0.32      0.36       256
  negotiated       0.45      0.33      0.38       270
oppositional       0.11      0.41      0.17        51

    accuracy                           0.33       577
   macro avg       0.33      0.35      0.30       577
weighted avg       0.41      0.33      0.35       577



We see that we have better f1-score for oppositional but worse score for the other two.

#### 1.2.Try to interpret the model

In [36]:
vocabulary = cv.get_feature_names()
regression_coefficients = lr.coef_[0] # get the LR weights we have 3 types so 0 is the affirmative
vocab_coef_combined = list(zip(regression_coefficients, vocabulary)) # this combines two separate lists [1, 2], ['word1', 'word2'] into one list [[1, 'word1'], [2, 'word2']]

feature_importance = pd.DataFrame(vocab_coef_combined,
                      columns=['coef', 'word'])
feature_importance.sort_values('coef', ascending=False).head(10)

Unnamed: 0,coef,word
5583,0.738232,helping
10305,0.713591,rt
2839,0.704428,complete
3189,0.635511,cruz
1820,0.607617,below
6493,0.595824,job
5114,0.590113,generation
11553,0.585049,team
7098,0.581932,live
2887,0.578063,confirmation


We can see that for the word helping has the highest weight for affirmative titles

In [39]:
vocabulary = cv.get_feature_names()
regression_coefficients = lr.coef_[1] # get the LR weights we have 3 types so 0 is the affirmative
vocab_coef_combined = list(zip(regression_coefficients, vocabulary)) # this combines two separate lists [1, 2], ['word1', 'word2'] into one list [[1, 'word1'], [2, 'word2']]

feature_importance = pd.DataFrame(vocab_coef_combined,
                      columns=['coef', 'word'])
feature_importance.sort_values('coef', ascending=False).head(10)

Unnamed: 0,coef,word
5371,0.719113,guard
8720,0.665114,passing
9791,0.613779,remains
12778,0.596099,were
11886,0.588975,trade
9072,0.582243,post
2049,0.578441,box
8701,0.57751,partisan
7474,0.554794,means
8969,0.552816,please


We can see that for the word guard has the highest weight for negotiated titles

In [38]:
vocabulary = cv.get_feature_names()
regression_coefficients = lr.coef_[2] # get the LR weights we have 3 types so 0 is the affirmative
vocab_coef_combined = list(zip(regression_coefficients, vocabulary)) # this combines two separate lists [1, 2], ['word1', 'word2'] into one list [[1, 'word1'], [2, 'word2']]

feature_importance = pd.DataFrame(vocab_coef_combined,
                      columns=['coef', 'word'])
feature_importance.sort_values('coef', ascending=False).head(10)

Unnamed: 0,coef,word
1445,0.606004,around
5598,0.604694,heroes
2475,0.594951,chairman
12149,0.585844,unacceptable
5020,0.573029,fvmwrb09nu
9862,0.573029,repdonbeyer
7906,0.573029,mypuntxhcr
1673,0.554722,bad
8198,0.544191,not
6325,0.541234,issues


We can see that for the word around has the highest weight for oppositional titles

#### 1.3.Use TF-IDF features instead of raw counts

In [41]:
tfidf = TfidfVectorizer() 

tfidf.fit(X_train_str) # create the vocabulary

X_train_idf = tfidf.transform(X_train_str)
X_test = tfidf.transform(X_test_str)

In [43]:
lr = LogisticRegression(solver='lbfgs',max_iter = 1000)
lr.fit(X_train_idf, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
y_pred = lr.predict(X_test)

print(classification_report(y_test, y_pred, 
                          target_names=labels))

              precision    recall  f1-score   support

 affirmative       0.64      0.56      0.60       256
  negotiated       0.55      0.72      0.62       270
oppositional       0.00      0.00      0.00        51

    accuracy                           0.59       577
   macro avg       0.40      0.43      0.41       577
weighted avg       0.54      0.59      0.56       577



  _warn_prf(average, modifier, msg_start, len(result))


### 2 BERT: supervised 