In [1]:
import pandas as pd
import sklearn
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import numpy
import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report


#### Loading and splitting the data:

In [2]:
data = pd.read_csv("train-balanced-sarcasm.csv")

In [3]:
#cleaning out empty comments:
empty_comments = data["comment"].isna()
empty_comments = data[empty_comments].index
data.drop(empty_comments, axis=0, inplace=True)

In [4]:
c_train, c_test, l_train, l_test = train_test_split(data["comment"], data["label"], test_size=0.3, random_state=50)

## **Bag of Words**
#### CountVectorizer:

In [5]:
c_vec = CountVectorizer(min_df=1, # in how many documents the term minimally occurs
                             tokenizer=nltk.word_tokenize) # nltk tokenizer

In [6]:
com_vector_data = c_vec.fit_transform(c_train)

In [7]:
lin_classifier = svm.LinearSVC()
lin_classifier.fit(com_vector_data,l_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [8]:
test_vector_data = c_vec.transform(c_test)

In [9]:
prediction = lin_classifier.predict(test_vector_data)

In [10]:
report1 = classification_report(l_test,prediction,digits = 3)
print(report1)

              precision    recall  f1-score   support

           0      0.684     0.721     0.702    151624
           1      0.705     0.668     0.686    151608

    accuracy                          0.694    303232
   macro avg      0.695     0.694     0.694    303232
weighted avg      0.695     0.694     0.694    303232



#### TF-IDF

In [11]:
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(com_vector_data)

In [12]:
lin_classifier2 = svm.LinearSVC()
lin_classifier2.fit(train_tfidf,l_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [13]:
test_tfidf = tfidf_transformer.transform(test_vector_data)

In [14]:
predicted2 = lin_classifier2.predict(test_tfidf)

In [15]:
report2 = classification_report(l_test,predicted2,digits = 3)
print(report2)

              precision    recall  f1-score   support

           0      0.688     0.714     0.701    151624
           1      0.702     0.676     0.689    151608

    accuracy                          0.695    303232
   macro avg      0.695     0.695     0.695    303232
weighted avg      0.695     0.695     0.695    303232



## Cosine similarity
#### similarity between the vectorized training sets

In [20]:
#print(cv_array.shape)
#print(tfidf_array.shape)

cv_array = com_vector_data[0].toarray()
tfidf_array = train_tfidf[0].toarray()

cv_array = cv_array.flatten()
tfidf_array = tfidf_array.flatten()

# manually compute cosine similarity
dot = numpy.dot(cv_array, tfidf_array)
norm_BoW = numpy.linalg.norm(cv_array)
norm_tfidf = numpy.linalg.norm(tfidf_array)
cos = dot / (norm_BoW * norm_tfidf)
 
print('Dot product = %.3f, normalized BoW = %.3f, normalized tfidf = %.3f, cosine similarity = %.3f' %(dot,norm_BoW, norm_tfidf,cos))

Dot product = 2.294, normalized BoW = 2.449, normalized tfidf = 1.000, cosine similarity = 0.936


#### similarity between predictions of both vectorisation methods

In [22]:
#print(predictions.shape)
#print(predicted2.shape)

# manually compute cosine similarity
dot = numpy.dot(prediction, predicted2)
norm_BoW = numpy.linalg.norm(prediction)
norm_tfidf = numpy.linalg.norm(predicted2)
cos = dot / (norm_BoW * norm_tfidf)
 
print('Dot product = %.3f, normalized BoW = %.3f, normalized tfidf = %.3f, cosine similarity = %.3f' %(dot,norm_BoW, norm_tfidf,cos))

Dot product = 135355.000, normalized BoW = 378.918, normalized tfidf = 381.965, cosine similarity = 0.935


### **Hetzelfde maar zonder stopwoorden**

#### Countvectorizer:

In [23]:
c_vec2= CountVectorizer(min_df=1,
                             tokenizer=nltk.word_tokenize, 
                             stop_words=stopwords.words('english')) # stopwords are removed

In [7]:
# print(c_vec2.stop_words)

In [24]:
vectorized_com = c_vec2.fit_transform(c_train)

  'stop_words.' % sorted(inconsistent))


In [25]:
lin_classifier3 = svm.LinearSVC()
lin_classifier3.fit(vectorized_com,l_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [26]:
vectorized_test = c_vec2.transform(c_test)

In [27]:
predicted = lin_classifier3.predict(vectorized_test)
report3 = classification_report(l_test,predicted,digits = 3)
print(report3)

              precision    recall  f1-score   support

           0      0.673     0.710     0.691    151624
           1      0.693     0.656     0.674    151608

    accuracy                          0.683    303232
   macro avg      0.683     0.683     0.682    303232
weighted avg      0.683     0.683     0.682    303232



#### TF-IDF:

In [29]:
tfidf_transformer = TfidfTransformer()
train_tfidf2 = tfidf_transformer.fit_transform(vectorized_com)

In [30]:
lin_classifier4 = svm.LinearSVC()
lin_classifier4.fit(train_tfidf2,l_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [31]:
test_tfidf2 = tfidf_transformer.transform(vectorized_test)

In [32]:
prediction2 = lin_classifier4.predict(test_tfidf2)

In [33]:
report4 = classification_report(l_test,predicted2,digits = 3)
print(report4)

              precision    recall  f1-score   support

           0      0.688     0.714     0.701    151624
           1      0.702     0.676     0.689    151608

    accuracy                          0.695    303232
   macro avg      0.695     0.695     0.695    303232
weighted avg      0.695     0.695     0.695    303232



## Cosine similarity
#### similarity between the vectorized training sets

In [35]:
#print(cv_array.shape)
#print(tfidf_array.shape)

cv_array2 = vectorized_com[0].toarray()
tfidf_array2 = train_tfidf2[0].toarray()

cv_array2 = cv_array.flatten()
tfidf_array2 = tfidf_array.flatten()

# manually compute cosine similarity
dot = numpy.dot(cv_array2, tfidf_array2)
norm_BoW = numpy.linalg.norm(cv_array2)
norm_tfidf = numpy.linalg.norm(tfidf_array2)
cos = dot / (norm_BoW * norm_tfidf)
 
print('Dot product = %.3f, normalized BoW = %.3f, normalized tfidf = %.3f, cosine similarity = %.3f' %(dot,norm_BoW, norm_tfidf,cos))

Dot product = 2.294, normalized BoW = 2.449, normalized tfidf = 1.000, cosine similarity = 0.936


#### similarity between predictions of both vectorisation methods

In [37]:
#print(predictions.shape)
#print(predicted2.shape)

# manually compute cosine similarity
dot = numpy.dot(prediction2, predicted)
norm_BoW = numpy.linalg.norm(predicted)
norm_tfidf = numpy.linalg.norm(prediction2)
cos = dot / (norm_BoW * norm_tfidf)
 
print('Dot product = %.3f, normalized BoW = %.3f, normalized tfidf = %.3f, cosine similarity = %.3f' %(dot,norm_BoW, norm_tfidf,cos))

Dot product = 134910.000, normalized BoW = 378.707, normalized tfidf = 382.578, cosine similarity = 0.931
