In [1]:
import pandas as pd
import sklearn
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import numpy
import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report


#### Loading and splitting the data:

In [2]:
data = pd.read_csv("train-balanced-sarcasm.csv")

In [3]:
#cleaning out empty comments:
empty_comments = data["comment"].isna()
empty_comments = data[empty_comments].index
data.drop(empty_comments, axis=0, inplace=True)

In [4]:
c_train, c_test, l_train, l_test = train_test_split(data["comment"], data["label"], test_size=0.3, random_state=50)

## **Bag of Words**
#### CountVectorizer:

In [12]:
c_vec = CountVectorizer(min_df=1, # in how many documents the term minimally occurs
                             tokenizer=nltk.word_tokenize) # nltk tokenizer

In [None]:
com_vector_data = c_vec.fit_transform(c_train)

In [None]:
lin_classifier = svm.LinearSVC()
lin_classifier.fit(com_vector_data,l_train)

In [None]:
test_vector_data = c_vec.transform(c_test)

In [10]:
prediction = lin_classifier.predict(test_vector_data)

In [11]:
report1 = classification_report(l_test,prediction,digits = 3)
print(report1)

              precision    recall  f1-score   support

           0      0.684     0.721     0.702    151624
           1      0.705     0.668     0.686    151608

    accuracy                          0.694    303232
   macro avg      0.695     0.694     0.694    303232
weighted avg      0.695     0.694     0.694    303232



#### TF-IDF

In [13]:
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(com_vector_data)

In [14]:
lin_classifier2 = svm.LinearSVC()
lin_classifier2.fit(train_tfidf,l_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [16]:
test_tfidf = tfidf_transformer.transform(test_vector_data)

In [17]:
predicted2 = lin_classifier2.predict(test_tfidf)

In [18]:
report2 = classification_report(l_test,predicted2,digits = 3)
print(report2)

              precision    recall  f1-score   support

           0      0.688     0.714     0.701    151624
           1      0.702     0.676     0.689    151608

    accuracy                          0.695    303232
   macro avg      0.695     0.695     0.695    303232
weighted avg      0.695     0.695     0.695    303232



### **Hetzelfde maar zonder stopwoorden**

#### Countvectorizer:

In [5]:
c_vec2= CountVectorizer(min_df=1,
                             tokenizer=nltk.word_tokenize, 
                             stop_words=stopwords.words('english')) # stopwords are removed

In [7]:
# print(c_vec2.stop_words)

In [20]:
vectorized_com = c_vec2.fit_transform(c_train)

  'stop_words.' % sorted(inconsistent))


In [21]:
lin_classifier3 = svm.LinearSVC()
lin_classifier3.fit(vectorized_com,l_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [22]:
vectorized_test = c_vec2.transform(c_test)

In [23]:
predicted = lin_classifier3.predict(vectorized_test)
report3 = classification_report(l_test,predicted,digits = 3)
print(report3)

              precision    recall  f1-score   support

           0      0.673     0.709     0.691    151624
           1      0.693     0.656     0.674    151608

    accuracy                          0.683    303232
   macro avg      0.683     0.683     0.682    303232
weighted avg      0.683     0.683     0.682    303232



#### TF-IDF:

In [25]:
tfidf_transformer = TfidfTransformer()
train_tfidf2 = tfidf_transformer.fit_transform(vectorized_com)

In [26]:
lin_classifier4 = svm.LinearSVC()
lin_classifier4.fit(train_tfidf2,l_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [29]:
test_tfidf2 = tfidf_transformer.transform(vectorized_test)

In [30]:
predicted2 = lin_classifier4.predict(test_tfidf2)

In [31]:
report4 = classification_report(l_test,predicted2,digits = 3)
print(report4)

              precision    recall  f1-score   support

           0      0.676     0.700     0.688    151624
           1      0.689     0.665     0.677    151608

    accuracy                          0.682    303232
   macro avg      0.683     0.682     0.682    303232
weighted avg      0.683     0.682     0.682    303232

