Import all the necessary libraries:

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from scipy.sparse import hstack
from sklearn.metrics import confusion_matrix
import matplotlib as plt
from sklearn.metrics import roc_curve, roc_auc_score
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

Load the dataset:

In [2]:
# Read the sarcasm dataset (path is relative to this notebook) and preview it.
data_frame = pd.read_csv(filepath_or_buffer="../main_datasets/sarcasm_data.csv")
data_frame.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,label,comment,votes,sentiment
0,0,0,0,NC and NH.,-1,0
1,1,1,0,You do know west teams play against west teams...,-1,1
2,2,2,0,"They were underdogs earlier today, but since G...",3,0
3,3,3,0,"This meme isn't funny none of the ""new york ni...",-1,0
4,4,4,0,I could use one of those tools.,-1,1


In [3]:
# Keep only the modelling columns; this also discards the stray "Unnamed: ..."
# index columns that came in with the CSV.
# .copy() yields an independent frame, so the column assignments made in later
# cells do not trigger pandas' SettingWithCopyWarning.
data_frame = data_frame[["label", "comment", "votes", "sentiment"]].copy()
data_frame.head()

Unnamed: 0,label,comment,votes,sentiment
0,0,NC and NH.,-1,0
1,0,You do know west teams play against west teams...,-1,1
2,0,"They were underdogs earlier today, but since G...",3,0
3,0,"This meme isn't funny none of the ""new york ni...",-1,0
4,0,I could use one of those tools.,-1,1


Let's preprocess the data by tokenizing each comment:


In [4]:
def preprocess(text):
    """Tokenize one comment into a list of lowercase, de-accented tokens.

    Args:
        text: Raw comment text. Anything that is not ``str``/``bytes`` (for
            example the float ``NaN`` pandas uses for an empty CSV cell) is
            treated as an empty comment.

    Returns:
        list[str]: Tokens of 2 to 15 characters as produced by gensim's
        ``simple_preprocess``; ``[]`` when there is no text to tokenize.
    """
    if not isinstance(text, (str, bytes)):
        # simple_preprocess raises on non-text input, which would abort the
        # whole ``.apply`` over the column because of a single missing comment.
        return []
    return simple_preprocess(text, deacc=True, min_len=2, max_len=15)

# Tokenize every comment; each cell of the new column holds a list of tokens.
data_frame['preprocessed_text'] = data_frame['comment'].apply(preprocess)
# Preview the frame together with the new column.
data_frame.head()

Unnamed: 0,label,comment,votes,sentiment,preprocessed_text
0,0,NC and NH.,-1,0,"[nc, and, nh]"
1,0,You do know west teams play against west teams...,-1,1,"[you, do, know, west, teams, play, against, we..."
2,0,"They were underdogs earlier today, but since G...",3,0,"[they, were, underdogs, earlier, today, but, s..."
3,0,"This meme isn't funny none of the ""new york ni...",-1,0,"[this, meme, isn, funny, none, of, the, new, y..."
4,0,I could use one of those tools.,-1,1,"[could, use, one, of, those, tools]"


In [9]:
# Dimensionality of each learned word vector.
embedding_size = 300

# Fit a Word2Vec model on the tokenized comments.
word2vec_model = Word2Vec(
    sentences=data_frame['preprocessed_text'],
    vector_size=embedding_size,
    window=5,     # maximum distance between the current and the predicted word
    min_count=2,  # ignore tokens that occur fewer than 2 times in total
    workers=4,    # number of training threads
)
word2vec_model

<gensim.models.word2vec.Word2Vec at 0x17bcf7a2550>

Let's create one embedding per comment by averaging its word vectors:

In [10]:
def text_to_embedding(text, model=None):
    """Average the Word2Vec vectors of a comment's in-vocabulary tokens.

    Args:
        text: Iterable of tokens (one preprocessed comment).
        model: Object exposing gensim-style keyed vectors as ``model.wv``.
            Defaults to the notebook-level ``word2vec_model``.

    Returns:
        numpy.ndarray: 1-D vector of length ``model.wv.vector_size``. It is the
        element-wise mean of the known tokens' vectors, or all zeros (float32)
        when none of the tokens are in the vocabulary.
    """
    keyed_vectors = (word2vec_model if model is None else model).wv
    known_tokens = [token for token in text if token in keyed_vectors]
    if not known_tokens:
        # Take the length from the model itself (not a separate global) and use
        # float32, gensim's vector dtype, so the fallback row has the same
        # shape and dtype as the averaged rows.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(keyed_vectors[known_tokens], axis=0)

# Each row gets the vector that text_to_embedding returns for its token list.
data_frame['embedding'] = data_frame['preprocessed_text'].apply(text_to_embedding)

Create the feature matrix (X) and the target vector (y):

In [None]:
# Number of leading rows to keep for a quick run; set to None to use every row.
N_SAMPLES = 10

# Expand the per-row embedding vectors into one numeric column per dimension.
# Re-using data_frame's index keeps the concat below aligned row-for-row even
# if rows are ever filtered out upstream (concat(axis=1) joins on index labels).
embedding_df = pd.DataFrame(data_frame['embedding'].tolist(), index=data_frame.index)
expanded_df = pd.concat([data_frame.drop(columns='embedding'), embedding_df], axis=1)

# Features are every column except the label; the label is the target.
X = expanded_df.drop(columns='label')
y = expanded_df['label']

# NOTE(review): with N_SAMPLES set, everything downstream only sees the first
# N_SAMPLES rows. The stored output of the following cell shows the full
# dataset, so confirm which of the two is intended before trusting results.
if N_SAMPLES is not None:
    X = X.iloc[:N_SAMPLES]
    y = y.iloc[:N_SAMPLES]

print(X)
print(y)



Let's drop the raw-text columns, then split the dataset into train and test sets:

In [12]:
# Drop the raw-text columns, which the classifier cannot consume.
# errors="ignore" keeps this cell re-runnable: a second run no longer raises
# KeyError just because the columns are already gone.
# The embedding columns carry integer labels while the others are strings;
# scikit-learn rejects mixed-type feature names, so make every label a string.
X = X.drop(columns=["comment", "preprocessed_text"], errors="ignore").rename(columns=str)
X

Unnamed: 0,votes,sentiment,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
0,-1,0,0.069436,0.058620,-0.007191,-0.162119,0.183749,0.100342,0.206979,0.170543,...,-0.006388,-0.181470,0.129758,-0.045028,-0.108428,0.022880,-0.152967,-0.183494,0.144136,-0.332737
1,-1,1,0.747770,-0.468337,0.361604,0.081458,0.003537,0.023422,0.116770,-0.206040,...,0.297724,-0.089554,0.392642,-0.242409,0.241880,-0.517089,-0.245876,0.731407,0.864662,-0.543102
2,3,0,0.346316,-0.100375,0.009280,0.212601,-0.296036,-0.013553,0.031915,0.095010,...,-0.055165,-0.169227,0.122330,0.087534,-0.414220,-0.292389,-0.700323,-0.039280,0.128638,-0.190921
3,-1,0,0.299445,-0.223231,-0.182306,0.186459,0.071061,-0.217896,-0.120852,-0.155474,...,-0.182751,-0.192406,0.123909,0.432580,-0.091262,-0.472429,0.045889,-0.023268,-0.115124,0.097869
4,-1,1,0.727203,0.130307,0.063987,0.338863,-0.307654,-0.159473,-0.139372,-0.236934,...,-0.239723,-0.138786,0.145431,0.192865,-0.426846,0.587483,-0.681206,-0.119991,-0.279150,0.444575
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1010768,2,0,0.330440,0.087486,0.452379,0.125483,-0.001305,0.114950,-0.024944,0.343929,...,-0.034152,-0.136933,0.278267,-0.067348,-0.148897,-0.327650,-0.221718,0.062261,0.218916,-0.063911
1010769,1,1,0.580698,0.530087,-0.197684,0.132761,-0.330144,-0.198488,0.495150,0.437816,...,0.229036,-0.513346,0.012858,-0.809313,0.115431,-0.371767,0.083185,0.224939,0.312081,-0.217166
1010770,1,1,0.286786,0.171989,0.317835,0.126824,0.100483,-0.077389,0.100831,0.458157,...,-0.191509,-0.304807,-0.153368,-0.395883,0.174888,-0.440270,-0.011585,-0.053263,0.237088,0.337024
1010771,1,1,0.084776,-0.436938,0.323241,0.131427,-0.002435,-0.150780,0.146820,0.198503,...,-0.342561,0.239435,0.207670,0.390321,0.184193,0.012560,-0.407828,-0.157697,-0.167322,-0.072448


In [13]:
# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

Let's train the SVM:

In [None]:
# Fit a support-vector classifier with scikit-learn's default settings.
# fit() returns the estimator itself, so construction and training can chain.
model = svm.SVC().fit(X_train, y_train)
model