# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from farasa.stemmer import FarasaStemmer
import gensim
from nltk.tokenize import RegexpTokenizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import emojis

import helper

# Read and Clean 

In [None]:
# Load the ArSarcasm train and test splits (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/ArSarcasm_train.csv", "data/ArSarcasm_test.csv")
)

In [None]:
# Farasa stemmer shared by the stemming cell below.
# NOTE(review): interactive mode presumably keeps a backend process alive;
# confirm whether `stemmer.terminate()` should be called once stemming is done.
stemmer = FarasaStemmer(interactive=True)

In [None]:
# Apply the project's tweet cleaner to the raw text of both splits.
for split_df in (df_train, df_test):
    split_df['cleaned_tweet'] = split_df['tweet'].apply(helper.clean_tweet)

In [None]:
# Stem every cleaned tweet; the bound method is passed directly instead of
# wrapping it in a lambda.
df_train['cleaned_tweet'] = df_train['cleaned_tweet'].apply(stemmer.stem)
df_test['cleaned_tweet'] = df_test['cleaned_tweet'].apply(stemmer.stem)

# Remove stopwords

In [None]:
# Strip stop words from the stemmed text of both splits.
for split_df in (df_train, df_test):
    split_df["cleaned_tweet"] = split_df["cleaned_tweet"].apply(helper.remove_stopWords)

# Build our Vocab List

In [None]:
# Sorted list of the distinct whitespace-separated tokens in the training tweets.
# A set handles de-duplication in (amortised) constant time per token; the
# previous `word not in vocab` test scanned a growing list for every token,
# which is quadratic in the vocabulary size.
vocab = sorted({
    word
    for tw in df_train['cleaned_tweet']
    for word in tw.split()
})

In [None]:
# Report how many distinct tokens the training vocabulary contains.
print(len(vocab))

# Prepare Data to Train

In [None]:
# Keep only the label and the preprocessed text. Explicit copies are taken
# because these frames are modified in later cells (label re-encoding,
# duplicate removal); modifying a plain column selection of df_train/df_test
# triggers pandas' SettingWithCopyWarning and has ambiguous write semantics.
train_data = df_train[['sarcasm', 'cleaned_tweet']].copy()
test_data = df_test[['sarcasm', 'cleaned_tweet']].copy()

In [None]:
# Encode the sarcasm flag as an integer: 1 where the value equals True, 0 otherwise.
train_data['sarcasm'] = train_data['sarcasm'].map(lambda flag: int(flag == True))
test_data['sarcasm'] = test_data['sarcasm'].map(lambda flag: int(flag == True))

In [None]:
# Remove repeated (label, text) rows. The result is reassigned rather than
# dropped in place, so a frame derived from df_train/df_test is never mutated.
# drop_duplicates keeps the original index labels, so the surviving rows can
# still be traced back to df_train/df_test.
train_data = train_data.drop_duplicates()
test_data = test_data.drop_duplicates()

# Text features stay as Series; labels are converted to plain Python lists.
X_train = train_data['cleaned_tweet']
X_test = test_data['cleaned_tweet']
y_train = train_data['sarcasm'].tolist()
y_test = test_data['sarcasm'].tolist()

# Sanity check: features and labels of each split must have the same length.
print('Size of train data: {}'.format(len(X_train)))
print('Size of test data: {}'.format(len(X_test)))
print('Size of train labels: {}'.format(len(y_train)))
print('Size of test labels: {}'.format(len(y_test)))

## Embedding Visualization

### Bag of Words (BoW)

In [None]:
# Build bag-of-words features from the training text, then reuse the returned
# vectorizer to transform the test text.
X_train_bow_emb, count_vectorizer = helper.bow_emb(X_train)
X_test_bow_emb = count_vectorizer.transform(X_test)

# Visualise the training features with the helper's LSA plot.
fig = plt.figure(figsize=(16, 16))
helper.plot_LSA(X_train_bow_emb, y_train)
plt.show()

### TF-IDF

In [None]:
# Build TF-IDF features from the training text, then reuse the returned
# vectorizer to transform the test text.
X_train_tfidf_emb, tfidf_vectorizer = helper.tfidf(X_train)
X_test_tfidf_emb = tfidf_vectorizer.transform(X_test)

# Visualise the training features with the helper's LSA plot.
fig = plt.figure(figsize=(16, 16))
helper.plot_LSA(X_train_tfidf_emb, y_train)
plt.show()

### Pre-trained Word2Vec Model

In [None]:
# Load the pre-trained word vectors from a binary word2vec file.
word2vec_path = "sg_100.bin"
model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path,
    binary=True,
    unicode_errors='ignore',
)

# Tokenise the cleaned tweets into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
X_train_tokens = train_data['cleaned_tweet'].apply(tokenizer.tokenize)
X_test_tokens = test_data['cleaned_tweet'].apply(tokenizer.tokenize)

# One embedding per tweet, computed by the project helper.
train_embeddings = helper.get_word2vec_embeddings(model, X_train_tokens)
test_embeddings = helper.get_word2vec_embeddings(model, X_test_tokens)

# Visualise the training embeddings with the helper's LSA plot.
fig = plt.figure(figsize=(16, 16))
helper.plot_LSA(train_embeddings, y_train)
plt.show()

## Train Models

### Logistic Regression with BOW

In [None]:
# Logistic regression on the bag-of-words features with 'balanced' class weights.
lr_bow = LogisticRegression(solver='newton-cg', class_weight='balanced')

# fit() returns the estimator itself, so training and the train-set prediction chain.
pred_train = lr_bow.fit(X_train_bow_emb, y_train).predict(X_train_bow_emb)
helper.print_train_scores(y_train, pred_train)

print('*' * 50)

# Evaluate on the test split: scores first, then the confusion matrix.
pred_test = lr_bow.predict(X_test_bow_emb)
helper.print_test_scores(y_test, pred_test)
helper.print_confusion_matrix(y_test, pred_test)

### Handle the Imbalance Manually

In [None]:
lr_bow = LogisticRegression(solver='newton-cg')

# Candidate weights for class 0; class 1 receives the complement (1 - w).
weights = np.linspace(0.0, 0.99, 200)
param_grid = {'class_weight': [{0: w, 1: 1.0 - w} for w in weights]}

# Cross-validated search over the class weights (default StratifiedKFold), scored on F1.
gridsearch = GridSearchCV(
    estimator=lr_bow,
    param_grid=param_grid,
    cv=StratifiedKFold(),
    n_jobs=-1,
    scoring='f1',
    verbose=2,
)
gridsearch.fit(X_train_bow_emb, y_train)

# Plot the score obtained for each candidate weight.
helper.plot_score_for_weight(gridsearch, weights)

In [None]:
# Refit logistic regression on BoW with the class weights chosen by the grid search.
best_class_weight = gridsearch.best_params_['class_weight']
lr_bow = LogisticRegression(solver='newton-cg', class_weight=best_class_weight)

pred_train = lr_bow.fit(X_train_bow_emb, y_train).predict(X_train_bow_emb)
helper.print_train_scores(y_train, pred_train)

print('*' * 50)

# Evaluate on the test split: scores first, then the confusion matrix.
pred_test = lr_bow.predict(X_test_bow_emb)
helper.print_test_scores(y_test, pred_test)
helper.print_confusion_matrix(y_test, pred_test)

In [None]:
# Ask the helper for the 10 most important features of the BoW model.
importance = helper.get_most_important_features(count_vectorizer, lr_bow, 10)

# Entries are indexed as [0] = score, [1] = word; split them into parallel lists.
top_entries = importance[0]['tops']
bottom_entries = importance[0]['bottom']
top_words = [entry[1] for entry in top_entries]
top_scores = [entry[0] for entry in top_entries]
bottom_words = [entry[1] for entry in bottom_entries]
bottom_scores = [entry[0] for entry in bottom_entries]

# NOTE(review): the title says "relevance" although the label modelled in this
# notebook is 'sarcasm' — confirm the intended wording.
helper.plot_important_words(
    top_scores,
    top_words,
    bottom_scores,
    bottom_words,
    "Most important words for relevance",
)

### Logistic Regression with TF-IDF

In [None]:
# Logistic regression on the TF-IDF features with 'balanced' class weights.
lr_tfidf = LogisticRegression(solver='newton-cg', class_weight='balanced')

pred_train = lr_tfidf.fit(X_train_tfidf_emb, y_train).predict(X_train_tfidf_emb)
helper.print_train_scores(y_train, pred_train)

print('*' * 50)

# Evaluate on the test split: scores first, then the confusion matrix.
pred_test = lr_tfidf.predict(X_test_tfidf_emb)
helper.print_test_scores(y_test, pred_test)
helper.print_confusion_matrix(y_test, pred_test)

### Handle the Imbalance Manually

In [None]:
lr_tfidf = LogisticRegression(solver='newton-cg')

# Candidate weights for class 0; class 1 receives the complement (1 - w).
weights = np.linspace(0.0, 0.99, 200)
param_grid = {'class_weight': [{0: w, 1: 1.0 - w} for w in weights]}

# Cross-validated search over the class weights (default StratifiedKFold), scored on F1.
gridsearch = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=param_grid,
    cv=StratifiedKFold(),
    n_jobs=-1,
    scoring='f1',
    verbose=2,
)
gridsearch.fit(X_train_tfidf_emb, y_train)

# Plot the score obtained for each candidate weight.
helper.plot_score_for_weight(gridsearch, weights)

In [None]:
# Refit logistic regression on TF-IDF with the class weights chosen by the grid search.
best_class_weight = gridsearch.best_params_['class_weight']
lr_tfidf = LogisticRegression(solver='newton-cg', class_weight=best_class_weight)

pred_train = lr_tfidf.fit(X_train_tfidf_emb, y_train).predict(X_train_tfidf_emb)
helper.print_train_scores(y_train, pred_train)
print('*' * 50)

# Evaluate on the test split: scores first, then the confusion matrix.
pred_test = lr_tfidf.predict(X_test_tfidf_emb)
helper.print_test_scores(y_test, pred_test)
helper.print_confusion_matrix(y_test, pred_test)

In [None]:
# Ask the helper for the 10 most important features of the TF-IDF model.
importance = helper.get_most_important_features(tfidf_vectorizer, lr_tfidf, 10)

# Entries are indexed as [0] = score, [1] = word; split them into parallel lists.
top_entries = importance[0]['tops']
bottom_entries = importance[0]['bottom']
top_words = [entry[1] for entry in top_entries]
top_scores = [entry[0] for entry in top_entries]
bottom_words = [entry[1] for entry in bottom_entries]
bottom_scores = [entry[0] for entry in bottom_entries]

# NOTE(review): the title says "relevance" although the label modelled in this
# notebook is 'sarcasm' — confirm the intended wording.
helper.plot_important_words(
    top_scores,
    top_words,
    bottom_scores,
    bottom_words,
    "Most important words for relevance",
)

### Logistic Regression with W2V Pre-trained Embeddings

In [None]:
# Logistic regression on the word2vec tweet embeddings with 'balanced' class weights.
lr_w2v = LogisticRegression(solver='newton-cg', class_weight='balanced')

pred_train = lr_w2v.fit(train_embeddings, y_train).predict(train_embeddings)
helper.print_train_scores(y_train, pred_train)
print('*' * 50)

# Evaluate on the test split: scores first, then the confusion matrix.
pred_test = lr_w2v.predict(test_embeddings)
helper.print_test_scores(y_test, pred_test)
helper.print_confusion_matrix(y_test, pred_test)

### Handle the Imbalance Manually

In [None]:
lr_w2v = LogisticRegression(solver='newton-cg')

# Candidate weights for class 0; class 1 receives the complement (1 - w).
weights = np.linspace(0.0, 0.99, 200)
param_grid = {'class_weight': [{0: w, 1: 1.0 - w} for w in weights]}

# Cross-validated search over the class weights (default StratifiedKFold), scored on F1.
gridsearch = GridSearchCV(
    estimator=lr_w2v,
    param_grid=param_grid,
    cv=StratifiedKFold(),
    n_jobs=-1,
    scoring='f1',
    verbose=2,
)
gridsearch.fit(train_embeddings, y_train)

# Plot the score obtained for each candidate weight.
helper.plot_score_for_weight(gridsearch, weights)

In [None]:
# Refit logistic regression on word2vec embeddings with the grid-searched class weights.
best_class_weight = gridsearch.best_params_['class_weight']
lr_w2v = LogisticRegression(solver='newton-cg', class_weight=best_class_weight)

pred_train = lr_w2v.fit(train_embeddings, y_train).predict(train_embeddings)
helper.print_train_scores(y_train, pred_train)
print('*' * 50)

# Evaluate on the test split: scores first, then the confusion matrix.
pred_test = lr_w2v.predict(test_embeddings)
helper.print_test_scores(y_test, pred_test)
helper.print_confusion_matrix(y_test, pred_test)

### SVM with BOW

In [None]:
# SVC with default settings and 'balanced' class weights on the bag-of-words features.
svm = SVC(class_weight='balanced')

pred_train = svm.fit(X_train_bow_emb, y_train).predict(X_train_bow_emb)
helper.print_train_scores(y_train, pred_train)
print('*' * 50)

# Evaluate on the test split: scores first, then the confusion matrix.
pred_test = svm.predict(X_test_bow_emb)
helper.print_test_scores(y_test, pred_test)
helper.print_confusion_matrix(y_test, pred_test)

### Handle the Imbalance Manually

In [None]:
svm = SVC()

# Candidate weights for class 0; class 1 receives the complement (1 - w).
weights = np.linspace(0.0, 0.99, 200)
param_grid = {'class_weight': [{0: w, 1: 1.0 - w} for w in weights]}

# 5-fold stratified cross-validated search over the class weights, scored on F1.
gridsearch = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
    scoring='f1',
    verbose=2,
)
gridsearch.fit(X_train_bow_emb, y_train)

# Plot the score obtained for each candidate weight.
helper.plot_score_for_weight(gridsearch, weights)

In [None]:
# Refit the SVC on BoW with the class weights chosen by the grid search.
best_class_weight = gridsearch.best_params_['class_weight']
svm = SVC(class_weight=best_class_weight)

pred_train = svm.fit(X_train_bow_emb, y_train).predict(X_train_bow_emb)
helper.print_train_scores(y_train, pred_train)
print('*' * 50)

# Evaluate on the test split: scores first, then the confusion matrix.
pred_test = svm.predict(X_test_bow_emb)
helper.print_test_scores(y_test, pred_test)
helper.print_confusion_matrix(y_test, pred_test)

### SVM with TF-IDF

In [None]:
# SVC with default settings and 'balanced' class weights on the TF-IDF features.
svm = SVC(class_weight='balanced')

pred_train = svm.fit(X_train_tfidf_emb, y_train).predict(X_train_tfidf_emb)
helper.print_train_scores(y_train, pred_train)
print('*' * 50)

# Evaluate on the test split: scores first, then the confusion matrix.
pred_test = svm.predict(X_test_tfidf_emb)
helper.print_test_scores(y_test, pred_test)
helper.print_confusion_matrix(y_test, pred_test)

### Handle the Imbalance Manually

In [None]:
svm = SVC()

# Candidate weights for class 0; class 1 receives the complement (1 - w).
weights = np.linspace(0.0, 0.99, 200)
param_grid = {'class_weight': [{0: w, 1: 1.0 - w} for w in weights]}

# Cross-validated search over the class weights (default StratifiedKFold), scored on F1.
gridsearch = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    cv=StratifiedKFold(),
    n_jobs=-1,
    scoring='f1',
    verbose=2,
)
gridsearch.fit(X_train_tfidf_emb, y_train)

# Plot the score obtained for each candidate weight.
helper.plot_score_for_weight(gridsearch, weights)

In [None]:
# Refit the SVC on TF-IDF with the class weights chosen by the grid search.
best_class_weight = gridsearch.best_params_['class_weight']
svm = SVC(class_weight=best_class_weight)

pred_train = svm.fit(X_train_tfidf_emb, y_train).predict(X_train_tfidf_emb)
helper.print_train_scores(y_train, pred_train)
print('*' * 50)

# Evaluate on the test split: scores first, then the confusion matrix.
pred_test = svm.predict(X_test_tfidf_emb)
helper.print_test_scores(y_test, pred_test)
helper.print_confusion_matrix(y_test, pred_test)

### SVM with W2V Pre-trained Embeddings

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics (per-feature mean and variance) from the
# training embeddings only, then apply those same statistics to the test
# embeddings. Calling fit_transform on the test split would re-fit the scaler
# on test data, so the two splits would be standardised differently and the
# test set would influence preprocessing.
scalar = StandardScaler()
train_scaled = scalar.fit_transform(train_embeddings)
test_scaled = scalar.transform(test_embeddings)

In [None]:
# SVC with default settings and 'balanced' class weights on the standardised
# word2vec embeddings.
svm = SVC(class_weight='balanced')

pred_train = svm.fit(train_scaled, y_train).predict(train_scaled)
helper.print_train_scores(y_train, pred_train)
print('*' * 50)

# Evaluate on the test split: scores first, then the confusion matrix.
pred_test = svm.predict(test_scaled)
helper.print_test_scores(y_test, pred_test)
helper.print_confusion_matrix(y_test, pred_test)

### Handle the Imbalance Manually

In [None]:
svm = SVC()

# Candidate weights for class 0; class 1 receives the complement (1 - x).
weights = np.linspace(0.0, 0.99, 200)

# Grid of class-weight dictionaries to search over.
param_grid = {'class_weight': [{0: x, 1: 1.0 - x} for x in weights]}

# Cross-validated search (default StratifiedKFold), scored on F1.
# The search runs on the standardised embeddings (train_scaled) because the
# final SVC in the next cell is trained on train_scaled; tuning on the unscaled
# train_embeddings would select weights for a different feature space.
gridsearch = GridSearchCV(estimator=svm,
                          param_grid=param_grid,
                          cv=StratifiedKFold(),
                          n_jobs=-1,
                          scoring='f1',
                          verbose=2).fit(train_scaled, y_train)

# Plot the score obtained for each candidate weight.
helper.plot_score_for_weight(gridsearch, weights)

In [None]:
# Refit the SVC on the standardised word2vec embeddings with the class weights
# chosen by the grid search.
best_class_weight = gridsearch.best_params_['class_weight']
svm = SVC(class_weight=best_class_weight)

pred_train = svm.fit(train_scaled, y_train).predict(train_scaled)
helper.print_train_scores(y_train, pred_train)
print('*' * 50)

# Evaluate on the test split: scores first, then the confusion matrix.
pred_test = svm.predict(test_scaled)
helper.print_test_scores(y_test, pred_test)
helper.print_confusion_matrix(y_test, pred_test)

## Emojis Embedding

In [None]:
def extract_emojis(tweet):
    """Return, as a list, whatever `emojis.get` finds in `tweet`."""
    found = emojis.get(tweet)
    return list(found)

In [None]:
# Extract emojis from the raw text of exactly the rows kept in
# train_data/test_data. Those frames had duplicate rows dropped after being
# derived from df_train/df_test (index labels preserved), and the word
# embeddings and labels are built from them. Iterating over the full df_*
# frames would produce extra rows, so the later zip() with the word embeddings
# would pair tweets with other tweets' emojis.
emo_train = df_train.loc[train_data.index, 'tweet'].apply(extract_emojis)
emo_test = df_test.loc[test_data.index, 'tweet'].apply(extract_emojis)

In [None]:
# Load the pre-trained emoji vectors from a binary word2vec-format file.
# NOTE(review): this rebinds `model`, replacing the word-vector model loaded
# earlier in the notebook; re-running earlier cells afterwards would use the
# emoji vectors instead.
emoji2vec_path = "emoji2vec.bin"
model = gensim.models.KeyedVectors.load_word2vec_format(
    emoji2vec_path,
    binary=True,
    unicode_errors='ignore',
)

# One emoji embedding per tweet, computed by the same helper used for words.
train_emo_embeddings = helper.get_word2vec_embeddings(model, emo_train)
test_emo_embeddings = helper.get_word2vec_embeddings(model, emo_test)

In [None]:
# Concatenation: [word vector, emoji vector] per training tweet.
x_train_features_concat = [
    np.concatenate((t, e), axis=0)
    for t, e in zip(train_scaled, train_emo_embeddings)
]

# Element-wise sum: word vector + emoji vector per training tweet.
# NOTE(review): the sum assumes both embeddings have the same dimensionality — confirm.
x_train_features_sum = [
    t + e
    for t, e in zip(train_scaled, train_emo_embeddings)
]

In [None]:
# Concatenation: [word vector, emoji vector] per test tweet.
x_test_features_concat = [
    np.concatenate((t, e), axis=0)
    for t, e in zip(test_scaled, test_emo_embeddings)
]

# Element-wise sum: word vector + emoji vector per test tweet.
# NOTE(review): the sum assumes both embeddings have the same dimensionality — confirm.
x_test_features_sum = [
    t + e
    for t, e in zip(test_scaled, test_emo_embeddings)
]

In [None]:
# SVC with 'balanced' class weights on the summed word + emoji features.
svm = SVC(class_weight='balanced')

pred_train = svm.fit(x_train_features_sum, y_train).predict(x_train_features_sum)
helper.print_train_scores(y_train, pred_train)
print('*' * 50)

# Evaluate on the test split: scores first, then the confusion matrix.
pred_test = svm.predict(x_test_features_sum)
helper.print_test_scores(y_test, pred_test)
helper.print_confusion_matrix(y_test, pred_test)

In [None]:
# SVC with 'balanced' class weights on the concatenated word + emoji features.
svm = SVC(class_weight='balanced')

pred_train = svm.fit(x_train_features_concat, y_train).predict(x_train_features_concat)
helper.print_train_scores(y_train, pred_train)
print('*' * 50)

# Evaluate on the test split: scores first, then the confusion matrix.
pred_test = svm.predict(x_test_features_concat)
helper.print_test_scores(y_test, pred_test)
helper.print_confusion_matrix(y_test, pred_test)