# importing important libraries

In [7]:

from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.util import ngrams
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import contractions
import yake
from rake_nltk import Rake
from unidecode import unidecode
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import SilhouetteVisualizer

In [55]:
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import SilhouetteVisualizer

# Loading the dataset

In [8]:
import json
data = [json.loads(line) for 
        line in open("Sarcasm_Headlines_Dataset.json", 'r')]

In [10]:
new_df = pd.DataFrame.from_dict(data) 
data = new_df.drop(['article_link','is_sarcastic'],axis=1)

In [28]:
data.shape

(26709, 1)

In [29]:
data = data.sample(n=5000)

In [30]:
data.head()

Unnamed: 0,headline
14196,memphis airport panda express takes over as na...
8062,general mills releases new lucky charms with 1...
3158,ufc champion jon jones sentenced in hit-and-ru...
17053,israeli soldiers open fire on palestinians car...
10148,seven-year-old told to take it like a man


In [31]:
# preprocessing 
# 1. remove spaces,newlines
def remove_spaces(data):
    clean_text = data.replace('\\n',' ').replace("\t",' ').replace('\\',' ')
    return clean_text

# 2. contraction mapping
def expand_text(data):
    expanded_text = contractions.fix(data)
    return expanded_text

# 3.handling accented character
def handling_accented(data):
    fixed_text = unidecode(data)
    return fixed_text

# 4. Cleaning 
stopword_list = stopwords.words("english")
stopword_list.remove('no')
stopword_list.remove('nor')
stopword_list.remove('not')

def clean_data(data):
    tokens = word_tokenize(data)
    clean_text = [word.lower() for word in tokens if (word not in punctuation) and(word.lower() not in stopword_list) and(len(word)>2) and (word.isalpha())]
    return clean_text                   # and(word.lower() not in stopword_list) and(len(word)>2) and (word.isalpha())]

# 5.autocorrect 
def autocorrection(data):
    spell = Speller(lang='en')
    corrected_text = spell(data)
    return corrected_text

# 6. lemmatization
def lemmatization(data):
    lemmatizer = WordNetLemmatizer()
    final_data = []
    for word in data :
        lemmatized_word = lemmatizer.lemmatize(word)
        final_data.append(lemmatized_word)
    return " ".join(final_data)

In [32]:
clean_text_train = data.headline.apply(remove_spaces)

clean_text_train = clean_text_train.apply(expand_text)

clean_text_train = clean_text_train.apply(handling_accented)

clean_text_train = clean_text_train.apply(clean_data)

clean_text_train = clean_text_train.apply(lemmatization)

# Text Vectorization

### Count vectorizer

In [33]:

# Count Vectorizer
count_vect = CountVectorizer()
bow = count_vect.fit_transform(clean_text_train).A
pd.DataFrame(bow,columns=count_vect.get_feature_names_out())

Unnamed: 0,aaron,abandon,abandoned,abandoning,abbey,abbi,abby,abc,abdul,ability,...,ziyi,zodiac,zohan,zone,zoo,zookeeper,zoolander,zoologist,zsa,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TFIDF

In [34]:
#TFIDF
tfidf_vect = TfidfVectorizer()
tfidf = tfidf_vect.fit_transform(clean_text_train).A
pd.DataFrame(tfidf,columns=tfidf_vect.get_feature_names_out())

Unnamed: 0,aaron,abandon,abandoned,abandoning,abbey,abbi,abby,abc,abdul,ability,...,ziyi,zodiac,zohan,zone,zoo,zookeeper,zoolander,zoologist,zsa,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### word2vec

In [35]:
# word2vec
sent = clean_text_train.tolist()
splitted_sent = [sen.split() for sen in sent]
splitted_sent

[['memphis',
  'airport',
  'panda',
  'express',
  'take',
  'nation',
  'depressing',
  'place'],
 ['general',
  'mill',
  'release',
  'new',
  'lucky',
  'charm',
  'percent',
  'le',
  'leprechaun',
  'meat'],
 ['ufc',
  'champion',
  'jon',
  'jones',
  'sentenced',
  'case',
  'involving',
  'pregnant',
  'woman'],
 ['israeli',
  'soldier',
  'open',
  'fire',
  'palestinian',
  'carrying',
  'potentially',
  'dangerous',
  'injured',
  'friend'],
 ['told', 'take', 'like', 'man'],
 ['funniest', 'tweet', 'woman', 'week'],
 ['neglect', 'wife', 'child', 'result', 'promotion'],
 ['fall', 'look', 'like', 'sign'],
 ['man', 'gym', 'locker', 'room', 'put', 'shirt', 'underwear'],
 ['boyfriend', 'really', 'envision', 'losing', 'sense', 'self', 'one'],
 ['trump', 'dangerous', 'move', 'towards', 'protectionism'],
 ['america', 'demonizes', 'teacher'],
 ['iggy', 'azalea', 'land', 'small', 'movie', 'role'],
 ['chipotle',
  'hire',
  'former',
  'critic',
  'help',
  'improve',
  'chain',
  'fo

In [36]:
word_2vec_model = Word2Vec(splitted_sent,min_count=2,window=3)

In [37]:
word_2vec_model.save('word2vec.model')
word_2vec_model.vector_size

100

In [38]:
# document numerical format
def vectorizer(list_of_docs,model):
    feature = []
    for rew in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for word in rew:
            try:
                word in model.wv 
                vectors.append(model.wv[word])#model.wv['bad']
            except KeyError:
                continue
    if vectors:
        vectors = np.asarray(vectors)
        avg_vec = vectors.mean(axis=0)
        feature.append(avg_vec)
    else:
        feature.append(zero_vector)
    return feature

In [39]:
vectorized_docs = vectorizer(splitted_sent,word_2vec_model)
vectorized_docs

[array([-5.20958798e-03,  4.69800318e-03,  1.78761489e-03,  2.56987335e-03,
         2.49576918e-03, -7.75398314e-03,  2.81614158e-03,  1.04554035e-02,
        -3.19552462e-04, -5.99302864e-03, -2.75700376e-03, -1.03089623e-02,
        -5.22022019e-04, -3.23750719e-04,  6.46232918e-04, -3.58266011e-03,
         3.88979074e-03, -4.69944812e-03,  1.51205191e-03, -5.99317346e-03,
         2.25338526e-03,  3.64061329e-03,  1.70205650e-03,  2.47086887e-03,
         4.70642000e-04,  7.02669611e-04, -2.64352839e-03,  1.06256302e-04,
        -3.58591392e-03,  1.87619589e-03,  8.63751676e-03,  1.52079808e-03,
         1.52176086e-04, -3.65224062e-03, -1.63634180e-03,  4.90144547e-03,
        -8.58253625e-05, -4.48718527e-03, -5.84766828e-03, -1.11955544e-02,
         2.77514779e-03, -3.21214367e-03, -4.23816545e-03, -1.64298178e-03,
         6.19834242e-03,  9.86052211e-04, -8.29288363e-03,  2.82645319e-03,
         3.29622463e-03,  4.01743175e-03,  4.40073945e-03, -3.36910319e-03,
         4.0

In [40]:
x_emb = np.array(vectorized_docs)
x_emb

array([[-5.20958798e-03,  4.69800318e-03,  1.78761489e-03,
         2.56987335e-03,  2.49576918e-03, -7.75398314e-03,
         2.81614158e-03,  1.04554035e-02, -3.19552462e-04,
        -5.99302864e-03, -2.75700376e-03, -1.03089623e-02,
        -5.22022019e-04, -3.23750719e-04,  6.46232918e-04,
        -3.58266011e-03,  3.88979074e-03, -4.69944812e-03,
         1.51205191e-03, -5.99317346e-03,  2.25338526e-03,
         3.64061329e-03,  1.70205650e-03,  2.47086887e-03,
         4.70642000e-04,  7.02669611e-04, -2.64352839e-03,
         1.06256302e-04, -3.58591392e-03,  1.87619589e-03,
         8.63751676e-03,  1.52079808e-03,  1.52176086e-04,
        -3.65224062e-03, -1.63634180e-03,  4.90144547e-03,
        -8.58253625e-05, -4.48718527e-03, -5.84766828e-03,
        -1.11955544e-02,  2.77514779e-03, -3.21214367e-03,
        -4.23816545e-03, -1.64298178e-03,  6.19834242e-03,
         9.86052211e-04, -8.29288363e-03,  2.82645319e-03,
         3.29622463e-03,  4.01743175e-03,  4.40073945e-0

# Building the model

In [41]:
# build kmeans
def build_kmeans(clusters,data):
    kmeans_model = KMeans(n_clusters=clusters)
    y_pred = kmeans_model.fit_predict(data)
    return kmeans_model,y_pred

In [42]:
bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [43]:
# kmeans - count vectorizer
kmeans_model_count, count_pred = build_kmeans(3,bow)

In [44]:
count_pred

array([1, 1, 1, ..., 1, 1, 0])

In [45]:
# kmeans-Tfidf
kmeans_model_tfidf,tfidf_pred = build_kmeans(3,tfidf)

In [46]:
tfidf_pred

array([0, 1, 0, ..., 0, 0, 0])

In [50]:
print(f"Silhouette score with kmeans-count : {silhouette_score(bow,count_pred)}")

print(f"Silhouette score with kmeans-tfidf : {silhouette_score(tfidf,tfidf_pred)}")

#print(f"Silhouette score with kmeans-word2vec : {silhouette_score(x_emb,word2vec_pred)}")

Silhouette score with kmeans-count : 0.03118398637696268
Silhouette score with kmeans-tfidf : 0.0025101773475816914


In [52]:
def visulize_silhouette(data,model,title1):
    print(f"Silhouette Visualizer for {title1}")
    visualizer = SilhouetteVisualizer(model,colors='yellowbrick')
    visualizer.fit(data)
    visualizer.show()
