In [None]:
# Load the first 200,000 rows of the sarcasm CSV and keep only rows that have comment text.
import pandas as pd
import re

sar_acc = pd.read_csv('train-balanced-sarcasm.csv', nrows=200000)
sar_acc = sar_acc.dropna(subset=['comment'])
sar_acc.head()

In [None]:
# Two size features per comment: character count and whitespace-delimited word count.
comment_text = sar_acc['comment']
sar_acc['length'] = comment_text.str.len()
sar_acc['num_words'] = comment_text.map(lambda text: len(str(text).split()))

# Discard the metadata columns; only the comment, its label and the new features remain.
unused_columns = ['author', 'subreddit', 'score', 'ups', 'downs', 'date', 'created_utc', 'parent_comment']
sar_acc = sar_acc.drop(labels=unused_columns, axis=1)
sar_acc.head()


In [None]:
# Print the column dtypes and non-null counts of sar_acc.
sar_acc.info()

In [None]:
# Count how many comments carry each label value (class balance check).
sar_acc['label'].value_counts()


In [None]:
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Percentage share of each class. value_counts() orders by frequency, so the counts
# are re-sorted by label value to keep every slice aligned with its display name.
sar_acc_tar = sar_acc['label'].value_counts().sort_index()
# This notebook treats label 1 as sarcastic and label 0 as acclaim.
label_names = {0: 'Acclaim', 1: 'Sarcastic'}
labels = [label_names.get(value, str(value)) for value in sar_acc_tar.index]
sizes = (np.array((sar_acc_tar / sar_acc_tar.sum())*100))
# Valid CSS colour names; the hyphenated 'light-blue' / 'light-red' are not recognised.
colors = ['lightblue', 'lightcoral']

trace = go.Pie(labels=labels, values=sizes, opacity = 0.8, hoverinfo='label+percent',
               marker=dict(colors=colors, line=dict(color='#FFFFFF', width=2)))
layout = go.Layout(
    title='Sarcastic Vs Acclaim'
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Sa_Ac")

In [None]:
import itertools

# Tokenise every comment on single spaces, grouped by class (label 1 -> sar_*, label 0 -> acc_*).
# Iterating the filtered Series directly avoids per-row positional lookups and the
# in-place reset_index on a filtered slice.
sar_comments = [text.split(" ") for text in sar_acc.loc[sar_acc.label == 1, 'comment']]
acc_comments = [text.split(" ") for text in sar_acc.loc[sar_acc.label == 0, 'comment']]

# Flatten the per-comment token lists into one flat token list per class.
sar_list = list(itertools.chain.from_iterable(sar_comments))
acc_list = list(itertools.chain.from_iterable(acc_comments))

In [None]:
import nltk
# Stored as a set so each of the many membership tests below is O(1) rather than a
# scan over the whole stop-word list; `in` / `not in` checks elsewhere keep working.
stopwords = set(nltk.corpus.stopwords.words('english'))
sar_list_restp = [word for word in sar_list if word.lower() not in stopwords]
acc_list_restp = [word for word in acc_list if word.lower() not in stopwords]
print("Length of original Sarcasm list: {0} words\n"
      "Length of Sarcasm list after stopwords removal: {1} words"
      .format(len(sar_list), len(sar_list_restp)))

print("=="*40)

print("Length of original Acclaim list: {0} words\n"
      "Length of Acclaim list after stopwords removal: {1} words"
      .format(len(acc_list), len(acc_list_restp)))

In [None]:
# Frequency of every whitespace-delimited token across all comments.
all_words = sar_acc['comment'].str.split(expand=True).unstack().value_counts()
# Entries ranked 3rd to 50th; the two most frequent tokens are skipped.
top_words = all_words.iloc[2:50]
data = [go.Bar(x=top_words.index.values,
               y=top_words.values,
               marker = dict(colorscale='Viridis',
                             # One colour value per bar: this was sliced [2:100],
                             # which did not match the 48 bars in x / y.
                             color = top_words.values),
               text = 'Word counts')]
layout = go.Layout(title = 'Frequent occuring words in the comments')
fig = go.Figure(data = data, layout=layout)
iplot(fig, filename='basic-bar')

In [None]:
from collections import Counter

# Token frequencies per class, computed on the stop-word-filtered token lists.
sar_cntr = Counter(sar_list_restp)
acc_cntr = Counter(acc_list_restp)

# One (word, frequency) row per distinct token, most frequent first.
freq_columns = ['Words', 'Freq']
sar_cntr_df = pd.DataFrame(list(sar_cntr.items()), columns=freq_columns).sort_values(
    by=['Freq'], ascending=False)
acc_cntr_df = pd.DataFrame(list(acc_cntr.items()), columns=freq_columns).sort_values(
    by=['Freq'], ascending=False)

# The 50 most frequent tokens of each class.
sar_cntr_df_50 = sar_cntr_df.iloc[:50]
acc_cntr_df_50 = acc_cntr_df.iloc[:50]

In [None]:
#Plotting the top 50 Sarcasm Vs Acclaim
from plotly import tools


def _horizontal_freq_bar(freq_df, trace_name, fill_rgba, edge_rgba):
    """Return a horizontal bar trace of the 'Freq' / 'Words' columns of ``freq_df``."""
    outline = dict(color=edge_rgba, width=.3)
    return go.Bar(
        x=freq_df['Freq'],
        y=freq_df['Words'],
        name=trace_name,
        marker=dict(color=fill_rgba, line=outline),
        orientation='h',
        opacity=0.6
    )


sar_tr = _horizontal_freq_bar(sar_cntr_df_50, 'Sarcasm',
                              'rgba(155, 89, 182, 0.6)', 'rgba(155, 89, 182, 1.0)')
acc_tr = _horizontal_freq_bar(acc_cntr_df_50, 'Acclaim',
                              'rgba(88, 214, 141, 0.6)', 'rgba(88, 214, 141, 1.0)')

# Two stacked panels: sarcastic words on top, acclaim words underneath.
panel_titles = ('Top 50 Most occuring words in Sarcastic Comments',
                'Top 50 Most occuring words in Acclaim Comments')
fig = tools.make_subplots(rows=2, cols=1, subplot_titles=panel_titles)

for panel_row, bar_trace in enumerate((sar_tr, acc_tr), start=1):
    fig.append_trace(bar_trace, panel_row, 1)

fig['layout'].update(height=1200, width=800)

iplot(fig, filename='sar_vs_acc')

In [None]:
from nltk.stem import WordNetLemmatizer
lemm = WordNetLemmatizer()
# Set copy of the stop words for O(1) membership tests.
stop_set = set(stopwords)

# For every comment: drop stop words, then lemmatise the remaining tokens.
# The comprehensions use only local names, so the corpus-level lists
# sar_list_restp / acc_list_restp built earlier are no longer overwritten
# with the tokens of the last comment.
sar_wost_lem = [
    [lemm.lemmatize(word) for word in batch if word.lower() not in stop_set]
    for batch in sar_comments
]
acc_wost_lem = [
    [lemm.lemmatize(word) for word in batch if word.lower() not in stop_set]
    for batch in acc_comments
]


In [None]:
from matplotlib import pyplot as plt
%matplotlib inline
from wordcloud import WordCloud

# Word cloud of all lemmatised, stop-word-free tokens from the sarcastic comments.
sar_list_wd = list(itertools.chain.from_iterable(sar_wost_lem))
sar_text = " ".join(sar_list_wd)
sar_cloud = WordCloud(background_color='black', width=5000, height=2000).generate(sar_text)
plt.imshow(sar_cloud)
plt.axis('off')
plt.show()

In [None]:
# Word cloud of all lemmatised, stop-word-free tokens from the acclaim comments.
acc_list_wd = list(itertools.chain.from_iterable(acc_wost_lem))
acc_text = " ".join(acc_list_wd)
acc_cloud = WordCloud(background_color='black', width=2000, height=1000).generate(acc_text)
plt.imshow(acc_cloud)
plt.axis('off')
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Comment text as lower-cased strings. Series.str.lower() returns a new Series,
# so its result has to be assigned (it was previously computed and discarded).
X = sar_acc.comment.astype('str').str.lower()
Y = sar_acc.label
# Encode the labels as consecutive integers and shape them as a column vector.
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1,1)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The seed is fixed so a Restart & Run All
# reproduces the same partition; 17 matches the seed used for the logistic-regression split.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=17)

In [None]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

# max_words caps the tokenizer vocabulary; max_len is the length every sequence is padded to.
max_len = 300
max_words = 1000

# Fit the vocabulary on the training texts only.
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)

# Encode the training texts as integer sequences and pad them into one matrix.
sequences = tok.texts_to_sequences(X_train)
sequence_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

In [None]:
from keras.models import Model
from keras.optimizers import RMSprop
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
def RNN():
    """Build (without compiling) the LSTM classifier.

    Layer stack: Embedding(max_words, 50) -> LSTM(64) -> Dense(256) -> ReLU ->
    Dropout(0.5) -> Dense(1) -> sigmoid. Reads the notebook-level ``max_len``
    and ``max_words``.

    Returns:
        A Keras ``Model`` mapping the 'inputs' tensor to the sigmoid output.
    """
    inputs = Input(name='inputs', shape=[max_len])
    embedded = Embedding(max_words, 50, input_length=max_len)(inputs)
    encoded = LSTM(64)(embedded)
    hidden = Dense(256, name='FC1')(encoded)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    score = Dense(1, name='out_layer')(hidden)
    probability = Activation('sigmoid')(score)
    return Model(inputs=inputs, outputs=probability)

In [None]:
# Model 1: RNN() trained with binary cross-entropy and RMSprop, tracking accuracy.
model1 = RNN()
model1.summary()
model1.compile(
    optimizer=RMSprop(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
from keras.callbacks import EarlyStopping

# Monitor val_loss and count only decreases larger than 0.0001 as improvement.
early_stop = EarlyStopping(monitor='val_loss', min_delta=0.0001)

# Train for up to 5 epochs, holding back 10% of the training matrix for validation.
history = model1.fit(
    sequence_matrix,
    Y_train,
    batch_size=200,
    epochs=5,
    validation_split=0.1,
    callbacks=[early_stop],
)

In [None]:
# Accuracy and loss curves for the fit currently stored in `history`.
# Keras reports the accuracy metric as 'acc' in some releases and 'accuracy' in others,
# so use whichever key this History object actually contains.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
train_acc = history.history[acc_key]
val_acc = history.history['val_' + acc_key]
train_loss = history.history['loss']
val_loss = history.history['val_loss']

fig, (ax1, ax2) = plt.subplots(1,2,figsize=(15,5))
fig.suptitle("Performance of RNN Model with RELU activation, Binary crossentropy and RMSprop")
ax1.plot(train_acc)
ax1.plot(val_acc)
# Dashed line at the epoch with the highest validation accuracy (first one on ties).
vline_cut = np.argmax(val_acc)
ax1.axvline(x=vline_cut, color='k', linestyle='--')
ax1.set_title("Model Accuracy")
# The second curve comes from validation_split during fit, not from the held-out test set.
ax1.legend(['train', 'validation'])

ax2.plot(train_loss)
ax2.plot(val_loss)
# Dashed line at the epoch with the lowest validation loss (first one on ties).
vline_cut = np.argmin(val_loss)
ax2.axvline(x=vline_cut, color='k', linestyle='--')
ax2.set_title("Model Loss")
ax2.legend(['train', 'validation'])
plt.show()

In [None]:
# Encode the test texts with the tokenizer fitted on the training split, then pad to max_len.
test_sequences = tok.texts_to_sequences(texts=X_test)
test_sequences_matrix = sequence.pad_sequences(sequences=test_sequences, maxlen=max_len)

In [None]:
# Score model1 on the padded test matrix; the first two entries of the result are printed
# as loss and accuracy.
accr = model1.evaluate(test_sequences_matrix, Y_test)
test_loss, test_accuracy = accr[0], accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_accuracy))

In [None]:
# Model 2: same RNN() architecture, trained with mean squared error and Adam.
model2 = RNN()
model2.summary()
model2.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['accuracy'],
)

In [None]:
# Monitor val_loss and count only decreases larger than 0.0001 as improvement.
early_stop = EarlyStopping(monitor='val_loss', min_delta=0.0001)

# Train model2 for up to 5 epochs with 10% of the training matrix held back for validation.
history = model2.fit(
    sequence_matrix,
    Y_train,
    batch_size=200,
    epochs=5,
    validation_split=0.1,
    callbacks=[early_stop],
)

In [None]:
# Accuracy and loss curves for the fit currently stored in `history`.
# Keras reports the accuracy metric as 'acc' in some releases and 'accuracy' in others,
# so use whichever key this History object actually contains.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
train_acc = history.history[acc_key]
val_acc = history.history['val_' + acc_key]
train_loss = history.history['loss']
val_loss = history.history['val_loss']

fig, (ax1, ax2) = plt.subplots(1,2,figsize=(15,5))
fig.suptitle("Performance of RNN Model with RELU activation, Adam optimization and Mean Squared Error")
ax1.plot(train_acc)
ax1.plot(val_acc)
# Dashed line at the epoch with the highest validation accuracy (first one on ties).
vline_cut = np.argmax(val_acc)
ax1.axvline(x=vline_cut, color='k', linestyle='--')
ax1.set_title("Model Accuracy")
# The second curve comes from validation_split during fit, not from the held-out test set.
ax1.legend(['train', 'validation'])

ax2.plot(train_loss)
ax2.plot(val_loss)
# Dashed line at the epoch with the lowest validation loss (first one on ties).
vline_cut = np.argmin(val_loss)
ax2.axvline(x=vline_cut, color='k', linestyle='--')
ax2.set_title("Model Loss")
ax2.legend(['train', 'validation'])
plt.show()

In [None]:
# Encode the test texts with the tokenizer fitted on the training split, then pad to max_len.
test_sequences = tok.texts_to_sequences(texts=X_test)
test_sequences_matrix = sequence.pad_sequences(sequences=test_sequences, maxlen=max_len)

In [None]:
# Score model2 on the padded test matrix; the first two entries of the result are printed
# as loss and accuracy.
accr = model2.evaluate(test_sequences_matrix, Y_test)
test_loss, test_accuracy = accr[0], accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_accuracy))

In [None]:
def RNNwithSelu():
    """Build (without compiling) the LSTM classifier with a SELU hidden activation.

    Layer stack: Embedding(max_words, 50) -> LSTM(64) -> Dense(256) -> SELU ->
    Dropout(0.5) -> Dense(1) -> sigmoid. Reads the notebook-level ``max_len``
    and ``max_words``.

    FC1 is followed by exactly one SELU activation and one Dropout. The earlier
    version applied the activation/dropout pair twice in a row with no layer in
    between, which re-applied SELU to its own output and stacked two 0.5 dropouts.

    Returns:
        A Keras ``Model`` mapping the 'inputs' tensor to the sigmoid output.
    """
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('selu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [None]:
# Model 3: the SELU variant, trained with binary cross-entropy and RMSprop.
model3 = RNNwithSelu()
model3.summary()
model3.compile(
    optimizer=RMSprop(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Monitor val_loss and count only decreases larger than 0.0001 as improvement.
early_stop = EarlyStopping(monitor='val_loss', min_delta=0.0001)

# Train model3 for up to 5 epochs with 10% of the training matrix held back for validation.
history = model3.fit(
    sequence_matrix,
    Y_train,
    batch_size=200,
    epochs=5,
    validation_split=0.1,
    callbacks=[early_stop],
)

In [None]:
# Accuracy and loss curves for the fit currently stored in `history`.
# Keras reports the accuracy metric as 'acc' in some releases and 'accuracy' in others,
# so use whichever key this History object actually contains.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
train_acc = history.history[acc_key]
val_acc = history.history['val_' + acc_key]
train_loss = history.history['loss']
val_loss = history.history['val_loss']

fig, (ax1, ax2) = plt.subplots(1,2,figsize=(15,5))
fig.suptitle("Performance of RNN Model with Selu activation, Binary crossentropy and RMSprop")
ax1.plot(train_acc)
ax1.plot(val_acc)
# Dashed line at the epoch with the highest validation accuracy (first one on ties).
vline_cut = np.argmax(val_acc)
ax1.axvline(x=vline_cut, color='k', linestyle='--')
ax1.set_title("Model Accuracy")
# The second curve comes from validation_split during fit, not from the held-out test set.
ax1.legend(['train', 'validation'])

ax2.plot(train_loss)
ax2.plot(val_loss)
# Dashed line at the epoch with the lowest validation loss (first one on ties).
vline_cut = np.argmin(val_loss)
ax2.axvline(x=vline_cut, color='k', linestyle='--')
ax2.set_title("Model Loss")
ax2.legend(['train', 'validation'])
plt.show()

In [None]:
# Encode the test texts with the tokenizer fitted on the training split, then pad to max_len.
test_sequences = tok.texts_to_sequences(texts=X_test)
test_sequences_matrix = sequence.pad_sequences(sequences=test_sequences, maxlen=max_len)

In [None]:
# Score model3 on the padded test matrix; the first two entries of the result are printed
# as loss and accuracy.
accr = model3.evaluate(test_sequences_matrix, Y_test)
test_loss, test_accuracy = accr[0], accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_accuracy))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split

# Baseline: TF-IDF over unigrams and bigrams (at most 50,000 features, each present in
# at least 2 documents) feeding a logistic regression.
tf_idf = TfidfVectorizer(ngram_range=(1,2), max_features = 50000, min_df=2)
logit = LogisticRegression(random_state=17, n_jobs=4, C=1, verbose=True, solver='lbfgs')
model4 = Pipeline([('tf_idf', tf_idf),('logit',logit)])
# Seeded split of the raw comment text and labels (default train/validation proportions).
train_texts, valid_texts, y_train, y_valid = \
        train_test_split(sar_acc['comment'], sar_acc['label'], random_state=17)
# Pipeline.fit returns the fitted pipeline itself. It is bound back to model4 rather than
# to `history`, so the Keras History object used by the plotting cells is not overwritten.
model4 = model4.fit(train_texts,y_train)

In [None]:
# Predict labels for the validation texts with the fitted TF-IDF + logistic-regression pipeline.
valid_pred = model4.predict(valid_texts)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Fraction of validation comments whose predicted label matches the true label.
accuracy_score(y_valid, valid_pred)
