In [1]:
import numpy as np
import pandas as pd
from fast_ml.model_development import train_valid_test_split
from string import punctuation
import re
import nltk
from nltk import word_tokenize
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn.naive_bayes as naive_bayes
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shivamarora/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shivamarora/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
data = pd.read_csv('data.csv')
data = data.dropna()
print(data.shape)
data.head()

(17662, 3)


Unnamed: 0,text,speaker,label
0,Says the Annies List political group supports ...,dwayne-bohac,false
1,When did the decline of coal start? It started...,scott-surovell,half-true
2,The economic turnaround started at the end of ...,charlie-crist,half-true
3,The Chicago Bears have had more starting quart...,robin-vos,true
4,Jim Dunnam has not lived in the district he re...,republican-party-texas,barely-true


In [3]:
data['label'].value_counts()

half-true      3673
false          3464
mostly-true    3320
barely-true    2990
true           2696
pants-fire     1519
Name: label, dtype: int64

In [4]:
len(data['speaker'].unique())

5105

## Utility Functions

In [5]:
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in punctuation])
    return punctuationfree

In [6]:
def tokenization(text):
    return word_tokenize(str(text))

In [7]:
from nltk.corpus import stopwords
stopWords = stopwords.words('english')
def remove_stopwords(text):
    output= [i for i in text if i not in stopWords]
    return output

In [8]:
porter_stemmer = PorterStemmer()
def stemming(text):
    stem_text = [porter_stemmer.stem(word) for word in text]
    return stem_text

In [9]:
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

## Preprocessing

In [10]:
data.rename(columns = {'text':'content'}, inplace = True)
data.head()

Unnamed: 0,content,speaker,label
0,Says the Annies List political group supports ...,dwayne-bohac,false
1,When did the decline of coal start? It started...,scott-surovell,half-true
2,The economic turnaround started at the end of ...,charlie-crist,half-true
3,The Chicago Bears have had more starting quart...,robin-vos,true
4,Jim Dunnam has not lived in the district he re...,republican-party-texas,barely-true


In [11]:
#Punctuation Removal
data['content']= data['content'].apply(lambda x:remove_punctuation(x))

In [12]:
#Numbers Removal
remove_digits = str.maketrans('', '', digits)
data['content'] = data['content'].apply(lambda x: x.translate(remove_digits))

In [13]:
#lowering the text
data['content']= data['content'].apply(lambda x: x.lower())

In [14]:
#tokenization
data['content']= data['content'].apply(lambda x: tokenization(x))

In [15]:
#Removing stop words
data['content']= data['content'].apply(lambda x:remove_stopwords(x))

In [16]:
#Stemming
data['content']=data['content'].apply(lambda x: stemming(x))

In [17]:
#Lemmatization
data['content']=data['content'].apply(lambda x:lemmatizer(x))

In [18]:
data.head()

Unnamed: 0,content,speaker,label
0,"[say, anni, list, polit, group, support, third...",dwayne-bohac,false
1,"[declin, coal, start, start, natur, ga, took, ...",scott-surovell,half-true
2,"[econom, turnaround, start, end, term]",charlie-crist,half-true
3,"[chicago, bear, start, quarterback, last, year...",robin-vos,true
4,"[jim, dunnam, live, district, repres, year]",republican-party-texas,barely-true


In [19]:
label_map = {
    'pants-fire':0,
    'false':1,
    'barely-true':2,
    'half-true':3,
    'mostly-true':4,
    'true':5
}

In [20]:
data['label'] = data['label'].map(label_map)

In [21]:
data.head()

Unnamed: 0,content,speaker,label
0,"[say, anni, list, polit, group, support, third...",dwayne-bohac,-3
1,"[declin, coal, start, start, natur, ga, took, ...",scott-surovell,1
2,"[econom, turnaround, start, end, term]",charlie-crist,1
3,"[chicago, bear, start, quarterback, last, year...",robin-vos,5
4,"[jim, dunnam, live, district, repres, year]",republican-party-texas,-1


In [22]:
labelencoder = LabelEncoder()

In [23]:
data['speaker_code'] = labelencoder.fit_transform(data['speaker'])

In [24]:
data.head()

Unnamed: 0,content,speaker,label,speaker_code
0,"[say, anni, list, polit, group, support, third...",dwayne-bohac,-3,3497
1,"[declin, coal, start, start, natur, ga, took, ...",scott-surovell,1,4773
2,"[econom, turnaround, start, end, term]",charlie-crist,1,3210
3,"[chicago, bear, start, quarterback, last, year...",robin-vos,5,4686
4,"[jim, dunnam, live, district, repres, year]",republican-party-texas,-1,4620


## Split & Vectorization

In [25]:
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(data, target = 'label', 
                                                                            train_size=0.8, valid_size=0.1, test_size=0.1)

In [26]:
print(y_train.value_counts())

 1    2921
-3    2790
 3    2654
-1    2395
 5    2160
-5    1209
Name: label, dtype: int64


In [27]:
print(y_valid.value_counts())

 1    366
 3    335
-3    332
-1    311
 5    264
-5    158
Name: label, dtype: int64


In [28]:
print(y_test.value_counts())

 1    386
-3    342
 3    331
-1    284
 5    272
-5    152
Name: label, dtype: int64


In [29]:
def identity_tokenizer(text):
    return text
tfidf_vectorizer = TfidfVectorizer(tokenizer=identity_tokenizer, stop_words='english', lowercase=False, max_features=3500)
tfidf_vectorizer.fit(data['content'])

TfidfVectorizer(lowercase=False, max_features=3500, stop_words='english',
                tokenizer=<function identity_tokenizer at 0x7fc90ac3faf0>)

In [30]:
tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train['content'])

In [31]:
tfidf_train_vectors

<14129x3500 sparse matrix of type '<class 'numpy.float64'>'
	with 126080 stored elements in Compressed Sparse Row format>

In [32]:
tfidf_valid_vectors = tfidf_vectorizer.fit_transform(X_valid['content'])

In [33]:
tfidf_valid_vectors

<1766x3500 sparse matrix of type '<class 'numpy.float64'>'
	with 16923 stored elements in Compressed Sparse Row format>

In [34]:
tfidf_test_vectors = tfidf_vectorizer.fit_transform(X_test['content'])

In [35]:
tfidf_test_vectors

<1767x3500 sparse matrix of type '<class 'numpy.float64'>'
	with 16489 stored elements in Compressed Sparse Row format>

## SVM Multiclass Classification

In [36]:
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

In [37]:
labels = [0,1,2,3,4,5]
def plot_cm(y_true,y_pred,labels):
  cm = confusion_matrix(y_true, y_pred)
  ax= plt.subplot()
  sns.heatmap(cm, annot=True, ax = ax, fmt="d")
  ax.set_title('Confusion Matrix')
  ax.set_xlabel('Predicted Labels')
  ax.set_ylabel('True Labels')
  ax.xaxis.set_ticklabels(labels)
  ax.yaxis.set_ticklabels(labels)
  plt.show()

In [38]:
clf = svm.SVC(kernel='linear')
clf.fit(tfidf_train_vectors,y_train)
y_pred_valid = clf.predict(tfidf_valid_vectors)
y_pred_test = clf.predict(tfidf_test_vectors)

print('Kernel: linear')
print('Validation')
print(classification_report(y_valid,y_pred_valid))
plot_cm(y_valid,y_pred_valid,labels)
print()
print('Testing')
print(classification_report(y_test,y_pred_test))
plot_cm(y_test,y_pred_test,labels)

KeyboardInterrupt: 

In [None]:
clf = svm.SVC(kernel='rbf')
clf.fit(tfidf_train_vectors,y_train)
y_pred_valid = clf.predict(tfidf_valid_vectors)
y_pred_test = clf.predict(tfidf_test_vectors)

print('Kernel: rbf')
print('Validation')
print(classification_report(y_valid,y_pred_valid))
plot_cm(y_valid,y_pred_valid,labels)
print()
print('Testing')
print(classification_report(y_test,y_pred_test))
plot_cm(y_test,y_pred_test,labels)

In [None]:
clf = svm.SVC(kernel='sigmoid')
clf.fit(tfidf_train_vectors,y_train)
y_pred_valid = clf.predict(tfidf_valid_vectors)
y_pred_test = clf.predict(tfidf_test_vectors)

print('Kernel: sigmoid')
print('Validation')
print(classification_report(y_valid,y_pred_valid))
plot_cm(y_valid,y_pred_valid,labels)
print()
print('Testing')
print(classification_report(y_test,y_pred_test))
plot_cm(y_test,y_pred_test,labels)

## LSTM & BiLSTM

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, Bidirectional, Input, Concatenate, TimeDistributed
from tensorflow.keras.utils import plot_model

In [None]:
def plot_history(history):
    # summarize history for accuracy
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

### LSTM - TEXT ONLY

In [None]:
model = Sequential()
model.add(Embedding(input_dim=3500, input_length=3500, output_dim=6))
model.add(Dropout(rate=0.4))
model.add(LSTM(units=4))
model.add(Dropout(rate=0.4))
model.add(Dense(units=100,  activation='relu'))
model.add(Dropout(rate=0.5))
model.add(Dense(units=6, activation='sigmoid'))

model.summary()

In [None]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics = ['accuracy'])

In [None]:
# history = model.fit(
#     tfidf_train_vectors.toarray(), 
#     y_train, 
#     validation_data=[tfidf_valid_vectors.toarray(), y_valid],
#     epochs = 5
# )