<a href="https://colab.research.google.com/github/Justmileris/TextBinaryClassification/blob/master/TextBinaryClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive inside the Colab VM at /content/gdrive so later cells can reach files stored there.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
import numpy as np
import pandas as pd
from chardet import detect
import os

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
import nltk
# Fetch the NLTK resources this notebook relies on (same resources, same order).
for nltk_resource in ('wordnet', 'averaged_perceptron_tagger', 'stopwords', 'punkt'):
    nltk.download(nltk_resource)
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import re

import spacy
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.model_selection import train_test_split
from sklearn import metrics

import keras
from keras import regularizers
from keras.layers import Dropout
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras import layers

import time

In [0]:
# os.getcwd()
# Switch to the project folder on the mounted Drive so the relative file names used below resolve.
# NOTE(review): the path is specific to the author's Drive layout — adjust when running elsewhere.
%cd /content/gdrive/My Drive/TextAnalysis

In [0]:
# Show what is in the current working directory (presumably the rt-polarity files read below — verify).
os.listdir()

In [0]:
# Fix NumPy's global random seed so NumPy-driven randomness is repeatable between runs.
np.random.seed(42)

# Read data

In [0]:
# File names (relative to the working directory) of the positive and negative sentence files.
filepath_pos = 'rt-polarity.pos'
filepath_neg = 'rt-polarity.neg'

def get_encoding(file):
    """Return the text encoding that chardet detects for ``file``.

    The file is read completely, in binary mode, and the raw bytes are handed
    to ``chardet.detect``; the value stored under its ``'encoding'`` key is
    returned unchanged.
    """
    with open(file, 'rb') as handle:
        detection = detect(handle.read())
    return detection['encoding']

def _read_sentences(path):
    """Load a one-sentence-per-line text file into a single-column DataFrame.

    The file is read line by line instead of through ``pd.read_csv(sep="\\n")``:
    current pandas versions reject a newline separator, and CSV parsing would
    also interpret quote characters and NA-like words inside the sentences.

    Args:
        path: path of the text file; its encoding is detected with ``get_encoding``.

    Returns:
        pandas.DataFrame with one string column named 'Sentence', one row per
        non-blank line, indexed 0..n-1.
    """
    with open(path, encoding=get_encoding(path)) as source:
        lines = [line.rstrip('\r\n') for line in source]
    # Blank lines carry no sentence, so they are dropped.
    return pd.DataFrame({'Sentence': [line for line in lines if line.strip()]})


# Use the path variables defined above instead of repeating the literals.
data_pos = _read_sentences(filepath_pos)
data_neg = _read_sentences(filepath_neg)

In [0]:
# Row count followed by the first rows of the positive set.
print(len(data_pos), data_pos.head(), sep='\n')

In [0]:
# Row count followed by the first rows of the negative set.
print(len(data_neg), data_neg.head(), sep='\n')

# Labels

In [0]:
# Add a numeric label column: 1 for rows from the positive file, 0 for rows from the negative file
data_pos['Label'] = 1
data_neg['Label'] = 0

In [0]:
# Stack positive rows on top of negative rows and renumber the index 0..n-1
whole_data = pd.concat([data_pos, data_neg], ignore_index=True)
print(whole_data.keys(), len(whole_data), sep='\n')

In [0]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
whole_data.info()
print()
# Class balance: number of rows per label value.
print(whole_data.Label.value_counts())

# Tokenization

In [0]:
# Split every sentence into a list of word tokens with NLTK's word_tokenize
whole_data['Splitted_sentence'] = whole_data['Sentence'].apply(word_tokenize)

In [0]:
# Sanity check: container type, row count, tokens of the first sentence, then the whole token column.
print(
    type(whole_data),
    len(whole_data),
    whole_data['Splitted_sentence'][0],
    whole_data['Splitted_sentence'],
    sep='\n',
)

# Lemmatisation

In [0]:
# WordNetLemmatizer needs a WordNet part of speech for each word. The first letter
# of the Penn Treebank tag from pos_tag is mapped to it; anything that is not an
# adjective (J), verb (V) or adverb (R) falls back to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# The lemmatizer is created once and reused for every sentence.
word_Lemmatized = WordNetLemmatizer()

lemmatized_sentences = []
for sentence in whole_data['Splitted_sentence']:
    # pos_tag yields (word, tag) pairs; each word is lemmatised with its mapped POS.
    Final_words = [word_Lemmatized.lemmatize(word, tag_map[tag[0]]) for word, tag in pos_tag(sentence)]
    # Store plain space-separated text. str(Final_words) would embed the list's
    # brackets, quotes and commas in the text that is vectorised later.
    lemmatized_sentences.append(' '.join(Final_words))

# Assign the column in one step rather than growing it row by row with .loc,
# which is slow and relies on the index being exactly 0..n-1.
whole_data['Tokens'] = lemmatized_sentences

# Data split

In [0]:
# Hold out 30% of the rows for testing; shuffling is seeded so the split is repeatable.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    whole_data['Tokens'],
    whole_data['Label'],
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# Vectorization

In [0]:
# Characters treated as punctuation when tokens are filtered
punctuations = string.punctuation

# Full English pipeline, used by the spaCy visualisation cells further down.
# The model is loaded by its package name: the 'en' shortcut link no longer
# exists in spaCy 3, while 'en_core_web_sm' is accepted by spaCy 2 and 3 alike.
nlp = spacy.load('en_core_web_sm')

# Stop-word set (imported at the top of the notebook); the tokenizer below does not currently use it
stop_words = STOP_WORDS

# Blank English pipeline: it only tokenises — no tagger, parser, NER or word vectors are loaded
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Tokenise ``sentence`` into lower-cased lemmas with punctuation removed.

    Args:
        sentence: text of one document.

    Returns:
        list of str: one normalised string per spaCy token that survives the
        punctuation filter. Stop words are kept.
    """
    normalised = []
    for token in parser(sentence):
        lemma = token.lemma_
        if lemma == "-PRON-":
            # spaCy 2 collapses every pronoun to "-PRON-"; keep the surface form instead.
            text = token.lower_
        elif lemma:
            text = lemma.lower().strip()
        else:
            # No lemma is set (a blank pipeline leaves lemma_ empty in spaCy 3).
            # Fall back to the lower-cased surface form, otherwise every token
            # would become '' and be discarded by the filter below.
            text = token.lower_.strip()
        normalised.append(text)

    # `in` against a str is a substring test. This drops empty strings and any
    # token that is a contiguous slice of string.punctuation, which includes
    # every single punctuation mark.
    return [text for text in normalised if text not in punctuations]

In [0]:
# Bag of unigram and bigram counts; the TF-IDF variant below is a drop-in alternative.
# The trailing numbers are the author's notes (presumably accuracies from earlier runs —
# they predate the train-only fit below and should be re-measured).
# vectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,2), min_df=1, max_df=1.0) # 77.71
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,2), min_df=1, max_df=1.0) # 78.37

# Learn the vocabulary from the training split only, and from the same lemmatised
# text that is transformed. Fitting on whole_data['Sentence'] used the raw text
# (so lemmatised n-grams could be missing) and let test rows shape the features.
Train_X_vectorized = vectorizer.fit_transform(Train_X)
Test_X_vectorized = vectorizer.transform(Test_X)

In [0]:
# Report the vocabulary size and a small sample of (n-gram, column index) pairs.
# Printing the whole mapping floods the notebook output.
print(len(vectorizer.vocabulary_))
print(list(vectorizer.vocabulary_.items())[:20])

# Naive Bayes

In [0]:
# Naive Bayes classifier
classifier = naive_bayes.MultinomialNB()
classifier.fit(Train_X_vectorized, Train_Y)
prediction_NB = classifier.predict(Test_X_vectorized)
# accuracy_score expects (y_true, y_pred); pass the arguments in that order.
print("Naive Bayes Accuracy Score: {}%".format(round(accuracy_score(Test_Y, prediction_NB)*100, 2)))

# SVM

In [0]:
# SVM classifier with a linear kernel. `degree` and `gamma` only affect the
# non-linear kernels, so they are no longer passed.
classifier = svm.SVC(C=1.0, kernel='linear')
classifier.fit(Train_X_vectorized,Train_Y)
prediction_SVM = classifier.predict(Test_X_vectorized)
# accuracy_score expects (y_true, y_pred); pass the arguments in that order.
print("SVM Accuracy Score: {}%".format(round(accuracy_score(Test_Y, prediction_SVM)*100, 2)))

# Logistic Regression

In [0]:
# Logistic Regression classifier (iteration cap raised to 2000 for the lbfgs solver)
classifier = LogisticRegression(solver='lbfgs', max_iter=2000)
classifier.fit(Train_X_vectorized,Train_Y)
prediction_logreg = classifier.predict(Test_X_vectorized)
# accuracy_score expects (y_true, y_pred); pass the arguments in that order.
print("Logistic Regression Accuracy Score: {}%".format(round(accuracy_score(Test_Y, prediction_logreg)*100, 2)))

# Random Forest

In [0]:
# Random Forest
classifier = RandomForestClassifier(n_estimators=10, random_state=42, verbose=3) # Add verbose=3 (more than 1) to see progress
classifier.fit(Train_X_vectorized,Train_Y)
prediction_randomforest = classifier.predict(Test_X_vectorized)
# accuracy_score expects (y_true, y_pred); pass the arguments in that order.
print("Random Forest Accuracy Score: {}%".format(round(accuracy_score(Test_Y, prediction_randomforest)*100, 2)))

# Accuracies

In [0]:
# Side-by-side test accuracy of the four classical models. Every metric call uses
# the (y_true, y_pred) argument order, matching the confusion_matrix call.
print("Naive Bayes: {}%".format(round(accuracy_score(Test_Y, prediction_NB)*100, 2)))
print(confusion_matrix(Test_Y, prediction_NB))
print()
print("SVM: {}%".format(round(accuracy_score(Test_Y, prediction_SVM)*100, 2)))
print("Logistic Regression: {}%".format(round(accuracy_score(Test_Y, prediction_logreg)*100, 2)))
print("Random Forest: {}%".format(round(accuracy_score(Test_Y, prediction_randomforest)*100, 2)))

# Neural Networks, Keras

In [0]:
# Give the vectorised splits and labels the names used by the Keras cells below
# (plain assignments: the same objects, no copies).
X_train = Train_X_vectorized
X_test = Test_X_vectorized
y_train = Train_Y
y_test = Test_Y

Training

In [0]:
plt.style.use('ggplot')
epochs_count = 20

input_dim = X_train.shape[1]  # Number of features

# A small L2-regularised hidden layer feeding one sigmoid unit for the binary label.
model = Sequential([
    layers.Dense(4, input_dim=input_dim, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

# The test split is passed as validation data, so it is scored after each epoch.
history = model.fit(
    X_train, y_train,
    batch_size=32,
    epochs=epochs_count,
    validation_data=(X_test, y_test),
    verbose=True,
)


def plot_history(history):
    """Plot training vs. validation accuracy and loss per epoch, side by side.

    Args:
        history: object returned by ``model.fit``. Its ``history`` dict must hold
            'loss' and 'val_loss', plus the accuracy metric under either the
            'acc'/'val_acc' or the 'accuracy'/'val_accuracy' keys.

    Raises:
        KeyError: if the expected loss or accuracy entries are missing.
    """
    logged = history.history
    # The accuracy metric is logged as 'acc' by some Keras versions and as
    # 'accuracy' by others; use whichever name is present.
    acc_key = 'acc' if 'acc' in logged else 'accuracy'
    acc = logged[acc_key]
    val_acc = logged['val_' + acc_key]
    loss = logged['loss']
    val_loss = logged['val_loss']
    x = range(1, len(acc) + 1)  # epochs numbered from 1

    plt.figure(figsize=(12, 5))
    # Left panel: accuracy curves.
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    # Right panel: loss curves.
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()


# Score the trained network on both splits. The test split is evaluated last, so
# `loss` and `accuracy` hold the test-set values afterwards.
for report_template, features, targets in (
    ("Training Accuracy: {:.4f}", X_train, y_train),
    ("Testing Accuracy:  {:.4f}", X_test, y_test),
):
    loss, accuracy = model.evaluate(features, targets, verbose=False)
    print(report_template.format(accuracy))

plot_history(history)

Save model

In [0]:
# Saves everything: weight values, the model's architecture and the optimizer configuration.
# The file name encodes the rounded accuracy (percent), the epoch count and a Unix timestamp.
accuracy_pct = int(round(accuracy*100))
saved_at = int(time.time())
model_file_name = 'acc_{}_____epchs_{}_____{}.h5'.format(accuracy_pct, epochs_count, saved_at)
print(model_file_name)
model.save(model_file_name)

Load model

In [0]:
# Recreate the exact same model, including weights and optimizer.
# Load the file written by the save cell. A hard-coded name with an embedded
# timestamp only exists from one specific earlier run and fails after a fresh
# "Run all".
new_model = keras.models.load_model(model_file_name)
new_model.summary()

In [0]:
# Score the reloaded model on the test split; the result is unpacked as (loss, accuracy)
# and the accuracy is reported as a percentage.
loss, acc = new_model.evaluate(X_test, y_test)
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

# Extra

# spaCy visualizations

In [0]:
# Token word attributes: one tab-separated row per token of the second sentence

example_sentence = whole_data['Sentence'][1]
print(example_sentence)
print()

doc = nlp(example_sentence)
print('text\tidx\tlemma\tis_pnct\tis_spc\tshape\tpos\ttag')
for token in doc:
    attributes = (
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_,
    )
    print("\t".join(str(value) for value in attributes))

In [0]:
# Entities: render the named entities spaCy finds in one example sentence

from spacy import displacy

entity_example = whole_data['Sentence'][2005]  # other rows noted by the author: 3496, 7533
print(entity_example)
doc = nlp(entity_example)
displacy.render(doc, style='ent', jupyter=True)

In [0]:
# Dependencies: draw the dependency parse of the sentence at position `index`

from spacy import displacy

index = 0
dependency_example = whole_data['Sentence'][index]
print(dependency_example)
doc = nlp(dependency_example)

displacy.render(doc, style='dep', jupyter=True, options={'distance': 100})

In [0]:
# NOTE: despite its name, `word` holds the whole first sentence, not a single word.
word = whole_data['Sentence'][0]
print(word)

# Vector that spaCy exposes for the processed text via `.vector` (shown as the cell output).
nlp(word).vector