In [1]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import sklearn
import tensorflow as tf
import gensim
import re



In [2]:
#Our first approach to using Deep Learning to predict the author of a text will be to use a shallow neural network 
#with non-linear activation functions

In [3]:
df = pd.read_csv(r'Data/dataset.csv',names=['Author','Text'])

In [4]:
df['Text'] = df['Text'].map(lambda x: re.sub('\r|\n|\'','',x))
df['Text'] = df['Text'].map(lambda x: re.sub(r'--\d\d\d-\d\d\d-\d\d\d\d','',x))

In [5]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stop_words_subset = set([x for x in stop_words if 3 <= len(x) <= 5])

In [6]:
text_copy = df['Text'].copy()

In [7]:
def remove_stopwords(x): #x is the string input
    word_set = x.split(' ')
    for word in word_set:
        if word in stop_words:
            word_set.remove(word)
    return ' '.join(word_set)
            


In [8]:
# Removing stopwords
df['Text'] = df['Text'].map(lambda x: remove_stopwords(x))

In [9]:
#Stratified train-test split 

X = df['Text']
y = df['Author']

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, 
                                                    test_size=0.25)

In [11]:
#1. Using sklearn's CountVectorizer(i.e. A Bag of words model) to get a feature vector for our article text

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=0.1, lowercase=False)

In [13]:
vectorizer.fit(df['Text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=0.1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [14]:
len(vectorizer.vocabulary_) #The vector has length of 444 i.e. the top 444 words in the corpus have been taken to make feature vectors out of

444

In [15]:
X_train = vectorizer.transform(X_train.values)

In [16]:
X_test = vectorizer.transform(X_test.values)

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

In [18]:
input_dimension = X_train.shape[1] #This specifies the number of neurons in the input layer
#Using Keras' sequential API
model = Sequential()
model.add(layers.Dense(100, input_dim=input_dimension, activation='relu'))
model.add(layers.Dense(50, activation='softmax'))

In [19]:
model.compile(loss='binary_crossentropy', 
               optimizer='adam', 
            metrics=['accuracy'])

#Using Adam optimizer and the binary cross-entropy loss function

In [20]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               44500     
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
Total params: 49,550
Trainable params: 49,550
Non-trainable params: 0
_________________________________________________________________


In [21]:
#The target column has string categorical values. Convert this to discrete numeric values using Keras' LabelEncoder
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
encoder.fit(df['Author'].unique())

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [22]:
#encoder.transform(df['Author'])
author_copy = df['Author'].copy()

In [23]:
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

In [24]:
history = model.fit(X_train, y_train,
                     epochs=100,
                     verbose=False,
                     validation_data=(X_test, y_test),
                    batch_size=10)

In [34]:
#dir(history)
#history.model
history.params

{'batch_size': 10,
 'epochs': 100,
 'steps': None,
 'samples': 3750,
 'verbose': False,
 'do_validation': True,
 'metrics': ['loss', 'acc', 'val_loss', 'val_acc'],
 'validation_steps': None}

In [35]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)

In [36]:
print(f'On the training set: loss of {loss} and accuracy achieved: {accuracy}')

On the training set: loss of 1.2128507557160144e-07 and accuracy achieved: 1.0


In [90]:
#A validation set accuracy of 100% suggests that our model has overfitted. 
#The accuracy on the testing set comes out to be:
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Testing Accuracy:  0.9897


SHALLOW NEURAL NETWORK WITH RELU ACTIVATION AT THE HIDDEN LAYER AND SOFTMAX AT THE OUTPUT LAYER ACHIEVES 98.9% ACCURACY

In [38]:
#Trying a model with two hidden layers:

In [91]:
model_two = Sequential()
model_two.add(layers.Dense(100,input_dim = input_dimension, activation = 'relu'))
model_two.add(layers.Dense(75,activation = 'relu'))
model_two.add(layers.Dense(50, activation='softmax'))

In [92]:
model_two.compile(loss='binary_crossentropy', 
               optimizer='adam', 
            metrics=['accuracy'])


In [93]:
history = model_two.fit(X_train, y_train,
                     epochs=100,
                     verbose=False,
                     validation_data=(X_test, y_test),
                    batch_size=10)

In [94]:
model_two.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 100)               44500     
_________________________________________________________________
dense_6 (Dense)              (None, 75)                7575      
_________________________________________________________________
dense_7 (Dense)              (None, 50)                3800      
Total params: 55,875
Trainable params: 55,875
Non-trainable params: 0
_________________________________________________________________


In [95]:
model_two.evaluate(X_train,y_train)



[1.0225629803244374e-07, 1.0]

In [96]:
model_two.evaluate(X_test,y_test)



[0.08353356000483037, 0.9885600135803223]

In [49]:
X_train.toarray().shape

(3750, 444)

In [89]:
model_two.evaulate(X_test,y_test)

AttributeError: 'Sequential' object has no attribute 'evaulate'

In [11]:
#Part 1: Obtain word embedding representation of the articles. 

In [12]:
#Using gensim's pre-trained Word2Vec model for obtaining word-embeddings for the text data:
from gensim.models import Word2Vec


In [13]:
sentences = ' '.join(X_train)
more_sentences = ' '.join(X_test)
sentences = sentences + more_sentences

In [130]:
model = Word2Vec(sentences,min_count = 10,window = 3, size = 100) #Taking too long to train

142922

In [131]:
#model = gensim.models.KeyedVectors.load_word2vec_format("glove.6B.300d.txt", binary=False)
#Maybe use a pre-trained model

In [69]:
sample_inp = vectorizer.transform(df['Text'].iloc[1].split(' ')).toarray()


In [68]:
encoder.transform(encoder.classes_)

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [70]:
dir(model_two)

['__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_activity_regularizer',
 '_add_inbound_node',
 '_add_unique_metric_name',
 '_add_variable_with_custom_getter',
 '_assert_input_compatibility',
 '_base_init',
 '_build_input_shape',
 '_cache_output_metric_attributes',
 '_call_and_compute_mask',
 '_call_convention',
 '_check_trainable_weights_consistency',
 '_checkpoint_dependencies',
 '_checkpointable_saver',
 '_collected_trainable_weights',
 '_compute_output_and_mask_jointly',
 '_compute_previous_mask',
 '_dataset_iterator_cache',
 '_deferred_dependencies',
 '_determine_call_convention',
 '_distribution_standardize_user_data',
 '_distribution_strategy',
 '_d

In [75]:
model_two.predict(sample_inp)

array([[0.21312012, 0.05426535, 0.00128296, ..., 0.10208776, 0.01495079,
        0.00639352],
       [0.18153667, 0.02038331, 0.00097499, ..., 0.07116063, 0.1217784 ,
        0.00437038],
       [0.21312012, 0.05426535, 0.00128296, ..., 0.10208776, 0.01495079,
        0.00639352],
       ...,
       [0.21312012, 0.05426535, 0.00128296, ..., 0.10208776, 0.01495079,
        0.00639352],
       [0.21312012, 0.05426535, 0.00128296, ..., 0.10208776, 0.01495079,
        0.00639352],
       [0.21312012, 0.05426535, 0.00128296, ..., 0.10208776, 0.01495079,
        0.00639352]], dtype=float32)

In [77]:
sample_inp.shape

(237, 444)

In [78]:
len(df['Text'].iloc[1])

1749

In [81]:
X_train.toarray().shape

(3750, 444)

In [87]:
X_train.toarray()[0,:]

array([ 0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  1,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  1,  1,  1,  0,  0,  0,
        2,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  1,  0,  0,
        0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,
        1,  0,  0,  0,  0,  0,  0,  3,  0,  0,  1,  1,  0,  0,  2,  0,  0,
        0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  1,  1,  2,  2,  0,  0,  1,
        0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  1,  0,  0,  0,  1,  3,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  1,  5,  0,  0,  0,  0,  1,  0,  0,
        0,  0,  0,  0,  1,  0,  0,  0,  2,  2,  0,  1,  0,  0,  0,  0,  0,
        2,  0,  1,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  1,  0,  0,  1,  1,  1,  0,  0,  0,  0,  0,  0,
        0,  0,  1,  1,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,
        1,  3,  0,  2,  0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,
        1,  0,  0,  0,  0

In [88]:
X_test.toarray().shape

(1250, 444)

In [97]:
#Check accuracy on a subset of the test set

In [99]:
X_test.toarray()

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       ...,
       [2, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 3, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [102]:
X_test.toarray().shape
arr = X_test.toarray()
inps =[arr[i,:] for i in range(10)]

In [106]:
model_two.predict(inps[0])

ValueError: Error when checking input: expected dense_5_input to have shape (444,) but got array with shape (1,)

In [108]:
model_two.predict(inps)

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 10 arrays: [array([[ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
    ...