In [1]:
import numpy as np
import pandas as pd
import re

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
SEED = 42
np.random.seed(SEED)

TARGET = "AI"
DATASET = "dataset.pickle"

In [3]:
df = pd.read_pickle(DATASET)
df

Unnamed: 0,crunchbase_ID,home_text,aboutus_text,overview_text,whatwedo_text,company_text,whoweare_text,AI
0,1916,Skip to main content Products GPU accelerated ...,,,,,,1
1,1917,Our AIs Research Company Careers Get in Touch ...,,,,Our AIs Research Company Careers Get in Touch ...,,1
2,1918,Toggle navigation Product Projects Company His...,,,,,,1
3,1919,Brainpeek Solutions Create a seamless online u...,Brainpeek Solutions Create a seamless online u...,,,,,1
4,1920,The Tool Our Languages Services Extract Produc...,The Tool Our Languages Services Extract Produc...,,,,,1
...,...,...,...,...,...,...,...,...
4889,2735,Username or Email L senord Remember me Norsk S...,Username or Email L senord Remember me Norsk S...,,,,,0
4890,5944,Solutions Solution for distributors Covered re...,,,,,,0
4891,5251,BROWSE PRODUCTS Variety Cases Pasta Mac and Ch...,,,,,,0
4892,4225,Pricing Documentation Community Changelog Logi...,,,,,,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4894 entries, 0 to 4893
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   crunchbase_ID  4894 non-null   object
 1   home_text      4894 non-null   object
 2   aboutus_text   2212 non-null   object
 3   overview_text  66 non-null     object
 4   whatwedo_text  50 non-null     object
 5   company_text   477 non-null    object
 6   whoweare_text  83 non-null     object
 7   AI             4894 non-null   int64 
dtypes: int64(1), object(7)
memory usage: 306.0+ KB


In [5]:
df.isnull().mean()

crunchbase_ID    0.000000
home_text        0.000000
aboutus_text     0.548018
overview_text    0.986514
whatwedo_text    0.989783
company_text     0.902534
whoweare_text    0.983040
AI               0.000000
dtype: float64

In [6]:
df_home_text = df[['home_text', 'AI']].set_axis( ['text', 'AI'], axis=1)
df_aboutus_text = df[df['aboutus_text'].notnull()][['aboutus_text', 'AI']].set_axis( ['text', 'AI'], axis=1)
df_overview_text = df[df['overview_text'].notnull()][['overview_text', 'AI']].set_axis( ['text', 'AI'], axis=1)
df_whatwedo_text = df[df['whatwedo_text'].notnull()][['whatwedo_text', 'AI']].set_axis( ['text', 'AI'], axis=1)
df_company_text = df[df['company_text'].notnull()][['company_text', 'AI']].set_axis( ['text', 'AI'], axis=1)
df_whoweare_text = df[df['whoweare_text'].notnull()][['whoweare_text', 'AI']].set_axis( ['text', 'AI'], axis=1)

In [7]:
df = pd.concat([df_home_text, df_aboutus_text, df_overview_text, df_whatwedo_text, df_company_text, df_whoweare_text], axis=0, ignore_index=True)
df = df.sample(frac=1)

In [8]:
X = df['text'].values.astype(str)
y = df[TARGET].values

In [9]:
regs = []
for sent in X:
    x = re.sub('[^A-Za-z]'," ",str(sent))
#     x = re.split("\s",s)
    regs.append(x)

# removing all spaces
# for i in X:
#     while '' in i:
#         i.remove('')

In [10]:
from nltk.tokenize import word_tokenize

def create_corpus(texts):
    """Decompose text to corpus (e.g. `This is a pen` to [ `This`, `is`, `a`, `pen` ])
    
    Arguments:
        texts: list(str) / Text list.
        
    Returns:
        list(str) / Corpus list.
    """
    
    corpus = []
    for tweet in texts:
        words = [ word.lower() for word in word_tokenize(tweet) ]
        corpus.append(words)
        
    return corpus

tokens = create_corpus(regs)

In [11]:
stop_words = set(stopwords.words('english'))
filterd_words = []
for words in tokens:
    x = [word for word in words if word not in stop_words] #if word not in stopwords]   
    filterd_words.append(x)   

In [12]:
# taking the root of every word
lemmatized = []
for words in filterd_words:
    x = [WordNetLemmatizer().lemmatize(word) for word in words]
    lemmatized.append(x)

In [13]:
# taking words that are bigger than 2    
filterd = []
for words in lemmatized:
    x = [word for word in words if len(word) >= 2]
    filterd.append(x)
    
X = filterd

In [14]:
input_length = 50

def preprocess(X, tokenizer=None, padded=True):
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')
        tokenizer.fit_on_texts(X)
        seq = tokenizer.texts_to_sequences(X)
        tmp = seq
        seq_padded = pad_sequences(seq, maxlen=input_length, padding='post', truncating='post')
        return tokenizer, seq_padded
    seq = tokenizer.texts_to_sequences(X)
    seq_padded = pad_sequences(seq, maxlen=input_length, padding='post', truncating='post')
    return seq_padded

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=SEED)

In [16]:
tokenizer, X_train_processed  = preprocess(X_train)
X_val_processed = preprocess(X_val, tokenizer)

input_dim = len(tokenizer.word_index)+1
output_dim = 100

In [17]:
embedding_dict = {}
word_index = tokenizer.word_index
with open('glove.6B.100d.txt','r', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors

max_words = input_dim
embedding_dims = output_dim

embedding_matrix = np.zeros((max_words, embedding_dims))
for word, i in word_index.items():
    if i > max_words:
        continue
        
    emb_vec = embedding_dict.get(word)    
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec  

In [18]:
def make_model():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Embedding(input_dim, output_dim, input_length=input_length, embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix)),
        tf.keras.layers.LSTM(20, activation='tanh', return_sequences=False),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


def train_model(model, callbacks, verbose=0):
    history = model.fit(X_train_processed, y_train, validation_data=(X_val_processed, y_val), epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=callbacks, verbose=verbose)
    return history

In [19]:
def repeat_training(times=30):
    scores = pd.DataFrame(columns=['N', 'train_loss', 'train_acc', 'val_loss', 'val_acc'])
    for i in range(1, times+1):
        model = make_model()

        model.compile(loss="binary_crossentropy",
                           optimizer="adam",
                           metrics=["accuracy"])
    
        history = train_model(model, callbacks, 0)
        
        train_loss, train_acc = model.evaluate(X_train_processed, y_train, verbose=0)
        val_loss, val_acc = model.evaluate(X_val_processed, y_val, verbose=0)
        
        scores.loc[i, 'N'] = i
        scores.loc[i, 'train_loss'] = train_loss
        scores.loc[i, 'train_acc'] = train_acc
        scores.loc[i, 'val_loss'] = val_loss
        scores.loc[i, 'val_acc'] = val_acc
        
    scores['train_loss'] = scores['train_loss'].astype(float)
    scores['train_acc'] = scores['train_acc'].astype(float)
    scores['val_loss'] = scores['val_loss'].astype(float)
    scores['val_acc'] = scores['val_acc'].astype(float)
    return scores

In [20]:
%%time
checkpoint = tf.keras.callbacks.ModelCheckpoint('lstm_model.h5', monitor='val_accuracy', save_best_only=True)
early_stoping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)
EPOCHS = 20
BATCH_SIZE = 64
callbacks = [early_stoping]
    
model = make_model()
summary =repeat_training(10)

Wall time: 28min 6s


In [21]:
summary.describe()

Unnamed: 0,train_loss,train_acc,val_loss,val_acc
count,10.0,10.0,10.0,10.0
mean,0.019413,0.994502,0.707115,0.845186
std,0.011881,0.002937,0.068768,0.007466
min,0.0106,0.987291,0.566508,0.834403
25%,0.011944,0.993289,0.685803,0.839538
50%,0.013529,0.996002,0.724931,0.844673
75%,0.020963,0.99643,0.745965,0.850128
max,0.046525,0.996573,0.812099,0.85751


In [22]:
model.evaluate(X_val_processed, y_val)



[0.6882692479811828, 0.55584085]