In [41]:
import numpy as np
import pandas as pd
import re

from sklearn.model_selection import train_test_split

import gensim
from gensim.models import Word2Vec

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [20]:
SEED = 42
np.random.seed(SEED)

TARGET = "AI"
DATASET = "dataset.pickle"

In [21]:
df = pd.read_pickle(DATASET)
df

Unnamed: 0,crunchbase_ID,home_text,aboutus_text,overview_text,whatwedo_text,company_text,whoweare_text,AI
0,1916,Skip to main content Products GPU accelerated ...,,,,,,1
1,1917,Our AIs Research Company Careers Get in Touch ...,,,,Our AIs Research Company Careers Get in Touch ...,,1
2,1918,Toggle navigation Product Projects Company His...,,,,,,1
3,1919,Brainpeek Solutions Create a seamless online u...,Brainpeek Solutions Create a seamless online u...,,,,,1
4,1920,The Tool Our Languages Services Extract Produc...,The Tool Our Languages Services Extract Produc...,,,,,1
...,...,...,...,...,...,...,...,...
4889,2735,Username or Email L senord Remember me Norsk S...,Username or Email L senord Remember me Norsk S...,,,,,0
4890,5944,Solutions Solution for distributors Covered re...,,,,,,0
4891,5251,BROWSE PRODUCTS Variety Cases Pasta Mac and Ch...,,,,,,0
4892,4225,Pricing Documentation Community Changelog Logi...,,,,,,0


In [22]:
X = df['home_text'].values.astype(str)
y = df[TARGET].values

In [32]:
%%time

tokens = []
tokens = [word_tokenize(str(sent)) for sent in X]

Wall time: 31.1 s


In [79]:
stop_words = set(stopwords.words('english'))
filterd_words = []
for words in tokens:
    x = [word for word in words if word not in stop_words] #if word not in stopwords]   
    filterd_words.append(x)   

In [80]:
    
# taking the root of every word
lemmatized = []
for words in filterd_words:
    x = [WordNetLemmatizer().lemmatize(word) for word in words]
    lemmatized.append(x)

In [81]:
# taking words that are bigger than 2    
filterd = []
for words in lemmatized:
    x = [word for word in words if len(word) >= 2]
    filterd.append(x)

In [83]:
X_train, X_test, y_train, y_test = train_test_split(filterd, y, test_size=0.1, random_state=SEED)

In [84]:
input_length = 35

def preprocess(X, tokenizer=None, padded=True):
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>', lower=False)
        tokenizer.fit_on_texts(X)
        seq = tokenizer.texts_to_sequences(X)
        tmp = seq
        seq_padded = pad_sequences(seq, maxlen=input_length, padding='post', truncating='post')
        return tokenizer, seq_padded
    seq = tokenizer.texts_to_sequences(X)
    seq_padded = pad_sequences(seq, maxlen=input_length, padding='post', truncating='post')
    return seq_padded

In [85]:
tokenizer, X_train_processed  = preprocess(X_train)
X_val_processed = preprocess(X_val, tokenizer)

input_dim = len(tokenizer.word_index)+1
output_dim = 100

In [86]:
embedding_dict = {}
word_index = tokenizer.word_index
with open('glove.6B.100d.txt','r', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors

max_words = input_dim
embedding_dims = output_dim

embedding_matrix = np.zeros((max_words, embedding_dims))
for word, i in word_index.items():
    if i > max_words:
        continue
        
    emb_vec = embedding_dict.get(word)    
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec  

In [87]:
def make_model():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Embedding(input_dim, output_dim, input_length=input_length, embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix)),
        tf.keras.layers.LSTM(20, activation='tanh', return_sequences=False),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


def train_model(model, verbose=0):
    history = model.fit(X_train_processed, y_train, validation_data=(X_val_processed, y_val), epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=callbacks, verbose=verbose)
    return history

In [88]:
%%time

EPOCHS = 20
BATCH_SIZE = 64
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)
]
model = make_model()
summary = train_model(model, 1)

Train on 4404 samples, validate on 979 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Wall time: 2min 4s


In [89]:
model.evaluate(X_val_processed, y_val)



[0.6843257069138694, 0.84167516]