In [9]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Notebook-wide constants.
SEED = 42                   # reused for NumPy's global RNG and for the train/validation split
TARGET = "AI"               # name of the label column in the dataset
DATASET = "dataset.pickle"  # pickled DataFrame to load

# Seed NumPy's legacy global RNG so NumPy-based randomness is repeatable.
np.random.seed(SEED)

In [3]:
# Load the pickled DataFrame named by DATASET.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle(DATASET)
df

Unnamed: 0,crunchbase_ID,home_text,aboutus_text,overview_text,whatwedo_text,company_text,whoweare_text,AI
0,1916,Skip to main content Products GPU accelerated ...,,,,,,1
1,1917,Our AIs Research Company Careers Get in Touch ...,,,,Our AIs Research Company Careers Get in Touch ...,,1
2,1918,Toggle navigation Product Projects Company His...,,,,,,1
3,1919,Brainpeek Solutions Create a seamless online u...,Brainpeek Solutions Create a seamless online u...,,,,,1
4,1920,The Tool Our Languages Services Extract Produc...,The Tool Our Languages Services Extract Produc...,,,,,1
...,...,...,...,...,...,...,...,...
4889,2735,Username or Email L senord Remember me Norsk S...,Username or Email L senord Remember me Norsk S...,,,,,0
4890,5944,Solutions Solution for distributors Covered re...,,,,,,0
4891,5251,BROWSE PRODUCTS Variety Cases Pasta Mac and Ch...,,,,,,0
4892,4225,Pricing Documentation Community Changelog Logi...,,,,,,0


In [4]:
# Summarize column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4894 entries, 0 to 4893
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   crunchbase_ID  4894 non-null   object
 1   home_text      4894 non-null   object
 2   aboutus_text   2212 non-null   object
 3   overview_text  66 non-null     object
 4   whatwedo_text  50 non-null     object
 5   company_text   477 non-null    object
 6   whoweare_text  83 non-null     object
 7   AI             4894 non-null   int64 
dtypes: int64(1), object(7)
memory usage: 306.0+ KB


In [5]:
# Fraction of missing values in each column.
df.isna().mean()

crunchbase_ID    0.000000
home_text        0.000000
aboutus_text     0.548018
overview_text    0.986514
whatwedo_text    0.989783
company_text     0.902534
whoweare_text    0.983040
AI               0.000000
dtype: float64

In [6]:
# Columns that contain at least one missing value, displayed next to the label
# column. The label is referenced through TARGET rather than a repeated literal
# so it stays consistent with the rest of the notebook.
null_vars = [col for col in df.columns if df[col].isna().any()]
df[null_vars + [TARGET]]

Unnamed: 0,aboutus_text,overview_text,whatwedo_text,company_text,whoweare_text,AI
0,,,,,,1
1,,,,Our AIs Research Company Careers Get in Touch ...,,1
2,,,,,,1
3,Brainpeek Solutions Create a seamless online u...,,,,,1
4,The Tool Our Languages Services Extract Produc...,,,,,1
...,...,...,...,...,...,...
4889,Username or Email L senord Remember me Norsk S...,,,,,0
4890,,,,,,0
4891,,,,,,0
4892,,,,,,0


In [13]:
# Only the home-page text is used as model input; the cast makes every entry a str.
X = df['home_text'].values.astype(str)
y = df[TARGET].values

# Hold out 20% of the rows for validation; random_state=SEED keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [14]:
from nltk.tokenize import word_tokenize

def create_corpus(texts):
    """Tokenize every text and lowercase its tokens.

    Example: [ `This is a pen` ] becomes [ [ `this`, `is`, `a`, `pen` ] ].

    Arguments:
        texts: list(str) / Raw texts to tokenize.

    Returns:
        list(list(str)) / One list of lowercased tokens per input text,
        in the same order as `texts`.
    """
    return [
        [token.lower() for token in word_tokenize(text)]
        for text in texts
    ]

# Replace the raw strings with lists of lowercased tokens.
# NOTE(review): X_train / X_val are overwritten here, so re-running this cell
# without re-running the split would pass token lists (not strings) to
# create_corpus -- re-run the split cell first.
X_train = create_corpus(X_train)
X_val = create_corpus(X_val)

In [15]:
# Fixed sequence length: every encoded text is padded or truncated to this many tokens.
input_length = 35

def preprocess(X, tokenizer=None, padded=True):
    """Encode texts as fixed-length integer sequences.

    When no tokenizer is given, a new Keras `Tokenizer` (`num_words=10000`,
    `oov_token='<UNK>'`) is fitted on `X` and returned together with the
    encoded data. When a tokenizer is given, it is only applied, never refitted.

    Arguments:
        X: Texts to encode, in a form accepted by `Tokenizer.fit_on_texts` /
            `Tokenizer.texts_to_sequences` (the notebook passes lists of tokens).
        tokenizer: An already fitted `Tokenizer`, or None to fit a new one on `X`.
        padded: Kept for backward compatibility; it currently has no effect and
            the result is always padded to the module-level `input_length`.

    Returns:
        `(tokenizer, seq_padded)` when `tokenizer` was None,
        otherwise `seq_padded` alone.
    """
    fit_new = tokenizer is None
    if fit_new:
        tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')
        tokenizer.fit_on_texts(X)

    seq = tokenizer.texts_to_sequences(X)
    # Pad and truncate at the end so each row keeps its leading tokens.
    seq_padded = pad_sequences(seq, maxlen=input_length, padding='post', truncating='post')

    if fit_new:
        return tokenizer, seq_padded
    return seq_padded

In [17]:
# Fit the tokenizer on the training texts, then reuse that same tokenizer for validation.
tokenizer, X_train_processed  = preprocess(X_train)
X_val_processed = preprocess(X_val, tokenizer)

# Vocabulary size plus one extra slot.
# NOTE(review): presumably the +1 accounts for index 0, which the tokenizer
# does not assign to any word -- confirm.
input_dim = len(tokenizer.word_index)+1
# Embedding width; presumably must match the 100-d GloVe file read below -- confirm.
output_dim = 100

In [19]:
# Build a {word: vector} lookup from the GloVe text file.
embedding_dict = {}
word_index = tokenizer.word_index  # word -> integer index from the fitted tokenizer
with open('glove.6B.100d.txt', 'r', encoding="utf8") as glove_file:
    for entry in glove_file:
        # First field is the word; the remaining fields are its vector components.
        fields = entry.split()
        embedding_dict[fields[0]] = np.asarray(fields[1:], 'float32')

# Matrix shape: one row per tokenizer index, one column per embedding dimension.
max_words = input_dim
embedding_dims = output_dim

# Rows start as zeros; words without a pre-trained vector keep an all-zero row.
embedding_matrix = np.zeros((max_words, embedding_dims))
for word, i in word_index.items():
    # Valid rows are 0 .. max_words - 1, so an index equal to max_words is
    # already out of range and must be skipped as well.
    if i >= max_words:
        continue

    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec