**WORD EMBEDDINGS AND TEXT CLASSIFICATION USING DEEP NEURAL NETWORKS - HANDS-ON**

---




## Prepare text

1.   Create a csv file with required data from the dataset




In [1]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')     #train, test or all

In [None]:
print(newsgroups_train.DESCR)


In [None]:
newsgroups_train['target_names']

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [2]:
from hashlib import new
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

def twenty_newsgroup_to_csv():
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
    df = pd.DataFrame([newsgroups_train.data, newsgroups_train.target.tolist()]).T
    df.columns = ['text', 'target']
    targets = pd.DataFrame( newsgroups_train.target_names)
    targets.columns=['title']
    out = pd.merge(df, targets, left_on='target', right_index=True)
    out.to_csv('20_newsgroup.csv')
    
twenty_newsgroup_to_csv()
data_df =  pd.read_csv('20_newsgroup.csv')
data_df

Unnamed: 0.1,Unnamed: 0,text,target,title
0,0,I was wondering if anyone out there could enli...,7,rec.autos
1,17,I recently posted an article asking what kind ...,7,rec.autos
2,29,\nIt depends on your priorities. A lot of peo...,7,rec.autos
3,56,an excellent automatic can be found in the sub...,7,rec.autos
4,64,: Ford and his automobile. I need information...,7,rec.autos
...,...,...,...,...
11309,11210,Secrecy in Clipper Chip\n\nThe serial number o...,11,sci.crypt
11310,11217,Hi !\n\nI am interested in the source of FEAL ...,11,sci.crypt
11311,11243,"The actual algorithm is classified, however, t...",11,sci.crypt
11312,11254,\n\tThis appears to be generic calling upon th...,11,sci.crypt


In [3]:
import re
import nltk
from nltk.tokenize import RegexpTokenizer

In [4]:
def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)
    return " ".join(tokens)

data_df['cleanText']=data_df['text'].map(lambda s:preprocess(s))
data_df

Unnamed: 0.1,Unnamed: 0,text,target,title,cleanText
0,0,I was wondering if anyone out there could enli...,7,rec.autos,i was wondering if anyone out there could enli...
1,17,I recently posted an article asking what kind ...,7,rec.autos,i recently posted an article asking what kind ...
2,29,\nIt depends on your priorities. A lot of peo...,7,rec.autos,it depends on your priorities a lot of people ...
3,56,an excellent automatic can be found in the sub...,7,rec.autos,an excellent automatic can be found in the sub...
4,64,: Ford and his automobile. I need information...,7,rec.autos,ford and his automobile i need information on ...
...,...,...,...,...,...
11309,11210,Secrecy in Clipper Chip\n\nThe serial number o...,11,sci.crypt,secrecy in clipper chip the serial number of t...
11310,11217,Hi !\n\nI am interested in the source of FEAL ...,11,sci.crypt,hi i am interested in the source of feal encry...
11311,11243,"The actual algorithm is classified, however, t...",11,sci.crypt,the actual algorithm is classified however the...
11312,11254,\n\tThis appears to be generic calling upon th...,11,sci.crypt,this appears to be generic calling upon the na...


In [None]:
text = data_df['cleanText'].values.tolist()
label = data_df['target'].tolist()

# TF -IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

tfidf = TfidfVectorizer(ngram_range=(4,4), max_features=2000)
tf_emb = tfidf.fit_transform(text).toarray()
label = data_df['target'].tolist()

In [None]:
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

tf_emb[1]

In [None]:
X_train, X_val, y_train, y_val = train_test_split( tf_emb, label, test_size= 0.05, shuffle=True, random_state=42)

In [None]:
import sklearn
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100, random_state=0)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print(sklearn.metrics.confusion_matrix(y_val, y_pred))
print(sklearn.metrics.classification_report(y_val, y_pred))

# Fast text
Supports pre-trained word vectors for 157 languages, trained on Common Crawl and Wikipedia using fastText. These models were trained using CBOW with position-weights, in dimension 300, with character n-grams of length 5

In [None]:
!wget "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz" #Will take time

--2022-07-20 06:49:29--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 172.67.9.4, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz.1’


2022-07-20 06:52:36 (23.0 MB/s) - ‘cc.en.300.bin.gz.1’ saved [4503593528/4503593528]



In [None]:
!pip install fasttext
import fasttext
import fasttext.util

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 5.3 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.10.0-py3-none-any.whl (213 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3161547 sha256=0eb582ee8ec683999b5ddfbe70e2f2b9fca4549288000ce7caf79d409bcbdd6f
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.0


In [None]:
fasttext.util.download_model('en', if_exists='ignore') 

ft_model = fasttext.load_model('cc.en.300.bin')
f_emb = [ft_model.get_word_vector(words)for words in text]



In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split( f_emb, label, test_size= 0.05,shuffle=True,random_state=42)

## Ada Boost Classifier

In [None]:
import sklearn
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100, random_state=0)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print(sklearn.metrics.confusion_matrix(y_val, y_pred))
print(sklearn.metrics.classification_report(y_val, y_pred))

# Glove

GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus

Pre-trained word vectors:

Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 50d, 100d, 200d, & 300d vectors, 822 MB download): *glove.6B.zip*

Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download): *glove.42B.300d.zip*

Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download): *glove.840B.300d.zip*

Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 25d, 50d, 100d, & 200d vectors, 1.42 GB download): *glove.twitter.27B.zip*

glove : 
link https://nlp.stanford.edu/projects/glove/


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2022-07-20 05:40:59--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-07-20 05:40:59--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-07-20 05:41:00--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
import zipfile
zip_ref = zipfile.ZipFile('glove.6B.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

In [None]:
import numpy as np
def read_glove_vecs(glove_file):
    #input: file
    #output: word to 200d vector mapping output
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
    return word_to_vec_map

word_to_vec_map = read_glove_vecs('glove.6B.100d.txt') # 

In [None]:
#from tqdm import tqdm
def prepare_sequence(ds, word_to_vec_map):
    #input: Series, and word_to_vec_map of size(vocab_size,200)
    #output: returns shape of (len(ds), 200)
    traintest_X = []
    for sentence in tqdm(ds):
        sequence_words = np.zeros((word_to_vec_map['any'].shape))
        for word in sentence.split():
            if word in word_to_vec_map.keys():
                temp_X = word_to_vec_map[word]
            else:
                temp_X = word_to_vec_map['#']
            #print(temp_X)
            sequence_words+=(temp_X)/len(sentence)
            #print(sequence_words)
        traintest_X.append(sequence_words)
    return np.array(traintest_X)

In [None]:
glove_emb = prepare_sequence(data_df['cleanText'], word_to_vec_map)

100%|██████████| 11314/11314 [00:06<00:00, 1705.89it/s]


In [None]:
glove_emb[1]

In [None]:
X_train, X_val, y_train, y_val = train_test_split( glove_emb, label, test_size= 0.05,shuffle=True,random_state=42)

## Ada Boost Classifier

In [None]:
import sklearn
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100, random_state=0)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print(sklearn.metrics.confusion_matrix(y_val, y_pred))
print(sklearn.metrics.classification_report(y_val, y_pred))

# BERT

BERT and other Transformer encoder architectures have been wildly successful on a variety of tasks in NLP (natural language processing). They compute vector-space representations of natural language that are suitable for use in deep learning models. 

The BERT family of models uses the Transformer encoder architecture to process each token of input text in the full context of all tokens before and after, hence the name: Bidirectional Encoder Representations from Transformers.

BERT models are usually pre-trained on a large corpus of text, then fine-tuned for specific tasks.


**Install requirements**

In [None]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 4.3 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 33.9 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 60.2 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 71.6 MB/s 
Collecting toke

In [None]:
from sentence_transformers import SentenceTransformer

b_model = SentenceTransformer('bert-base-nli-mean-tokens') # alternate
bert_emb = b_model.encode(text, batch_size=50, show_progress_bar=True)
label = data_df['target'].tolist()

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/227 [00:00<?, ?it/s]

In [None]:
bert_emb[1]

In [None]:
len(bert_emb[1])

768

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split( bert_emb, label, test_size= 0.05,shuffle=True,random_state=42)

In [None]:
import sklearn
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100, random_state=0)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print(sklearn.metrics.confusion_matrix(y_val, y_pred))
print(sklearn.metrics.classification_report(y_val, y_pred))

# **BERT - Simple Transformers**


In [17]:
!pip install simpletransformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers
  Downloading simpletransformers-0.63.7-py3-none-any.whl (249 kB)
[K     |████████████████████████████████| 249 kB 5.2 MB/s 
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.1 MB/s 
[?25hCollecting streamlit
  Downloading streamlit-1.11.0-py2.py3-none-any.whl (9.1 MB)
[K     |████████████████████████████████| 9.1 MB 63.9 MB/s 
[?25hCollecting wandb>=0.10.32
  Downloading wandb-0.12.21-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 55.7 MB/s 
Collecting transformers>=4.6.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 44.3 MB/s 
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 65.8 MB/s 
[?25hCollecting tokenizers
  Downlo

**Classification Model**

https://simpletransformers.ai/docs/classification-models/

In [18]:
from simpletransformers.classification import ClassificationModel
model=ClassificationModel('bert','bert-base-cased',num_labels=20,use_cuda=True,args={
        "reprocess_input_data" : True,
        "use_cached_eval_features":False, 
        "overwrite_output_dir": True, 
        "num_train_epochs": 1})

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Model Training

In [5]:
data_df

Unnamed: 0.1,Unnamed: 0,text,target,title,cleanText
0,0,I was wondering if anyone out there could enli...,7,rec.autos,i was wondering if anyone out there could enli...
1,17,I recently posted an article asking what kind ...,7,rec.autos,i recently posted an article asking what kind ...
2,29,\nIt depends on your priorities. A lot of peo...,7,rec.autos,it depends on your priorities a lot of people ...
3,56,an excellent automatic can be found in the sub...,7,rec.autos,an excellent automatic can be found in the sub...
4,64,: Ford and his automobile. I need information...,7,rec.autos,ford and his automobile i need information on ...
...,...,...,...,...,...
11309,11210,Secrecy in Clipper Chip\n\nThe serial number o...,11,sci.crypt,secrecy in clipper chip the serial number of t...
11310,11217,Hi !\n\nI am interested in the source of FEAL ...,11,sci.crypt,hi i am interested in the source of feal encry...
11311,11243,"The actual algorithm is classified, however, t...",11,sci.crypt,the actual algorithm is classified however the...
11312,11254,\n\tThis appears to be generic calling upon th...,11,sci.crypt,this appears to be generic calling upon the na...


In [45]:
full_df = data_df.filter(['cleanText','target'], axis=1)
full_df

Unnamed: 0,cleanText,target
0,i was wondering if anyone out there could enli...,7
1,i recently posted an article asking what kind ...,7
2,it depends on your priorities a lot of people ...,7
3,an excellent automatic can be found in the sub...,7
4,ford and his automobile i need information on ...,7
...,...,...
11309,secrecy in clipper chip the serial number of t...,11
11310,hi i am interested in the source of feal encry...,11
11311,the actual algorithm is classified however the...,11
11312,this appears to be generic calling upon the na...,11


In [56]:
full_df = full_df.sample(frac=1).reset_index(drop=True)
full_df

Unnamed: 0,cleanText,target
0,new jersey pittsburgh first period pittsburgh ...,10
1,just wondering do you mean the lectorium rosic...,19
2,stuff deleted your logic is falty if christian...,15
3,does anyone know the phone number to a place w...,1
4,i believe you are right both scsi and scsi sup...,3
...,...,...
11309,can somebody help me out there i have just pur...,2
11310,ok the mets and o s are good examples but what...,9
11311,larry the subject content is serious as is the...,13
11312,wouldn t a a second monitor of similar type sc...,11


In [58]:
import numpy as np
tr_index = np.random.rand(len(full_df)) < 0.8 #find index for the train data(80%)

In [60]:
train_df = full_df[tr_index] # form the train data
train_df

Unnamed: 0,cleanText,target
1,just wondering do you mean the lectorium rosic...,19
3,does anyone know the phone number to a place w...,1
4,i believe you are right both scsi and scsi sup...,3
5,actually if a few minutes translates into hour...,16
6,hear hear thanks robbie you also don t read th...,7
...,...,...
11309,can somebody help me out there i have just pur...,2
11310,ok the mets and o s are good examples but what...,9
11311,larry the subject content is serious as is the...,13
11312,wouldn t a a second monitor of similar type sc...,11


In [61]:
test_df = full_df[~tr_index] # data other than train_data
test_df

Unnamed: 0,cleanText,target
0,new jersey pittsburgh first period pittsburgh ...,10
2,stuff deleted your logic is falty if christian...,15
18,hi netters i am looking for the list of univer...,12
22,press release no paris april users of esa s ol...,14
24,help i really got ripped off and i need some h...,3
...,...,...
11281,here s a simple way to convert the clipper pro...,11
11294,this post has all the earmarks of a form progr...,17
11295,kk bugunlerde jewish jokes muhabbetlerinden es...,17
11302,archive name typing injury faq keyboards versi...,13


In [62]:
model.train_model(train_df)

  0%|          | 0/9048 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1131 [00:00<?, ?it/s]

(1131, 1.2219611470621312)

Prediction

In [63]:
predictions, raw_outputs = model.predict(test_df['cleanText'].tolist())
print(predictions)

  0%|          | 0/2266 [00:00<?, ?it/s]

  0%|          | 0/284 [00:00<?, ?it/s]

[10 15 12 ... 17  3  4]


Performance

In [64]:
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.metrics import classification_report
print(classification_report(test_df['target'].to_list(), predictions))

              precision    recall  f1-score   support

           0       0.50      0.52      0.51        99
           1       0.61      0.55      0.58       101
           2       0.65      0.54      0.59       126
           3       0.52      0.63      0.57       120
           4       0.59      0.50      0.54       109
           5       0.83      0.84      0.83       146
           6       0.75      0.74      0.75       124
           7       0.49      0.83      0.62       120
           8       0.76      0.69      0.72       137
           9       0.77      0.78      0.77       105
          10       0.94      0.81      0.87       121
          11       0.76      0.76      0.76       100
          12       0.63      0.67      0.65       120
          13       0.86      0.82      0.84       137
          14       0.85      0.81      0.83       111
          15       0.54      0.84      0.66       102
          16       0.72      0.69      0.71        95
          17       0.81    