<a href="https://colab.research.google.com/github/HebaSedik/deep-learning/blob/master/text_classification_using_RNN_steps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***load & pre-process data***

In [3]:
import csv
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
data = pd.read_csv("/content/bbc-text.csv")
data

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
1833,business,us gives foreign firms extra time foreign firm...
1834,politics,school sport is back says pm tony blair has...
1835,sport,henman overcomes rival rusedski tim henman sav...
1836,sport,saint-andre anger at absent stars sale sharks ...


In [4]:
vocab_size=1000
embedding_dim=16
max_length=120
trunc_type='post'
padding_type='post'
oov_tok="<OOV>"  #out of vocabulary
training_portion=0.8

In [5]:
sentences =[]
labels=[]
stopwords=['a','about','above','after','again' ,"all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into","is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that","that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was","we","we'd","we'll", "we're","we've","were","what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"]
print(len(stopwords))

152


In [6]:
with open("/content/bbc-text.csv", 'r') as csvfile:
  reader = csv.reader(csvfile,delimiter=',')
  for row in reader:
    labels.append(row[0])
    sentence = row[1]
    for word in stopwords:
      token = " "+word+" "
      sentence=sentence.replace(token, " ")
    sentences.append(sentence)
  print(len(labels))
  print(len(sentences))
  print(sentences[1])

2226
2226
tv future hands viewers home theatre systems  plasma high-definition tvs  digital video recorders moving living room  way people watch tv will radically different five years  time.  according expert panel gathered annual consumer electronics show las vegas discuss new technologies will impact one favourite pastimes. us leading trend  programmes content will delivered viewers via home networks  cable  satellite  telecoms companies  broadband service providers front rooms portable devices.  one talked-about technologies ces digital personal video recorders (dvr pvr). set-top boxes  like us s tivo uk s sky+ system  allow people record  store  play  pause forward wind tv programmes want.  essentially  technology allows much personalised tv. also built-in high-definition tv sets  big business japan us  slower take off europe lack high-definition programming. not can people forward wind adverts  can also forget abiding network channel schedules  putting together a-la-carte entertai

In [7]:
train_size=int(len(sentences) *training_portion)
train_sentences= sentences[:train_size]
train_labels=labels[:train_size]
validation_sentences= sentences[train_size:]
validation_labels=labels[train_size:]
print(train_size)
print(len(train_sentences))
print(len(train_labels))
print(len(validation_sentences))
print(len(validation_labels))

1780
1780
1780
446
446


In [10]:
tokenizer=Tokenizer(num_words= vocab_size,oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index=tokenizer.word_index
train_sequences=tokenizer.texts_to_sequences(train_sentences)
train_padded=pad_sequences(train_sequences,padding= padding_type,maxlen=max_length)
print(len(train_sequences[1]))
print(len(train_padded[0]))
print(len(train_sequences[2]))
print(len(train_padded[1]))
print(len(train_sequences[11]))
print(len(train_padded[10]))


449
120
200
120
192
120


In [11]:
validation_sequences=tokenizer.texts_to_sequences(validation_sentences)
validation_padded=pad_sequences(train_sequences,padding= padding_type,maxlen=max_length)
print(len(validation_sequences))
print(validation_padded.shape)

446
(1780, 120)


In [12]:
label_tokenizer=Tokenizer()
label_tokenizer.fit_on_texts(labels)
train_label_seq=np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq=np.array(label_tokenizer.texts_to_sequences(validation_labels))
print(train_label_seq[0])
print(train_label_seq[1])
print(train_label_seq[2])
print(train_label_seq.shape)

[6]
[4]
[2]
(1780, 1)


In [13]:
print(validation_label_seq[0])
print(validation_label_seq[1])
print(validation_label_seq[2])
print(validation_label_seq.shape)

[5]
[5]
[4]
(446, 1)


# ***Building and train RNN model***

In [14]:
model=tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(6,activation='softmax')
    ])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 16)           16000     
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 24)                408       
                                                                 
 dense_1 (Dense)             (None, 6)                 150       
                                                                 
Total params: 16,558
Trainable params: 16,558
Non-trainable params: 0
_________________________________________________________________


In [18]:
num_epochs=30
history=model.fit(train_padded, train_label_seq,epochs= num_epochs,validation_data=(validation_padded, validation_label_seq),verbose=2)

Epoch 1/30


InvalidArgumentError: ignored

Plot model train

In [None]:
import matplotlib.pyplot as plt
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("epochs")
  plt.ylabel(string)
  plt.legend([string,'val_'+string])
  plt.show()
plot_graphs(history,"accuracy")
plot_graphs(history,"loss")

dataset
# ***https://maharatech.gov.eg/mod/resource/view.php?id=7868***