<a href="https://colab.research.google.com/github/MuhammadAbdullah-hash/NLP-for-text-classification/blob/master/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import tensorflow
import keras
import sklearn.model_selection
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [67]:
# ############ Loading and splitting data set ################# #

train_data = pd.read_csv('train.csv')

tweet = np.array(train_data['text'] , dtype = 'str')
target = np.array(train_data['target'])

x_train = tweet[0:6850]
y_train  = target[0:6850]

x_test = tweet[6851:]
y_test  = target[6851:]


print(x_train.dtype)
print(y_train.dtype)

print(x_train.shape , y_train.shape)
print(x_test.shape , y_test.shape)

<U157
int64
(6850,) (6850,)
(762,) (762,)


In [0]:
# ############ Tokenizing / Sequencing / Padding ############### #

tokenizer = Tokenizer(num_words = 10000 , oov_token= "<OOV>")
tokenizer.fit_on_texts(x_train) ######### This only generates words data base of train #########
word_index = tokenizer.word_index

# ######### We dont tokenize test so that we dont have dat base of test words ############# #

sequence_train = tokenizer.texts_to_sequences(x_train)
sequence_test = tokenizer.texts_to_sequences(x_test)

pad_train = pad_sequences(sequence_train , padding = 'pre')
pad_test = pad_sequences(sequence_test , padding = 'pre')

# print(pad_train[0].size)
# print(pad_test[0].size)
# print(pad_test[0])

# print(x_train[0])
# print(sequence_train[0])
# print()
# print(x_test[0])
# print(sequence_test[0])

In [0]:
 # ############# Model Creation ############### #

model = keras.Sequential([
keras.layers.Embedding( 10000 , 16 ),  # ######### Creates vectors in diff dimensions ############ #
keras.layers.GlobalAveragePooling1D(), ###### Sum up vectors to understand context ########

# #### Output and Dense layers #### #

keras.layers.Dense(24, activation='relu'),
keras.layers.Dense(1, activation='sigmoid'),

])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [83]:
model.summary()

model.fit(pad_train , y_train , epochs=30 , validation_data=(pad_test , y_test))

loss,  acc = model.evaluate(pad_test , y_test)

print(loss , acc)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 16)          160000    
_________________________________________________________________
global_average_pooling1d_2 ( (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 24)                408       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 6850 samples, validate on 762 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
1.1646727813222002 0.7467191815376282


In [99]:
# ######### Checking on validation data set ############ #

predictions = model.predict(pad_test)

for i in range(len(predictions)):
  print(  np.round(predictions[i] , 0)   , '\t' , y_test[i])

[1.] 	 0
[0.] 	 1
[1.] 	 1
[0.] 	 1
[1.] 	 1
[0.] 	 1
[0.] 	 0
[0.] 	 0
[0.] 	 0
[1.] 	 0
[1.] 	 0
[1.] 	 1
[1.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 1
[1.] 	 1
[1.] 	 0
[1.] 	 0
[0.] 	 0
[1.] 	 0
[1.] 	 0
[1.] 	 1
[0.] 	 0
[0.] 	 0
[1.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[1.] 	 0
[0.] 	 0
[0.] 	 0
[1.] 	 1
[0.] 	 0
[0.] 	 0
[1.] 	 0
[1.] 	 0
[1.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[1.] 	 1
[0.] 	 0
[0.] 	 0
[1.] 	 0
[0.] 	 0
[0.] 	 0
[1.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[1.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 1
[0.] 	 0
[0.] 	 1
[0.] 	 0
[1.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 1
[0.] 	 0
[0.] 	 1
[0.] 	 0
[0.] 	 0
[0.] 	 0
[1.] 	 0
[0.] 	 1
[0.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[1.] 	 0
[0.] 	 0
[1.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[1.] 	 1
[1.] 	 1
[1.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 0
[0.] 	 1
[0.] 	 0
[0.] 	 0
[0.] 	 1
[1.] 	 1
[0.] 	 0
[0.] 	 0
[1.] 	 0
[0.] 	 0
[1.] 	 0
[1.] 	 1
[0.] 	 0
[

In [116]:
# ########### Making sample submission file ############### #

data = pd.read_csv('test.csv')

sen = np.array(data['text'])
id_num = np.array(data['id'])


sequence_sen = tokenizer.texts_to_sequences(sen)
pad_sen = pad_sequences(sequence_sen , padding='pre')


# print(sen[0])
# print(sequence_sen[0])
# print(pad_sen[0])

# ############## Predictions ############## #

prediction = model.predict(pad_sen)
prediction = np.round(prediction , 0)
final_add = [int(j) for i in prediction for j in i]

print(final_add[0])
print(type(final_add[0]))
z  = { 
'id' :  id_num ,    
'target' : final_add , 
}

sample = pd.DataFrame(z)
sample.to_csv('Sample.csv')



# for i in range(len(prediction)):
#   print( prediction[i])
#   z.update(  {  }  )
#   # print( np.round(  prediction[i])   )

0
<class 'int'>
