In [None]:
import pandas as pd
import numpy as np
import pickle
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds
from pathlib import Path

In [None]:
# Mount Google Drive at /content/drive so that the dataset paths used by the
# following cells (all rooted under /content/drive) resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# For reproducibility
np.random.seed(1237)
import pathlib
# Source file directory
path_train = "/content/drive/MyDrive/text classification/bbc"

# load_content=False: only the file paths and label indices are collected
# here; the article text is read explicitly below.
files_train = skds.load_files(path_train,load_content=False)

label_index = files_train.target
label_names = files_train.target_names
labelled_files = files_train.filenames

data_tags = ["filename","category","news"]

# Pair each file path with its label index via zip instead of a hand-maintained
# counter, and read with an explicit encoding (the same one the later
# per-category loader uses) rather than the platform default.
data_list = [
    (f, label_names[idx], pathlib.Path(f).read_text(encoding="utf-8"))
    for f, idx in zip(labelled_files, label_index)
]

# One DataFrame row per article: filename, category, news
data = pd.DataFrame.from_records(data_list, columns=data_tags)
data

Unnamed: 0,filename,category,news
0,/content/drive/MyDrive/text classification/bbc...,tech,Net regulation 'still possible'\n\nThe blurrin...
1,/content/drive/MyDrive/text classification/bbc...,business,VW considers opening Indian plant\n\nVolkswage...
2,/content/drive/MyDrive/text classification/bbc...,business,Court rejects $280bn tobacco case\n\nA US gove...
3,/content/drive/MyDrive/text classification/bbc...,business,AstraZeneca hit by drug failure\n\nShares in A...
4,/content/drive/MyDrive/text classification/bbc...,business,J&J agrees $25bn Guidant deal\n\nPharmaceutica...
...,...,...,...
1515,/content/drive/MyDrive/text classification/bbc...,politics,Labour chooses Manchester\n\nThe Labour Party ...
1516,/content/drive/MyDrive/text classification/bbc...,politics,Report attacks defence spending\n\nThe Ministr...
1517,/content/drive/MyDrive/text classification/bbc...,tech,"Rich pickings for hi-tech thieves\n\nViruses, ..."
1518,/content/drive/MyDrive/text classification/bbc...,entertainment,McCririck out of Big Brother show\n\nRacing pu...


In [None]:
# Positional hold-out split: the first 80% of rows train the model and the
# remaining 20% are kept for testing.
train_size = int(.8 * len(data))

train_rows = data.iloc[:train_size]
test_rows = data.iloc[train_size:]

train_posts = train_rows['news']
train_tags = train_rows['category']
train_files_names = train_rows['filename']

test_posts = test_rows['news']
test_tags = test_rows['category']
test_files_names = test_rows['filename']

In [None]:
# Hyper-parameters: 4 news groups, vocabulary capped at 15000 words,
# mini-batches of 100 samples.
num_labels, vocab_size, batch_size = 4, 15000, 100

# The vocabulary is learned from the training posts only.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_posts)

# One TF-IDF row per post for both splits.
x_train, x_test = (
    tokenizer.texts_to_matrix(posts, mode='tfidf')
    for posts in (train_posts, test_posts)
)

# One-hot encode the category names; the class order is fixed by the training tags.
encoder = LabelBinarizer().fit(train_tags)
y_train, y_test = (encoder.transform(tags) for tags in (train_tags, test_tags))

In [None]:
# Feed-forward classifier over the TF-IDF vectors: three hidden
# Dense -> ReLU -> Dropout(0.3) stages (512, 256, 128 units) followed by a
# softmax layer with one unit per label.
model = Sequential([
    Dense(512, input_shape=(vocab_size,)),
    Activation('relu'),
    Dropout(0.3),

    Dense(256),
    Activation('relu'),
    Dropout(0.3),

    Dense(128),
    Activation('relu'),
    Dropout(0.3),

    Dense(num_labels),
    Activation('softmax'),
])

model.summary()

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# 10% of the training rows are held back by Keras for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=5,
    verbose=1,
    validation_split=0.1,
)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 512)               7680512   
_________________________________________________________________
activation_3 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
activation_4 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 128)              

In [None]:
# Loss and accuracy on the held-out split.
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)

print('Test accuracy:', score[1])
print(score)

# Class names in the column order produced by the label encoder.
text_labels = encoder.classes_
print(text_labels)

# Spot-check the first ten test articles one at a time.
for i in range(10):
    prediction = model.predict(x_test[i:i + 1])
    predicted_label = text_labels[prediction[0].argmax()]
    print(predicted_label)
    print(test_files_names.iloc[i])
    print(prediction)
    print(f"Actual label:{test_tags.iloc[i]}")
    print(f"Predicted label: {predicted_label}")

Test accuracy: 0.9671052694320679
[0.1782637983560562, 0.9671052694320679]
['business' 'entertainment' 'politics' 'tech']
tech
/content/drive/MyDrive/text classification/bbc/tech/283.txt
[[3.5508184e-14 6.0780220e-16 4.2555460e-15 1.0000000e+00]]
Actual label:tech
Predicted label: tech
entertainment
/content/drive/MyDrive/text classification/bbc/entertainment/076.txt
[[9.0399063e-11 1.0000000e+00 1.3545637e-09 1.0047584e-08]]
Actual label:entertainment
Predicted label: entertainment
entertainment
/content/drive/MyDrive/text classification/bbc/entertainment/205.txt
[[2.4268488e-04 9.8494339e-01 3.6677278e-03 1.1146266e-02]]
Actual label:entertainment
Predicted label: entertainment
business
/content/drive/MyDrive/text classification/bbc/business/148.txt
[[9.9979109e-01 2.5054817e-07 1.9407533e-04 1.4652585e-05]]
Actual label:business
Predicted label: business
business
/content/drive/MyDrive/text classification/bbc/business/161.txt
[[9.9975365e-01 2.4006132e-08 2.4549730e-04 8.6029343e-07

In [None]:
# creates a HDF5 file 'my_model.h5'
# Model.save() returns None, so its result must not be assigned back to
# `model`; doing so would discard the trained model from the session.
model.save('my_model.h5')

# Save Tokenizer i.e. Vocabulary
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# load our saved model
from keras.models import load_model
model = load_model('my_model.h5')

# load tokenizer
# The pickled object replaces `tokenizer` outright, so no placeholder
# Tokenizer() needs to be constructed first.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook or another trusted source.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# The label encoder is not persisted by this notebook; this reads the classes
# from the `encoder` object still held in memory from training.
encoder.classes_ #LabelBinarizer

array(['business', 'entertainment', 'politics', 'tech'], dtype='<U13')

In [None]:
# These are the labels we stored from our training
# The order is very important here: it must match encoder.classes_.

labels = np.array(['business', 'entertainment', 'politics', 'tech'])

test_files = ["/content/drive/My Drive/text classification/news.txt"]

# Read every file to classify into memory.
x_data = [Path(t_f).read_text() for t_f in test_files]

x_data_series = pd.Series(x_data)
x_tokenized = tokenizer.texts_to_matrix(x_data_series, mode='tfidf')

# Iterate files and their feature rows together so each prediction is reported
# against its own file (a separate counter that is never advanced would label
# every prediction with the first file name).
for t_f, x_t in zip(test_files, x_tokenized):
    prediction = model.predict(np.array([x_t]))
    predicted_label = labels[np.argmax(prediction[0])]
    print(prediction)
    print("File ->", t_f, "Predicted label: " + predicted_label)

[[0.8029984  0.03982589 0.04494743 0.1122283 ]]
File -> /content/drive/My Drive/text classification/news.txt Predicted label: business


In [None]:
# TEXT CLASSIFICATION USING LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pandas as pd


embedding_dim=50
model=Sequential()
# input_length follows vocab_size (the width of the x_train matrix) instead of
# repeating the literal 15000.
# NOTE(review): x_train holds TF-IDF weights, whereas an Embedding layer looks
# up integer token indices - confirm the intended input before training this.
model.add(layers.Embedding(input_dim=vocab_size,output_dim=embedding_dim,input_length=vocab_size))
model.add(layers.LSTM(units=50,return_sequences=True))
model.add(layers.LSTM(units=10))
model.add(layers.Dropout(0.5))

model.add(layers.Dense(16))
# One softmax unit per category: the labels are one-hot encoded over
# num_labels classes, so a hard-coded width of 3 would not match y_train.
model.add(layers.Dense(num_labels, activation="softmax"))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['accuracy'])
model.summary()

/content/drive/My Drive/text classification


In [None]:
!unzip bbc.zip

Archive:  bbc.zip
   creating: bbc/
   creating: bbc/business/
  inflating: bbc/business/001.txt    
  inflating: bbc/business/002.txt    
  inflating: bbc/business/003.txt    
  inflating: bbc/business/004.txt    
  inflating: bbc/business/005.txt    
  inflating: bbc/business/006.txt    
  inflating: bbc/business/007.txt    
  inflating: bbc/business/008.txt    
  inflating: bbc/business/009.txt    
  inflating: bbc/business/010.txt    
  inflating: bbc/business/011.txt    
  inflating: bbc/business/012.txt    
  inflating: bbc/business/013.txt    
  inflating: bbc/business/014.txt    
  inflating: bbc/business/015.txt    
  inflating: bbc/business/016.txt    
  inflating: bbc/business/017.txt    
  inflating: bbc/business/018.txt    
  inflating: bbc/business/019.txt    
  inflating: bbc/business/020.txt    
  inflating: bbc/business/021.txt    
  inflating: bbc/business/022.txt    
  inflating: bbc/business/023.txt    
  inflating: bbc/business/024.txt    
  inflating: bbc/business

In [None]:
import pandas
import os
news = []

bbc_dir = '/content/drive/My Drive/text classification/bbc'

# Category folders, excluding 'sport'. Filtering (rather than list.remove)
# does not raise when the folder is absent, and sorting makes the order
# independent of the arbitrary order os.listdir returns.
cat = sorted(name for name in os.listdir(bbc_dir) if name != 'sport')
print(cat)

# File names of the business articles, in a stable order.
file = sorted(os.listdir(bbc_dir + '/business'))
print(file)

['business', 'entertainment', 'politics', 'tech']
['247.txt', '337.txt', '301.txt', '328.txt', '152.txt', '123.txt', '353.txt', '308.txt', '190.txt', '313.txt', '070.txt', '321.txt', '329.txt', '258.txt', '225.txt', '217.txt', '155.txt', '101.txt', '344.txt', '380.txt', '358.txt', '310.txt', '277.txt', '067.txt', '091.txt', '241.txt', '239.txt', '338.txt', '267.txt', '047.txt', '186.txt', '098.txt', '198.txt', '268.txt', '280.txt', '342.txt', '213.txt', '359.txt', '244.txt', '179.txt', '004.txt', '317.txt', '351.txt', '339.txt', '231.txt', '232.txt', '142.txt', '243.txt', '238.txt', '105.txt', '035.txt', '160.txt', '057.txt', '054.txt', '021.txt', '187.txt', '201.txt', '175.txt', '269.txt', '042.txt', '129.txt', '150.txt', '032.txt', '093.txt', '087.txt', '205.txt', '294.txt', '058.txt', '246.txt', '075.txt', '154.txt', '284.txt', '349.txt', '372.txt', '052.txt', '002.txt', '141.txt', '156.txt', '218.txt', '245.txt', '083.txt', '371.txt', '085.txt', '028.txt', '099.txt', '374.txt', '16

In [None]:

# Read every article of every category in `cat` into a single-column frame.
#
# Each category's own folder is listed and read, instead of opening file names
# listed from one category under another category's folder. The list is
# rebuilt from scratch so re-running the cell does not append duplicates, the
# category list is no longer modified inside the loop, and read_text closes
# each file after reading.
bbc_dir = Path("/content/drive/My Drive/text classification/bbc")

news = []
for category in cat:
    category_dir = bbc_dir / category
    if not category_dir.is_dir():
        # Skip entries that are not category folders.
        continue
    for article in sorted(category_dir.glob("*.txt")):
        news.append(article.read_text(encoding='utf-8'))

data4 = pandas.DataFrame({'news': news})
data4

Unnamed: 0,news
0,Renault boss hails 'great year'\n\nStrong sale...
1,Building giant in asbestos payout\n\nAustralia...
2,Libya takes $1bn in unfrozen funds\n\nLibya ha...
3,Bush to get 'tough' on deficit\n\nUS president...
4,Crude oil prices back above $50\n\nCold weathe...
...,...
1515,Microsoft takes on desktop search\n\nMicrosoft...
1516,Halo 2 heralds traffic explosion\n\nThe growin...
1517,More power to the people says HP\n\nThe digita...
1518,Mobile picture power in your pocket\n\nHow man...


In [None]:
# Wrap data4 in a DataFrame again and display it.
# NOTE(review): the preceding cell already assigns a DataFrame to data4, so
# this looks redundant, and the recorded output of this cell is a ValueError -
# confirm whether the cell can be dropped.
data4 = pandas.DataFrame(data4)
data4

ValueError: ignored

In [None]:
# Shuffled copy of `data`: frac=1 samples every row, in random order.
# No random_state is passed here.
data1 = data.sample(frac=1)

In [None]:
# Shuffle the rows of data2 and rebind the same name.
# NOTE(review): no assignment that creates data2 is visible in this notebook;
# presumably it came from a cell that was deleted or is not shown - confirm,
# otherwise this raises NameError on a fresh kernel.
data2 = data2.sample(frac=1)

In [None]:
# Shuffle the rows of data3 and rebind the same name.
# NOTE(review): no assignment that creates data3 is visible in this notebook;
# presumably it came from a cell that was deleted or is not shown - confirm,
# otherwise this raises NameError on a fresh kernel.
data3 = data3.sample(frac=1)

In [None]:
# Shuffle the rows of data4 and rebind the same name (frac=1 keeps every row).
# Re-running the cell reshuffles the already shuffled frame.
data4 = data4.sample(frac=1)

In [None]:
# Stack the four shuffled frames row-wise. Each frame keeps its own index
# labels, so labels can repeat in the result.
shuffled_frames = [data1, data2, data3, data4]
final_data = pandas.concat(shuffled_frames)
final_data

Unnamed: 0,news
66,Wall Street cheers Bush victory\n\nThe US stoc...
346,Jobs growth still slow in the US\n\nThe US cre...
309,Fresh hope after Argentine crisis\n\nThree yea...
127,Car giant hit by Mercedes slump\n\nA slump in ...
77,Industrial revival hope for Japan\n\nJapanese ...
...,...
16,Macy's owner buys rival for $11bn\n\nUS retail...
28,Nasdaq planning $100m share sale\n\nThe owner ...
152,Soaring oil 'hits world economy'\n\nThe soarin...
156,Cuba winds back economic clock\n\nFidel Castro...


In [None]:
import pickle
import sys

# Round-trip a small dictionary through a pickle file.
# The sample is bound to its own name instead of `dict`, so the builtin type
# stays usable afterwards, and `with` closes each file even if dump or load
# raises.
sample = {1:'hello',2:'2'}
with open('file1','wb') as file_1:
    pickle.dump(sample,file_1)

with open('file1','rb') as file_2:
    obj=pickle.load(file_2)
print(obj)


{1: 'hello', 2: '2'}


In [None]:
# Shuffle the rows of data4 again (frac=1 keeps every row), rebind the same
# name, and display the result. An earlier cell performs the same shuffle.
data4 = data4.sample(frac=1)
data4

Unnamed: 0,news
1195,A question of trust and technology\n\nA major ...
1231,"Yahoo celebrates a decade online\n\nYahoo, one..."
1433,Games firms 'face tough future'\n\nUK video ga...
787,Lib Dems highlight problem debt\n\nPeople vuln...
93,S Korean credit card firm rescued\n\nSouth Kor...
...,...
89,Ukraine revisits state sell-offs\n\nUkraine is...
71,Iraqi voters turn to economic issues\n\nBeyond...
201,Safety alert as GM recalls cars\n\nThe world's...
1104,UKIP candidate suspended in probe\n\nEuroscept...


In [None]:
# TEXT CLASSIFICATION USING LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pandas as pd


embedding_dim=50
# Number of tokens kept per article (tunable).
max_len=500

# An Embedding layer looks up integer word indices, so the LSTM is trained on
# padded index sequences built with the fitted tokenizer rather than on the
# TF-IDF float matrix (x_train). This also replaces the 15000-step sequence
# with max_len steps.
x_train_seq = pad_sequences(tokenizer.texts_to_sequences(train_posts), maxlen=max_len)

model=Sequential()
model.add(layers.Embedding(input_dim=vocab_size,output_dim=embedding_dim,input_length=max_len))
model.add(layers.LSTM(units=300,return_sequences=True))
model.add(layers.LSTM(units=100))
model.add(layers.Dropout(0.5))

model.add(layers.Dense(16))
# One softmax unit per category, following num_labels rather than a literal.
model.add(layers.Dense(num_labels, activation="softmax"))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['accuracy'])
model.summary()
history = model.fit(x_train_seq, y_train,
                    batch_size=batch_size,
                    epochs=5,
                    verbose=1)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 15000, 50)         750000    
_________________________________________________________________
lstm_8 (LSTM)                (None, 15000, 300)        421200    
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 16)                1616      
_________________________________________________________________
dense_9 (Dense)              (None, 4)                 68        
Total params: 1,333,284
Trainable params: 1,333,284
Non-trainable params: 0
____________________________________________