<a href="https://colab.research.google.com/github/Nesan135/Django_app_internship/blob/master/Text_Classification_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import tensorflow, unicodedata, re, contractions, string, spacy, time, textwrap, os, datetime, pickle, json
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding
from tensorflow.keras import Sequential
from sklearn.metrics import classification_report,confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint

# Load the dataset. The CSV is looked up at <current working dir>/Dataset/True.csv.
CSV_PATH = os.path.join(os.getcwd(), 'Dataset', 'True.csv')
df = pd.read_csv(CSV_PATH)

# EDA
df.info()  # info() prints its own report and returns None, so no print() wrapper
print(df.head(5))
print(df.duplicated().sum())
print(df.isna().sum())
print(df.columns)
print(df['subject'].unique())

# drop_duplicates() returns a new frame instead of modifying df in place, so
# the result must be assigned back. The index is reset so that row labels stay
# contiguous (0..n-1) for any later positional access.
df = df.drop_duplicates().reset_index(drop=True)

# Observations: no null data, contains duplicates (206), all data are strings,
# values to predict: ['politicsNews' 'worldnews']

# Cleaning
def expand_contractions(text):
    """Expand contractions in ``text`` one whitespace-separated word at a time.

    Every piece produced by ``text.split()`` is passed through
    ``contractions.fix`` and the results are joined back with single spaces,
    so any run of whitespace in the input collapses to one space.
    """
    return ' '.join(contractions.fix(piece) for piece in text.split())

def lemmatize(text, nlp):
    """Return ``text`` with each token replaced by its lemma.

    ``nlp`` is called on ``text`` and the result is iterated; every yielded
    token must expose a ``lemma_`` attribute (as the tokens of a loaded spaCy
    pipeline do). The lemmas are joined with single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(text))

def remove_stopwords(text, nlp):
    """Return ``text`` with stop-word tokens removed.

    ``nlp`` is called on ``text`` and the result is iterated; every yielded
    token must expose ``is_stop`` and ``text`` attributes (as the tokens of a
    loaded spaCy pipeline do). Tokens whose ``is_stop`` flag is falsy are kept
    and joined with single spaces.
    """
    # Truthiness test instead of `== False` (PEP 8), built in one pass.
    return ' '.join(token.text for token in nlp(text) if not token.is_stop)

# spaCy pipeline used for lemmatization and stop-word detection; the parser
# and NER components are disabled when loading.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
fake_commas = [['“','"'],['”','"'],['‘',"'"],['’',"'"]]
# Translation table built from the pairs above: typographic quote -> ASCII quote.
quote_table = str.maketrans(dict(fake_commas))

# The patterns do not change between rows, so compile them once up front.
tag_pattern = re.compile(r'@\S*')
html_pattern = re.compile('<.*?>')
# http(s):// and www. links plus bare bit.ly short links.
url_pattern = re.compile(r'(?:https?://|www\.)\S+|bit\.ly/\S*')
# Anything that is not an ASCII letter, basic punctuation or whitespace.
# The range is A-Z: the previous A-z also admitted [ \ ] ^ _ and the backtick.
disallowed_pattern = re.compile(r'[^a-zA-Z.,!?/:;\"\'\s]')

start = time.time()
counter = 0
cleaned_texts = []

for counter, data in enumerate(df['text'], start=1):
    # Replace typographic quotes with ASCII quotes. This has to happen BEFORE
    # the ASCII fold below, which drops every non-ASCII character (curly
    # quotes included) and would otherwise leave nothing to replace.
    data = data.translate(quote_table)
    # Standardizing accent characters
    data = unicodedata.normalize('NFKD', data).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # remove @tags
    data = tag_pattern.sub('', data)
    # remove HTML tags
    data = html_pattern.sub('', data)
    # remove URLs
    data = url_pattern.sub('', data)
    # replace special characters and digits with a space, then lower-case
    data = disallowed_pattern.sub(' ', data).lower()
    # expand contractions (needs the apostrophes, so it runs before the
    # punctuation strip)
    data = expand_contractions(data)
    # remove punctuation
    data = ''.join([c for c in data if c not in string.punctuation])
    data = lemmatize(data, nlp)
    data = remove_stopwords(data, nlp)
    # progress report: rows processed so far and seconds spent on the last 1000
    if counter % 1000 == 0:
        end = time.time()
        print(counter, end - start)
        start = end
    cleaned_texts.append(data)

# Commit to the dataframe in a single column assignment. The previous
# per-row `df['text'][index] = data` was chained assignment, which pandas does
# not guarantee to write through to df, and it mixed a positional counter with
# label-based lookup.
df['text'] = cleaned_texts

# feature selection
review = df['text']
sentiment = df['subject']

# maximum vocabulary size handed to the tokenizer (also used as the
# Embedding input dimension further down)
num_words = 5000

# out of vocab
oov_token = '<OOV>'
tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer.fit_on_texts(review)
# word_index = tokenizer.word_index
# print(dict(list(word_index.items())[0:10]))

# preprocessing: texts -> integer id sequences, padded/truncated at the end
# to a fixed length of 200
review = tokenizer.texts_to_sequences(review)
padded_review = pad_sequences(review, maxlen=200, padding='post', truncating='post')

# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4; try the current spelling first and fall back for older
# installations.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
# The encoder needs a 2-D (n_samples, 1) array. Multi-dimensional indexing on
# a Series (`sentiment[::, None]`) is no longer supported by pandas 2, so
# reshape the underlying array explicitly.
sentiment = ohe.fit_transform(sentiment.to_numpy().reshape(-1, 1))

# NOTE: padded_review is deliberately left 2-D, (n_samples, 200). The model's
# first layer is an Embedding, which takes integer ids of shape
# (batch, sequence_length); adding a trailing axis would make its output 4-D,
# while the LSTM that follows expects 3-D input.

X_train, X_test, y_train, y_test = train_test_split(
    padded_review, sentiment, test_size=0.2, random_state=123
)

# Width of the embedding vectors; also reused as the unit count of the first LSTM.
embedding_layer = 64
model = Sequential()
model.add(Embedding(num_words, embedding_layer))
# return_sequences=True so the second LSTM receives the full sequence.
model.add(LSTM(embedding_layer, return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
# Two output units with softmax, matching the one-hot encoded targets.
model.add(Dense(2, activation = 'softmax'))
model.summary()

model.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['acc'])

# TensorBoard logs go into a timestamped sub-directory so runs do not overwrite each other.
log_path = os.path.join('log_dir','time_series',datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tb_callback = TensorBoard(log_dir=log_path)
# Stop after 5 epochs without val_loss improvement and roll back to the best weights.
es_callback = EarlyStopping(monitor='val_loss',patience=5,verbose=0,restore_best_weights=True)
# save_best_only is a boolean flag; the previous string 'True' only worked
# because any non-empty string is truthy ('False' would have been truthy too).
model_callback = ModelCheckpoint('best_weights.h5', monitor='val_loss', save_best_only=True, verbose=1)
# NOTE(review): the test split doubles as validation data here, so early
# stopping and checkpointing are tuned on the same data used for the final
# report below — consider a separate validation split.
hist = model.fit(X_train, y_train, validation_data = (X_test,y_test), epochs=40,batch_size=64, callbacks=[es_callback,tb_callback,model_callback])

print('hist keys :', hist.history.keys())

# Training vs. validation accuracy per epoch.
plt.figure()
for curve_key in ('acc', 'val_acc'):
    plt.plot(hist.history[curve_key])
plt.legend(['training', 'validation'])
plt.show()

# Collapse one-hot targets and softmax outputs to class indices before scoring.
y_test = y_test.argmax(axis=1)
y_predicted = model.predict(X_test).argmax(axis=1)

for report in (classification_report, confusion_matrix):
    print(report(y_test, y_predicted))

# Persist everything needed for inference: the trained network, the fitted
# label encoder and the fitted tokenizer.
model.save('model.h5')

with open('ohe.pkl', 'wb') as encoder_file:
    pickle.dump(ohe, encoder_file)

# NOTE: the value returned by tokenizer.to_json() is JSON-encoded once more
# when written out, so a reader has to json.load() the file first and then
# pass the result on — presumably to tokenizer_from_json; verify against the
# loading code.
token_json = tokenizer.to_json()
with open('tokenizer.json', 'w') as tokenizer_file:
    tokenizer_file.write(json.dumps(token_json))