In [None]:
import pandas as pd
import numpy as np
from numpy import array
import codecs
from os import listdir
import codecs
import re
import string
import pickle
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.models import Sequential
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import LSTM, Bidirectional
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate
from keras.utils.vis_utils import plot_model
# Run configuration: which corpus is processed, and the prefix prepended to
# the artefact paths (tokenizer pickle, model files, exported text files).
language = 'ENGLISH'
# NOTE(review): the 'English files/' prefix is applied only when the language
# is 'PERSIAN', which looks inverted -- confirm the intended directory layout.
path_prefix = 'English files/' if language == 'PERSIAN' else ''

In [None]:
def load_docs(given_path, max_docs=None):
    """Read the files of a directory and return each one as a list of tokens.

    Files are visited in sorted name order so that the selected subset is the
    same on every run (``listdir`` itself returns names in arbitrary order).

    Parameters
    ----------
    given_path : str
        Directory holding one UTF-8 text document per file. A trailing
        separator is accepted but not required.
    max_docs : int, optional
        Maximum number of files to read. When omitted, the notebook default
        applies: 50000 files if the global ``language`` is 'ENGLISH',
        otherwise every file in the directory.

    Returns
    -------
    list of list of str
        One whitespace-split token list per file read.

    Raises
    ------
    ValueError
        If ``max_docs`` is negative.
    """
    # Local import: the notebook only imports ``listdir`` from ``os``.
    from os.path import join

    # Only consult the global configuration when the caller gave no limit.
    if max_docs is None and language == 'ENGLISH':
        max_docs = 50000
    if max_docs is not None and max_docs < 0:
        raise ValueError('max_docs must be non-negative, got %r' % (max_docs,))

    file_names = sorted(listdir(given_path))
    if max_docs is not None:
        file_names = file_names[:max_docs]

    docs = []
    for file_name in file_names:
        # The context manager closes the handle even if reading fails.
        with open(join(given_path, file_name), 'r', encoding='utf-8') as handle:
            docs.append(handle.read().split())
    return docs

# Load the male and female corpora that match the configured language and
# report which corpus was read.
formal_male_data = formal_female_data = None
if language != 'PERSIAN':
    formal_male_data = load_docs('Gender tagged corpus English/male/')
    formal_female_data = load_docs('Gender tagged corpus English/female/')
    print('ENGLISH docs loaded')
else:
    formal_male_data = load_docs('Gender tagged corpus cleaned/NEW/formal language/male/')
    formal_female_data = load_docs('Gender tagged corpus cleaned/NEW/formal language/female/')
    print('PERSIAN docs loaded')

# Male documents come first and are labelled 1; female documents follow and
# are labelled 0.
all_data = [*formal_male_data, *formal_female_data]
all_labels = [1 for _ in formal_male_data] + [0 for _ in formal_female_data]
print(len(all_data))
print(len(formal_male_data), len(formal_female_data))

In [None]:
# Load the pre-trained word vectors for the configured language into a
# {word: float32 vector} dictionary.
if language == 'PERSIAN':
    embedding_path = 'Persian Word Embedding/cc.fa.300.vec'
else:
    embedding_path = 'English Word Embedding/cc.en.300.vec'

vocab_and_vectors = {}

# The context manager closes the file even if parsing a line fails.
with open(embedding_path, 'r', encoding='utf-8', errors='ignore') as file:
    for line_number, line in enumerate(file):
        values = line.split()
        if not values:
            # Blank line: nothing to parse.
            continue
        if line_number == 0 and len(values) == 2:
            # fastText text files start with a "<vocabulary size> <dimension>"
            # header. Storing it would create a bogus one-element "vector".
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        vocab_and_vectors[word] = vector

print(len(vocab_and_vectors))

In [None]:
# import these modules 
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
   
# Small demonstration of the Porter stemmer on a few related word forms.
ps = PorterStemmer()
words = ["program", "programs", "programer", "programing", "programers"]

# Print every word next to the stem produced for it.
for w, stem in zip(words, map(ps.stem, words)):
    print(w, " : ", stem)

In [None]:
# Vocabulary cap handed to the tokenizer as ``num_words``.
# NOTE(review): 300 is also the embedding dimension used for the embedding
# matrix; confirm that capping the vocabulary at this value is intentional.
features = 300
# Fixed seed so the train/validation/test partition (and every file exported
# from it) is identical on each run.
SPLIT_SEED = 42

# Fit the tokenizer on the token lists and persist it for later inference.
tokenizer = Tokenizer(num_words = features)
tokenizer.fit_on_texts(all_data)
with open(path_prefix + 'GenderTokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Tokenizer saved.")

# Encode every document as integer ids and pad them to the longest document.
word_index = tokenizer.word_index
X = tokenizer.texts_to_sequences(all_data)
max_length = max([len(sent) for sent in X])
vocab_size = len(tokenizer.word_index) + 1
X = pad_sequences(X, padding='post')
# Keep the labels in an ndarray so the splits are arrays as well, rather
# than plain Python lists handed to Keras as targets.
y = np.asarray(all_labels)

# First hold out 20% for testing, then 20% of the remainder for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=SPLIT_SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, random_state=SPLIT_SEED)
print(len(X_train), len(X_val), len(X_test))
print(vocab_size, max_length)

In [None]:
# Build a (number of tokenizer words + 1) x 300 matrix whose row i holds the
# pre-trained vector of the word with tokenizer id i. Words without a
# pre-trained vector keep an all-zero row.
# NOTE(review): none of the visible cells pass this matrix to the model --
# confirm whether it is meant to initialise the Embedding layers.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = vocab_and_vectors.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
def _build_channel(vocab_size, length, kernel_size):
    """Build one Embedding -> Conv1D -> Dropout -> MaxPooling1D -> LSTM -> Flatten branch.

    Returns the branch's input tensor and its flattened output tensor.
    """
    inputs = Input(shape=(length,))
    embedding = Embedding(vocab_size, 100)(inputs)
    conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedding)
    drop = Dropout(0.5)(conv)
    pool = MaxPooling1D(pool_size=2)(drop)
    lstm = LSTM(256, return_sequences=True, recurrent_dropout=0.2)(pool)
    return inputs, Flatten()(lstm)


def define_model(vocab_size, length, kernel_sizes=(4, 6, 8)):
    """Build and compile the multi-channel CNN/LSTM binary classifier.

    Every channel receives its own copy of the padded token ids and differs
    only in the width of its convolution kernel. The flattened channel
    outputs are concatenated and fed through a small dense head that ends in
    a single sigmoid unit.

    Parameters
    ----------
    vocab_size : int
        Size of the vocabulary passed to each Embedding layer.
    length : int
        Length of the padded input sequences.
    kernel_sizes : sequence of int, optional
        One convolution kernel size per channel. The default ``(4, 6, 8)``
        reproduces the original three-channel architecture; the model takes
        one input per entry.

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy loss, the Adam optimizer
        and an accuracy metric. Its summary is printed as a side effect.

    Raises
    ------
    ValueError
        If ``kernel_sizes`` is empty.
    """
    kernel_sizes = tuple(kernel_sizes)
    if not kernel_sizes:
        raise ValueError('kernel_sizes must contain at least one kernel size')

    # Channels are created one after another, in the order given.
    channels = [_build_channel(vocab_size, length, size) for size in kernel_sizes]
    channel_inputs = [channel_input for channel_input, _ in channels]
    channel_outputs = [channel_output for _, channel_output in channels]

    # merge (a single channel has nothing to concatenate with)
    if len(channel_outputs) > 1:
        merged = concatenate(channel_outputs)
    else:
        merged = channel_outputs[0]

    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=channel_inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

model = define_model(vocab_size, max_length)

# The accuracy metric is logged as 'acc' by older Keras releases and as
# 'accuracy' by newer ones. A monitor name that does not match the logs means
# the best model is never checkpointed, so derive the name from the compiled
# model and fall back to 'accuracy' when the names are not populated yet.
metric_names = list(model.metrics_names)
accuracy_name = metric_names[-1] if len(metric_names) > 1 else 'accuracy'
checkpoint = ModelCheckpoint(path_prefix + "model.h5", monitor='val_' + accuracy_name, save_best_only=True, mode='max')

# Each channel receives the same padded sequences. Targets are passed as
# arrays so the call does not depend on how the labels were stored.
model.fit([X_train, X_train, X_train], np.asarray(y_train), epochs=10, validation_data = ([X_val, X_val, X_val], np.asarray(y_val)), batch_size=256, callbacks=[checkpoint])

# Save the model as it stands after the last epoch. The checkpoint above
# keeps the best-scoring epoch in its own file.
if language == 'PERSIAN':
    model.save('saved models/Keras/model.h5' )
else:
    model.save(path_prefix + 'modelMain.h5' )

In [None]:
# Reload the model written by the training cell from disk and report its
# accuracy on the training and test partitions.
model = None
saved_model_path = 'saved models/Keras/model.h5' if language == 'PERSIAN' else path_prefix + 'modelMain.h5'
model = load_model(saved_model_path)

# Every channel of the model is fed the same padded sequences.
train_inputs = [X_train] * 3
test_inputs = [X_test] * 3

_, acc = model.evaluate(train_inputs, y_train)
print('Train Accuracy: %.2f' % (acc*100))
# ``d`` is the first value returned by evaluate and is printed after the accuracy.
d, acc = model.evaluate(test_inputs, y_test)
print('Test Accuracy: %.2f' % (acc*100), d)

In [None]:
# Score the test set and record, per document, whether the thresholded
# prediction (below 0.5 -> 0, otherwise 1) agrees with the true label.
yhats = model.predict([X_test, X_test, X_test])
right_or_wrong = []
for probability, expected in zip(yhats, y_test):
    if probability < 0.5:
        prediction = 0
    else:
        prediction = 1
    right_or_wrong.append('RIGHT' if prediction == expected else 'WRONG')

print(len(right_or_wrong), len(X_test))

In [None]:
# Holds at most one (mapping, size, reverse mapping) triple so the reverse
# lookup table is built once per vocabulary rather than once per token.
_REVERSE_INDEX_CACHE = []


def _reverse_word_index(mapping):
    """Return a {token number: word} dictionary for a {word: token number} mapping."""
    if _REVERSE_INDEX_CACHE:
        cached_mapping, cached_size, cached_reverse = _REVERSE_INDEX_CACHE[0]
        if cached_mapping is mapping and cached_size == len(mapping):
            return cached_reverse
    reverse = {}
    for token_word, token_number in mapping.items():
        # setdefault keeps the first word seen for a number, matching what a
        # linear search over the mapping's values would return.
        reverse.setdefault(token_number, token_word)
    _REVERSE_INDEX_CACHE[:] = [(mapping, len(mapping), reverse)]
    return reverse


def index_to_seq(sequence, index_to_word=None):
    """Translate a sequence of token numbers back into words.

    Zeros are skipped, so padding added to a sequence does not appear in the
    result.

    Parameters
    ----------
    sequence : iterable of int
        Token numbers, e.g. one row of the padded matrix.
    index_to_word : mapping, optional
        Explicit {token number: word} lookup table. When omitted, the table
        is derived from the global ``word_index`` and cached.

    Returns
    -------
    list of str
        The words for the non-zero token numbers, in order.

    Raises
    ------
    ValueError
        If a non-zero token number has no word.
    """
    if index_to_word is None:
        index_to_word = _reverse_word_index(word_index)

    main_sentence = []
    for tokenNumber in sequence:
        if tokenNumber != 0:
            try:
                main_sentence.append(index_to_word[tokenNumber])
            except KeyError:
                # Keep the exception type of the earlier list.index() search.
                raise ValueError('%r is not a known token number' % (tokenNumber,)) from None
    return main_sentence
    
# Sanity check: show the first padded test sequence and the words it decodes to.
first_test_doc = X_test[0]
print(first_test_doc)
print(index_to_seq(first_test_doc))

In [None]:
import codecs
def _export_documents(file_name, documents):
    """Write each encoded document as one word per line, followed by a '====' line."""
    # The context manager closes the file even if decoding a document fails.
    with codecs.open(path_prefix + file_name, 'w', "utf-8") as out:
        for eachDoc in documents:
            for eachToken in index_to_seq(eachDoc):
                out.write(eachToken + '\n')
            out.write("====\n")


def _export_values(file_name, values):
    """Write one value per line, converted with str()."""
    with codecs.open(path_prefix + file_name, 'w', "utf-8") as out:
        for eachAnswer in values:
            out.write(str(eachAnswer) + '\n')


# Export every partition with its labels. The numbered prints mark the
# completed stages. The former per-item counter output is omitted because it
# printed one line for every document and label.
_export_documents('Training.txt', X_train)
print("1")
_export_values('TrainingAnswers.txt', y_train)
print("2")
_export_documents('Validing.txt', X_val)
print("3")
_export_values('ValidingAnswers.txt', y_val)
print("4")
_export_documents('Testing.txt', X_test)
print("5")
_export_values('TestingAnswers.txt', y_test)
print("6")
# RIGHT/WRONG verdict for each test document, from the prediction cell.
_export_values('TestingSanity.txt', right_or_wrong)
print("7")

In [None]:
def clear_doc(text, vocab):
    """Turn raw text into a list of cleaned tokens that are present in ``vocab``.

    The text is split on whitespace. Punctuation characters are removed from
    each token, and a token is kept only if what remains is purely
    alphabetic, longer than two characters and contained in ``vocab``.

    Parameters
    ----------
    text : str
        Raw input text.
    vocab : container of str
        Accepted words. Lists and tuples are converted to a set once so that
        each membership test takes constant time.

    Returns
    -------
    list of str
        The surviving tokens in their original order.
    """
    if isinstance(vocab, (list, tuple)):
        vocab = set(vocab)

    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = []
    for raw_token in text.split():
        # Delete punctuation outright. Substituting a space would leave a
        # non-alphabetic character behind and the whole token would be dropped.
        word = re_punc.sub('', raw_token)
        if word.isalpha() and len(word) > 2 and word in vocab:
            tokens.append(word)
    return tokens
    
def predict_sentiment(review, vocab, tokenizer, max_length, model):
    """Predict the gender label for a single piece of text.

    Parameters
    ----------
    review : str
        Raw text to classify.
    vocab : container of str
        Accepted words, forwarded to ``clear_doc``.
    tokenizer : Tokenizer
        Fitted tokenizer used to turn the cleaned tokens into integer ids.
    max_length : int
        Sequence length the model was built for; the encoded text is padded
        (or truncated) to this length.
    model : Model
        Trained three-input model; all inputs receive the same sequence.

    Returns
    -------
    tuple
        ``(score, label)``, where ``score`` is the model output for the text
        and ``label`` is 'FEMALE' for scores below 0.5 and 'MALE' otherwise
        (the notebook labels male documents 1 and female documents 0).
    """
    line = clear_doc(review, vocab)
    # Encode and pad the review itself. The token list is wrapped in a list
    # so the tokenizer treats it as one document, in the same form the
    # training corpus was encoded.
    encoded = tokenizer.texts_to_sequences([line])
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    # predict
    print("Predicting...")
    yhat = model.predict([padded, padded, padded])
    # Retrieve the predicted score and apply the same 0.5 threshold that the
    # test-set evaluation uses.
    percent_pos = yhat[0,0]
    if percent_pos < 0.5:
        return percent_pos, 'FEMALE'
    return percent_pos, 'MALE'

# Ask for a piece of text and classify it with the trained model.
text = input('Enter Test Case: ')
# The dictionary's key view supports constant-time membership tests and does
# not copy every embedding key into a list that is scanned for each token.
vocab = vocab_and_vectors.keys()
percent, prediction = predict_sentiment(text, vocab, tokenizer, max_length, model)
print(percent, prediction)