In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under
# '/content/drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from numpy import array
import codecs
from os import listdir
import codecs
import re
import string
import pickle
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.models import Sequential
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import LSTM, Bidirectional
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate
from keras.utils.vis_utils import plot_model
# Corpus language selected for this run.
language = 'ENGLISH'
# Folder prefix for language-specific files; stays empty unless the language
# is ENGLISH.
path_prefix = 'English files/' if language == 'ENGLISH' else ''

In [None]:
# Compiled once: matches any single character from string.punctuation.
_RE_PUNC = re.compile('[%s]' % re.escape(string.punctuation))


def cleanUp(inp):
    """Normalise one raw comment into a space-separated string of plain words.

    Every punctuation character is treated as a word separator. After
    splitting on whitespace, only tokens that are purely alphabetic and longer
    than one character are kept.

    Args:
        inp: Raw text of a single comment.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    # Substitute on the whole string *before* splitting. Substituting inside an
    # already-split token leaves a space in it ("food," -> "food "), which
    # makes str.isalpha() fail and silently discards the entire word.
    tokens = _RE_PUNC.sub(' ', inp).split()
    return ' '.join(word for word in tokens if word.isalpha() and len(word) > 1)

def load_docs(given_gender, path='/content/drive/My Drive/Paper/classtrain.txt'):
    """Load and clean every comment whose label equals ``given_gender``.

    The file is read line by line. Each line is split on tab characters: the
    first field is the label and the second field is the comment text. Lines
    with fewer than two fields (for example the empty string left by a
    trailing newline) are skipped instead of raising IndexError.

    Args:
        given_gender: Label to select. The notebook calls this with 'male'
            and 'female'; any other label present in the file is matched the
            same way.
        path: Location of the tab-separated training file. Defaults to the
            path this notebook has always used.

    Returns:
        A list with one cleaned comment (see ``cleanUp``) per matching line,
        in file order.
    """
    docs = []
    # The context manager closes the handle even if cleaning a line raises.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 2:
                # Blank or malformed line: nothing to label, nothing to clean.
                continue
            gender, comment = fields[0], fields[1]
            if gender == given_gender:
                docs.append(cleanUp(comment))
    return docs

# Read the two classes separately; each call scans the training file once.
formal_male_data = load_docs('male')
formal_female_data = load_docs('female')
print('ENGLISH docs loaded')

# Male comments come first, female comments second; the label list follows the
# same order (1 for male, 0 for female).
male_count = len(formal_male_data)
female_count = len(formal_female_data)
all_data = [*formal_male_data, *formal_female_data]
all_labels = [1] * male_count + [0] * female_count
print(len(all_data))
print(male_count, female_count)

In [None]:
# Preview only the first few cleaned comments; displaying the whole list would
# flood the cell output and bloat the saved notebook.
formal_male_data[:10]

['do visit this place',
 'great low cost company',
 'came here saturday for the ufc fights',
 'everything was hot and crispy',
 'that enough to check the place out',
 'the managers seemed to be running nice place',
 'it very nice to feel remembered',
 'she choose the grande',
 'at the joint we were treated as customers',
 'nice selection of bbq and sides']

In [None]:
# Pick the pre-trained vector file that matches the configured language.
if language == 'PERSIAN':
    embedding_path = 'Persian Word Embedding/cc.fa.300.vec'
else:
    embedding_path = 'English Word Embedding/cc.en.300.vec'

# Maps each word to its float32 vector.
vocab_and_vectors = {}

# `with` guarantees the file is closed even if a line fails to parse.
with open(embedding_path, 'r', encoding='utf-8', errors='ignore') as file:
    for line_no, line in enumerate(file):
        values = line.split()
        if not values:
            # Blank line: no word to index.
            continue
        if line_no == 0 and len(values) == 2:
            # fastText text-format files start with a "<word count> <dimension>"
            # header; it is not a word vector, so do not store it as one.
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        vocab_and_vectors[word] = vector

print(len(vocab_and_vectors))

In [None]:
# import these modules 
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
   
ps = PorterStemmer()

# Sample inflections used to show what the stemmer produces.
words = ["program", "programs", "programer", "programing", "programers"]

# Print each word next to its stem.
for w, stemmed in zip(words, map(ps.stem, words)):
    print(w, " : ", stemmed)

program  :  program
programs  :  program
programer  :  program
programing  :  program
programers  :  program


In [None]:
features = 300

# Fit the vocabulary on the cleaned comments and persist the tokenizer so the
# same word -> index mapping can be reloaded later.
# NOTE(review): the tokenizer is fitted on all_data, i.e. before the
# train/validation/test split below.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_data)
with open('/content/drive/My Drive/Paper/English files/GenderTokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Tokenizer saved.")
word_index = tokenizer.word_index
# +1 because the tokenizer never assigns index 0, which the padding below uses.
vocab_size = len(word_index) + 1

X = tokenizer.texts_to_sequences(all_data)
X = pad_sequences(X, padding='post')
# The model's input width must equal the padded sequence width. all_data holds
# strings, so len(sent) would count characters rather than tokens; read the
# width from the padded matrix instead.
max_length = X.shape[1]
# A NumPy array keeps the labels in the same container type as X.
y = np.asarray(all_labels)
print(all_data[1])
print(X[1])
print(y[1])

# Fixed seed so the 80/10/10 train/test/validation split is reproducible.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=SPLIT_SEED)
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, shuffle=True, random_state=SPLIT_SEED)
print(len(X_train), len(X_val), len(X_test))
print(vocab_size, max_length)

Tokenizer saved.
['nice', 'selection', 'of', 'bbq', 'and', 'sides']
[ 78 234   5 424   2 733   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0]
1
2062289 257787 257786
20071 96


In [None]:
def define_model(vocab_size, length, kernel_sizes=(4, 6, 8), embedding_dim=100):
    """Build and compile the multi-channel CNN + LSTM binary classifier.

    One channel is created per entry of ``kernel_sizes``. Each channel has its
    own Input and Embedding followed by
    Conv1D -> Dropout -> MaxPooling1D -> LSTM -> Flatten. The flattened
    channels are concatenated and passed through Dense(10, relu) and a single
    sigmoid unit. The model is compiled with binary cross-entropy, Adam and
    the accuracy metric, and its summary is printed.

    Args:
        vocab_size: Number of rows of each Embedding layer.
        length: Number of token indices per input sequence.
        kernel_sizes: Conv1D kernel size of each channel. The default
            reproduces the original three channels (4, 6, 8).
        embedding_dim: Width of each Embedding layer (default 100).

    Returns:
        The compiled Keras ``Model``. It expects one input array per channel,
        in the order given by ``kernel_sizes``.
    """
    def _build_channel(kernel_size):
        # One channel: returns its Input tensor and its flattened output.
        channel_input = Input(shape=(length,))
        embedded = Embedding(vocab_size, embedding_dim)(channel_input)
        conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedded)
        dropped = Dropout(0.5)(conv)
        pooled = MaxPooling1D(pool_size=2)(dropped)
        recurrent = LSTM(256, return_sequences=True, recurrent_dropout=0.2)(pooled)
        return channel_input, Flatten()(recurrent)

    # Channels are built one after another, in the same order as before.
    channels = [_build_channel(kernel_size) for kernel_size in kernel_sizes]
    inputs = [channel_input for channel_input, _ in channels]
    flats = [flat for _, flat in channels]

    # merge (a single channel has nothing to concatenate with)
    merged = concatenate(flats) if len(flats) > 1 else flats[0]
    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

# Build the network for the current vocabulary size and sequence width.
model = define_model(vocab_size, max_length)

# Overwrite model.h5 only when validation accuracy improves.
checkpoint = ModelCheckpoint(
    "/content/drive/My Drive/Paper/English files/model.h5",
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
)

# The model has three inputs; the same padded matrix is fed to each of them.
train_inputs = [X_train] * 3
val_inputs = [X_val] * 3
model.fit(
    train_inputs,
    y_train,
    epochs=20,
    validation_data=(val_inputs, y_val),
    batch_size=512,
    callbacks=[checkpoint],
)

# Separately from the checkpoint, store the model as it stands after the last epoch.
model.save('/content/drive/My Drive/Paper/English files/modelMain.h5' )

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 96)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 96)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 96)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 96, 100)      2007100     input_1[0][0]                    
____________________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 2062289 samples, validate on 257787 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Reload the model that was saved after the final training epoch
# (modelMain.h5, not the best-val_accuracy checkpoint in model.h5).
model = load_model('/content/drive/My Drive/Paper/English files/modelMain.h5' )

# evaluate() yields the loss followed by the accuracy metric.
train_inputs = [X_train] * 3
_, acc = model.evaluate(train_inputs, y_train)
print('Train Accuracy: %.2f' % (acc*100))

test_inputs = [X_test] * 3
d, acc = model.evaluate(test_inputs, y_test)
# The test loss is printed after the accuracy.
print('Test Accuracy: %.2f' % (acc*100), d)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train Accuracy: 92.53
Test Accuracy: 78.20 0.5225570126839063
