In [1]:
import pandas as pd
import numpy as np
import re

In [73]:
df = pd.read_csv('volunteer_world_SDG1.csv', encoding='utf-8')
df2 = pd.read_csv('volunteer_world_SDG2.csv', encoding='utf-8')
df = df.append(df2, ignore_index=True)
des = df['Description'].tolist()

In [75]:
df['res']=0
df['res'][:15]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [94]:
res = df['res'].tolist()
print(len(res))

60


In [3]:
import string 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [77]:
des_lines=list()
for line in des:
    line = line.replace('\\n','')
    line = line.replace('\\xa05','')
    line = line.replace('\\xa0','')
    tokens = word_tokenize(line)
    tokens = [w.lower() for w in tokens]
    table = str.maketrans('','', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    des_lines.append(words)
print(len(des_lines))

60


In [19]:
import gensim

In [78]:
EMBEDDING_DIM=100
MIN_COUNT=4
WINDOW=5 # the max distance btw a target word and words around it.
WORKERS=3 #the num of partitions during training

In [79]:
model = gensim.models.Word2Vec(sentences=des_lines, size=EMBEDDING_DIM, 
                               window=WINDOW, min_count=MIN_COUNT, workers=WORKERS)
words = list(model.wv.vocab)
print('Vocab size:{}'.format(len(words)))

W1021 20:48:57.699760  1140 base_any2vec.py:1386] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay


Vocab size:1316


In [80]:
model.wv.most_similar('children')

[('volunteers', 0.9998090863227844),
 ('project', 0.9997903108596802),
 ('local', 0.9997694492340088),
 ('work', 0.999762773513794),
 ('like', 0.9997480511665344),
 ('health', 0.999747633934021),
 ('one', 0.9997432231903076),
 ('help', 0.9997345209121704),
 ('families', 0.9997216463088989),
 ('activities', 0.9997215270996094)]

In [81]:
filename = 'SDG1_Word2Vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [82]:
import os
embeddings_index={}
f = open(os.path.join('', 'SDG1_Word2Vec.txt'), encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close()

In [44]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [83]:
# vectorize the text samples into a 2D integer tensor
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(des_lines)
sequences = tokenizer_obj.texts_to_sequences(des_lines)

In [84]:
# pad sequences
word_index = tokenizer_obj.word_index
print('{} unique tokens.'.format(len(word_index)))

6444 unique tokens.


In [85]:
des_pad = pad_sequences(sequences, maxlen=1000)
des_pad.shape

(60, 1000)

In [92]:
des_pad

array([[   0,    0,    0, ...,  352,   21,   67],
       [   0,    0,    0, ...,  262,   92,  632],
       [   0,    0,    0, ...,   87, 1816,    1],
       ...,
       [   0,    0,    0, ...,  667,  442,   81],
       [   0,    0,    0, ..., 6363, 6364, 6365],
       [   0,    0,    0, ..., 1878,    2,  905]])

In [86]:
num_words = len(word_index)+1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [87]:
print(num_words)

6445


In [61]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

In [88]:
model = Sequential()
embedding_layer = Embedding(num_words, EMBEDDING_DIM, 
                            embeddings_initializer = Constant(embedding_matrix),
                           trainable=False)
model.add(embedding_layer)
model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [89]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         644500    
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                12864     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 657,397
Trainable params: 12,897
Non-trainable params: 644,500
_________________________________________________________________


In [111]:
VALIDATION_SPLIT=0.2

indices = np.arange(des_pad.shape[0])
np.random.shuffle(indices)
des_pad = des_pad[indices]
res = np.array(res)
res = res[indices.astype(int)]
num_vali_samples = int(VALIDATION_SPLIT*des_pad.shape[0])

In [112]:
X_train_pad = des_pad[:-num_vali_samples]
y_train = res[:-num_vali_samples]
X_test_pad = des_pad[-num_vali_samples:]
y_test = res[-num_vali_samples:]

In [113]:
print('Shape of X_train_pad tensor: {}'.format(X_train_pad.shape))
print('Shape of y_train tensor: {}'.format(y_train.shape))
print('Shape of X_test_pad tensor: {}'.format(X_test_pad.shape))
print('Shape of y_test tensor: {}'.format(y_test.shape))

Shape of X_train_pad tensor: (48, 1000)
Shape of y_train tensor: (48,)
Shape of X_test_pad tensor: (12, 1000)
Shape of y_test tensor: (12,)


In [115]:
model.fit(X_train_pad, y_train, batch_size=2, epochs=15, 
          validation_data=(X_test_pad, y_test), verbose=2)

Epoch 1/15
24/24 - 9s - loss: 0.6556 - accuracy: 0.7292 - val_loss: 0.5582 - val_accuracy: 0.8333
Epoch 2/15
24/24 - 7s - loss: 0.6171 - accuracy: 0.7292 - val_loss: 0.5366 - val_accuracy: 0.8333
Epoch 3/15
24/24 - 9s - loss: 0.6139 - accuracy: 0.7292 - val_loss: 0.5377 - val_accuracy: 0.8333
Epoch 4/15
24/24 - 11s - loss: 0.6072 - accuracy: 0.7292 - val_loss: 0.5275 - val_accuracy: 0.8333
Epoch 5/15
24/24 - 7s - loss: 0.6001 - accuracy: 0.7292 - val_loss: 0.5122 - val_accuracy: 0.8333
Epoch 6/15
24/24 - 7s - loss: 0.6062 - accuracy: 0.7292 - val_loss: 0.5187 - val_accuracy: 0.8333
Epoch 7/15
24/24 - 12s - loss: 0.5954 - accuracy: 0.7292 - val_loss: 0.5114 - val_accuracy: 0.8333
Epoch 8/15
24/24 - 7s - loss: 0.6105 - accuracy: 0.7292 - val_loss: 0.5008 - val_accuracy: 0.8333
Epoch 9/15
24/24 - 7s - loss: 0.6101 - accuracy: 0.7292 - val_loss: 0.5257 - val_accuracy: 0.8333
Epoch 10/15
24/24 - 10s - loss: 0.5862 - accuracy: 0.7292 - val_loss: 0.5042 - val_accuracy: 0.8333
Epoch 11/15
24/2

<tensorflow.python.keras.callbacks.History at 0x1dfcbfbd208>

In [117]:
model.fit(X_train_pad, y_train, batch_size=5, epochs=30, 
          validation_data=(X_test_pad, y_test), verbose=2)

Epoch 1/30
10/10 - 4s - loss: 0.5737 - accuracy: 0.7292 - val_loss: 0.4742 - val_accuracy: 0.8333
Epoch 2/30
10/10 - 4s - loss: 0.5704 - accuracy: 0.7292 - val_loss: 0.4805 - val_accuracy: 0.8333
Epoch 3/30
10/10 - 4s - loss: 0.5710 - accuracy: 0.7292 - val_loss: 0.4800 - val_accuracy: 0.8333
Epoch 4/30
10/10 - 4s - loss: 0.5761 - accuracy: 0.7292 - val_loss: 0.4811 - val_accuracy: 0.8333
Epoch 5/30
10/10 - 6s - loss: 0.5616 - accuracy: 0.7292 - val_loss: 0.4812 - val_accuracy: 0.8333
Epoch 6/30
10/10 - 6s - loss: 0.5657 - accuracy: 0.7292 - val_loss: 0.4840 - val_accuracy: 0.8333
Epoch 7/30
10/10 - 7s - loss: 0.5621 - accuracy: 0.7292 - val_loss: 0.4810 - val_accuracy: 0.8333
Epoch 8/30
10/10 - 4s - loss: 0.5629 - accuracy: 0.7292 - val_loss: 0.4685 - val_accuracy: 0.8333
Epoch 9/30
10/10 - 4s - loss: 0.5595 - accuracy: 0.7292 - val_loss: 0.4770 - val_accuracy: 0.8333
Epoch 10/30
10/10 - 6s - loss: 0.5580 - accuracy: 0.7292 - val_loss: 0.4771 - val_accuracy: 0.8333
Epoch 11/30
10/10 -

<tensorflow.python.keras.callbacks.History at 0x1dfda5c9a58>