# Red neuronal para clasificar cursos en lenguajes de programación

In [1]:
# !pip install nltk
# !pip install tensorflow
# !pip install keras
# !pip install numpy

import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jlgarcia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jlgarcia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
training_data = [{"class": "html", "sentence": "HTML First"},
                 {"class": "html", "sentence": "HTML Fundamentals"},
                 {"class": "JavaScript", "sentence": "Vue.js Basics"},
                 {"class": "JavaScript", "sentence": "What a Beautiful Vue!"},
                 {"class": "JavaScript", "sentence": "Introducing Vue"},
                 {"class": "JavaScript", "sentence": "JavaScript Array Iteration Methods"},
                 {"class": "JavaScript", "sentence": "JavaScript Unit Testing"},
                 {"class": "CSS", "sentence": "CSS Basics"},
                 {"class": "CSS", "sentence": "CSS Fundamentals"},
                 {"class": "CSS", "sentence": "CSS Fundamentals"},
                 {"class": "CSS", "sentence": "Enhancing the Design With CSS"},
                 {"class": "C++", "sentence": "C++ Tutorial"},
                 {"class": "PHP", "sentence": "PHP Basics"},
                 {"class": "PHP", "sentence": "What a Beautiful PHP"},
                 {"class": "PHP", "sentence": "Introducing PHP"},
                 {"class": "PHP", "sentence": "PHP Array Iteration Methods"},
                 {"class": "PHP", "sentence": "PHP Unit Testing"}]

print("{} sentences in training data".format(len(training_data)))

17 sentences in training data


In [3]:

classes = [doc['class'] for doc in training_data]
sentences = [doc['sentence'] for doc in training_data]
unique_classes = list(set(classes))

print('{} classes: {}'.format(len(unique_classes), unique_classes))
print('{} sentences'.format(len(sentences)))

5 classes: ['PHP', 'CSS', 'JavaScript', 'C++', 'html']
17 sentences


In [4]:
ignore_words=['basics', 'fundamentals', 'introducing','a', 'what','basics', 'beautiful']

In [5]:
# Procesamos frases
import nltk 
from nltk.corpus import stopwords
import string

def get_number_of_words(list_of_list_of_words):
    return len([item for sublist in list_of_list_of_words for item in sublist])


#unique_words_before_cleaning = list(set([word for word in sentence for sententence in sentences]))
words_before_cleaning = sum([len(word) for word in sentences])
print('Words before cleaning: {}'.format(words_before_cleaning))

# Tokenizamos

def tokenize_sentence(sentence):
    return nltk.word_tokenize(sentence)

tokenized_sentences = list(map(tokenize_sentence, sentences))

# Pasar a minúsculas



lower_sentences = [list(map(lambda x: x.lower(), words)) for words in tokenized_sentences]

print('Words when lower: {}'.format(get_number_of_words(lower_sentences)))

# Eliminar puntuación
def remove_punctuation(words):
    return [word for word in words if word not in string.punctuation]

without_punctuation_sentences = list(map(remove_punctuation, lower_sentences))

print('Words when remove punctuation: {}'.format(get_number_of_words(without_punctuation_sentences)))
      
# Transformar caracteres con acentos a caracteres sin acentos, etc?

# Eliminar stopwords
ignore_words = stopwords.words('english')+ignore_words

def delete_stopwords(words):
    return [word for word in words if word not in ignore_words]

clean_sentences = list(map(delete_stopwords, without_punctuation_sentences))

print('Words when remove stopwords: {}'.format(get_number_of_words(clean_sentences)))


processed_sentences = clean_sentences

Words before cleaning: 304
Words when lower: 48
Words when remove punctuation: 47
Words when remove stopwords: 31


In [6]:
print(processed_sentences)

[['html', 'first'], ['html'], ['vue.js'], ['vue'], ['vue'], ['javascript', 'array', 'iteration', 'methods'], ['javascript', 'unit', 'testing'], ['css'], ['css'], ['css'], ['enhancing', 'design', 'css'], ['c++', 'tutorial'], ['php'], ['php'], ['php'], ['php', 'array', 'iteration', 'methods'], ['php', 'unit', 'testing']]


In [7]:
unique_words = list(set([item for sublist in processed_sentences for item in sublist]))

print('Palabras únicas: {}'.format(len(unique_words)))
print(unique_words)


Palabras únicas: 16
['php', 'testing', 'vue.js', 'javascript', 'design', 'c++', 'methods', 'first', 'enhancing', 'unit', 'css', 'iteration', 'array', 'vue', 'tutorial', 'html']


In [8]:
from keras.preprocessing.text import Tokenizer

# create the tokenizer
t = Tokenizer()

# fit the tokenizer on the documents
t.fit_on_texts(processed_sentences)
d = Tokenizer()
d.fit_on_texts(classes)

Using TensorFlow backend.


In [9]:
# Example
print(t.word_counts)
print(t.document_count)
print(t.word_index)
print(t.word_docs)

OrderedDict([('html', 2), ('first', 1), ('vue.js', 1), ('vue', 2), ('javascript', 2), ('array', 2), ('iteration', 2), ('methods', 2), ('unit', 2), ('testing', 2), ('css', 4), ('enhancing', 1), ('design', 1), ('c++', 1), ('tutorial', 1), ('php', 5)])
17
{'php': 1, 'css': 2, 'html': 3, 'vue': 4, 'javascript': 5, 'array': 6, 'iteration': 7, 'methods': 8, 'unit': 9, 'testing': 10, 'first': 11, 'vue.js': 12, 'enhancing': 13, 'design': 14, 'c++': 15, 'tutorial': 16}
defaultdict(<class 'int'>, {'first': 1, 'html': 2, 'vue.js': 1, 'vue': 2, 'array': 2, 'javascript': 2, 'iteration': 2, 'methods': 2, 'testing': 2, 'unit': 2, 'css': 4, 'enhancing': 1, 'design': 1, 'c++': 1, 'tutorial': 1, 'php': 5})


In [10]:
# integer encode documents and classes
encoded_docs = t.texts_to_matrix(processed_sentences, mode='count')
encoded_classes = d.texts_to_matrix(classes, mode='count')
print(encoded_classes)


[[0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]


In [11]:
training_data = np.array(encoded_docs, "float32")




In [12]:
# y estos son los resultados que se obtienen (en el mismo orden)
target_data = np.array(encoded_classes, "float32")

In [13]:
from keras.models import Sequential  # Permite crear modelos secuenciales
from keras.layers.core import Dense  # Permite crear capas de tipo "dense"

In [14]:
model = Sequential()

In [15]:
model.add(Dense(32, input_dim=len(encoded_docs[0]), activation='relu'))

In [16]:
model.add(Dense(len(encoded_classes[0]), activation='sigmoid'))

In [17]:
model.compile(loss='mean_squared_error',
              optimizer='adam',
              metrics=['binary_accuracy'])

In [18]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                576       
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 198       
Total params: 774
Trainable params: 774
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
model.fit(training_data, target_data, epochs=500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500


Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500
Epoch 137/500
Epoch 138/500
Epoch 139/500
Epoch 140/500
Epoch 141/500
Epoch 142/500
Epoch 143/500
Epoch 144/500
Epoch 145/500
Epoch 146/500
Epoch 147/500
Epoch 148/500
Epoch 149/500
E

Epoch 153/500
Epoch 154/500
Epoch 155/500
Epoch 156/500
Epoch 157/500
Epoch 158/500
Epoch 159/500
Epoch 160/500
Epoch 161/500
Epoch 162/500
Epoch 163/500
Epoch 164/500
Epoch 165/500
Epoch 166/500
Epoch 167/500
Epoch 168/500
Epoch 169/500
Epoch 170/500
Epoch 171/500
Epoch 172/500
Epoch 173/500
Epoch 174/500
Epoch 175/500
Epoch 176/500
Epoch 177/500
Epoch 178/500
Epoch 179/500
Epoch 180/500
Epoch 181/500
Epoch 182/500
Epoch 183/500
Epoch 184/500
Epoch 185/500
Epoch 186/500
Epoch 187/500
Epoch 188/500
Epoch 189/500
Epoch 190/500
Epoch 191/500
Epoch 192/500
Epoch 193/500
Epoch 194/500
Epoch 195/500
Epoch 196/500
Epoch 197/500
Epoch 198/500
Epoch 199/500
Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 208/500
Epoch 209/500
Epoch 210/500
Epoch 211/500
Epoch 212/500
Epoch 213/500
Epoch 214/500
Epoch 215/500
Epoch 216/500
Epoch 217/500
Epoch 218/500
Epoch 219/500
Epoch 220/500
Epoch 221/500
Epoch 222/500
Epoch 223/500
Epoch 

Epoch 229/500
Epoch 230/500
Epoch 231/500
Epoch 232/500
Epoch 233/500
Epoch 234/500
Epoch 235/500
Epoch 236/500
Epoch 237/500
Epoch 238/500
Epoch 239/500
Epoch 240/500
Epoch 241/500
Epoch 242/500
Epoch 243/500
Epoch 244/500
Epoch 245/500
Epoch 246/500
Epoch 247/500
Epoch 248/500
Epoch 249/500
Epoch 250/500
Epoch 251/500
Epoch 252/500
Epoch 253/500
Epoch 254/500
Epoch 255/500
Epoch 256/500
Epoch 257/500
Epoch 258/500
Epoch 259/500
Epoch 260/500
Epoch 261/500
Epoch 262/500
Epoch 263/500
Epoch 264/500
Epoch 265/500
Epoch 266/500
Epoch 267/500
Epoch 268/500
Epoch 269/500
Epoch 270/500
Epoch 271/500
Epoch 272/500
Epoch 273/500
Epoch 274/500
Epoch 275/500
Epoch 276/500
Epoch 277/500
Epoch 278/500
Epoch 279/500
Epoch 280/500
Epoch 281/500
Epoch 282/500
Epoch 283/500
Epoch 284/500
Epoch 285/500
Epoch 286/500
Epoch 287/500
Epoch 288/500
Epoch 289/500
Epoch 290/500
Epoch 291/500
Epoch 292/500
Epoch 293/500
Epoch 294/500
Epoch 295/500
Epoch 296/500
Epoch 297/500
Epoch 298/500
Epoch 299/500
Epoch 

Epoch 304/500
Epoch 305/500
Epoch 306/500
Epoch 307/500
Epoch 308/500
Epoch 309/500
Epoch 310/500
Epoch 311/500
Epoch 312/500
Epoch 313/500
Epoch 314/500
Epoch 315/500
Epoch 316/500
Epoch 317/500
Epoch 318/500
Epoch 319/500
Epoch 320/500
Epoch 321/500
Epoch 322/500
Epoch 323/500
Epoch 324/500
Epoch 325/500
Epoch 326/500
Epoch 327/500
Epoch 328/500
Epoch 329/500
Epoch 330/500
Epoch 331/500
Epoch 332/500
Epoch 333/500
Epoch 334/500
Epoch 335/500
Epoch 336/500
Epoch 337/500
Epoch 338/500
Epoch 339/500
Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 349/500
Epoch 350/500
Epoch 351/500
Epoch 352/500
Epoch 353/500
Epoch 354/500
Epoch 355/500
Epoch 356/500
Epoch 357/500
Epoch 358/500
Epoch 359/500
Epoch 360/500
Epoch 361/500
Epoch 362/500
Epoch 363/500
Epoch 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 369/500
Epoch 370/500
Epoch 371/500
Epoch 372/500
Epoch 373/500
Epoch 374/500
Epoch 

Epoch 380/500
Epoch 381/500
Epoch 382/500
Epoch 383/500
Epoch 384/500
Epoch 385/500
Epoch 386/500
Epoch 387/500
Epoch 388/500
Epoch 389/500
Epoch 390/500
Epoch 391/500
Epoch 392/500
Epoch 393/500
Epoch 394/500
Epoch 395/500
Epoch 396/500
Epoch 397/500
Epoch 398/500
Epoch 399/500
Epoch 400/500
Epoch 401/500
Epoch 402/500
Epoch 403/500
Epoch 404/500
Epoch 405/500
Epoch 406/500
Epoch 407/500
Epoch 408/500
Epoch 409/500
Epoch 410/500
Epoch 411/500
Epoch 412/500
Epoch 413/500
Epoch 414/500
Epoch 415/500
Epoch 416/500
Epoch 417/500
Epoch 418/500
Epoch 419/500
Epoch 420/500
Epoch 421/500
Epoch 422/500
Epoch 423/500
Epoch 424/500
Epoch 425/500
Epoch 426/500
Epoch 427/500
Epoch 428/500
Epoch 429/500
Epoch 430/500
Epoch 431/500
Epoch 432/500
Epoch 433/500
Epoch 434/500
Epoch 435/500
Epoch 436/500
Epoch 437/500
Epoch 438/500
Epoch 439/500
Epoch 440/500
Epoch 441/500
Epoch 442/500
Epoch 443/500
Epoch 444/500
Epoch 445/500
Epoch 446/500
Epoch 447/500
Epoch 448/500
Epoch 449/500
Epoch 450/500
Epoch 

Epoch 455/500
Epoch 456/500
Epoch 457/500
Epoch 458/500
Epoch 459/500
Epoch 460/500
Epoch 461/500
Epoch 462/500
Epoch 463/500
Epoch 464/500
Epoch 465/500
Epoch 466/500
Epoch 467/500
Epoch 468/500
Epoch 469/500
Epoch 470/500
Epoch 471/500
Epoch 472/500
Epoch 473/500
Epoch 474/500
Epoch 475/500
Epoch 476/500
Epoch 477/500
Epoch 478/500
Epoch 479/500
Epoch 480/500
Epoch 481/500
Epoch 482/500
Epoch 483/500
Epoch 484/500
Epoch 485/500
Epoch 486/500
Epoch 487/500
Epoch 488/500
Epoch 489/500
Epoch 490/500
Epoch 491/500
Epoch 492/500
Epoch 493/500
Epoch 494/500
Epoch 495/500
Epoch 496/500
Epoch 497/500
Epoch 498/500
Epoch 499/500
Epoch 500/500


<keras.callbacks.History at 0x7fa191e1fb38>

In [20]:
scores = model.evaluate(training_data, target_data)
print("{}: {}".format(model.metrics_names[1], scores[1]*100))

binary_accuracy: 100.0


In [21]:
sentence = "html from one to zero"

In [22]:
from itertools import chain


def process_sentence(sentence):
    tokenized_sentence = tokenize_sentence(sentence)
    lower_sentence = [word.lower() for word in tokenized_sentence]
    without_punctuation_sentence = remove_punctuation(lower_sentence)
    clean_sentence = delete_stopwords(without_punctuation_sentence)

    processed_sentence = [word for word in clean_sentence if word in [word.lower() for word in chain(*tokenized_sentences)]]
    processed_sentences = [processed_sentence]

    d = Tokenizer()
    d.fit_on_texts(processed_sentences)
    print(processed_sentences)

    encoded_docs = t.texts_to_matrix(processed_sentences, mode='count')
    print(encoded_docs)

    processed_sentence = np.array(encoded_docs, "float32")
    return processed_sentence

In [23]:
processed_sentence = process_sentence(sentence)

[['html']]
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [24]:
print(model.predict(processed_sentence).round())

[[0. 0. 0. 0. 1. 0.]]


In [25]:
from keras.models import model_from_json

# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")
 
# later...
 
# load json and create model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

Saved model to disk
Loaded model from disk
