# Ayudantía Tarea 3 - Inteligencia Artificial

# Setup inicial

## Montar Google Drive

Visualizamos el directorio

In [0]:
!ls

sample_data


Montamos Google Drive

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Ahora podemos ver Google Drive montado y podemos explorarlo como si fueran carpetas locales

In [0]:
!ls

gdrive	sample_data


In [0]:
!ls 'gdrive/My Drive/Doctorado/Ramos'

'Deep Learning'        'Text Mining'
'INF-522 Text mining'  'Tópicos Avanzados de Inteligencia Artificial'


# Preparación de datos

## Bajar Dataset

Descargamos y descomprimimos el dataset. En este caso desde Dropbox.
En caso de subirlo a Google Drive, deben montar la unidad en Colab para poder acceder al archivo como si fuera un archivo local.

In [0]:
!wget https://www.dropbox.com/url/to/dataset/news-headlines-dataset-for-sarcasm-detection.zip.zip
!unzip news-headlines-dataset-for-sarcasm-detection.zip

## Cargar datos

In [0]:
import json

def parse_data(filename):
  # TO DO: abrir el archivo 'filename' y convertir cada línea en un diccionario, usando:
  # item = json.loads(line_clean_text)
  items = []
  return items

data = parse_data('./Sarcasm_Headlines_Dataset_v2.json')

In [0]:
x_in = [x['headline'].replace("'","").replace(":","") for x in data]
y = [x['is_sarcastic'] for x in data]

El siguiente código imprime una muestra de los 10 primeros elementos no-sarcásticos y sarcásticos encontrados en el dataset.

In [0]:
print('Muestra de algunos elementos del dataset:')
sample = ['-(NOT SARCASTIC) '+x_in[i] for i in range(len(x_in)) if not y[i]][:10]
sample = sample + ['-(SARCASTIC)     '+x_in[i] for i in range(len(x_in)) if y[i]][:10]
print('\n'+'\n'.join(sample))

El siguiente código muestra la cantidad de elementos por clase. Notar que sum(y) entrega la cantidad de elementos sarcásticos pues en el caso contrario la etiqueta es cero.

In [0]:
# Total de muestras
tot_registros = len(y)
tot_sarcasticos = sum(y)
tot_no_sarcasticos = tot_registros - tot_sarcasticos
print('total de registros no sarcasticos: '+str(tot_no_sarcasticos))
print('total de registros sarcasticos: '+str(tot_sarcasticos))

total de registros no sarcasticos: 14985
total de registros sarcasticos: 13634


## Transformación a Vector de Frecuencias

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

# supongamos que x_train es el set de entrenamiento
x_train = ['first train sentence', 'another sentence with words like dog and cat', 'another more']
vectorizer = CountVectorizer(x_train) # make object of Count Vectorizer
features_train = vectorizer.fit_transform(x_train) # generar dataset de training Bow

# supongamos que x_test es el set de test
x_test = ['first test sentence (a sentence with unknown words like asassadasd)', 'another sentence', 'another more']
features_test = vectorizer.transform(x_test)

Podemos ver que en lugar de ser un vector comun y corriente, está almacenado como un vector "sparse" que solamente almacena las coordenadas y valores de aquellos elementos distintos de cero (que debiensen ser la minoria). En el siguiente ejemplo, el índice 0 indica que estamos observando el primer elemento

In [0]:
print(features_train[0,:])

  (0, 4)	1
  (0, 8)	1
  (0, 7)	1


In [0]:
print(features_test[0,:])

  (0, 4)	1
  (0, 5)	1
  (0, 7)	2
  (0, 9)	1
  (0, 10)	1


## Transformación a Vector basado en Word2Vec

In [0]:
!gdown --id 0B7XkCwpI5KDYNlNUTTlSS21pQmM
!gunzip GoogleNews-vectors-negative300.bin.gz

Downloading...
From: https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM
To: /content/GoogleNews-vectors-negative300.bin.gz
1.65GB [00:20, 80.5MB/s]


Visualizamos la carpeta para ver el nombre del archivo descomprimido

In [0]:
!ls

gdrive
GoogleNews-vectors-negative300.bin
news-headlines-dataset-for-sarcasm-detection.zip
sample_data
Sarcasm_Headlines_Dataset.json
Sarcasm_Headlines_Dataset_v2.json


In [0]:
import gensim
import numpy as np

# creamos el modelo word2vec
path = 'GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# separamos el texto en palabras y quitamos aquellas no encontradas
sentence = 'this is an example sentence with unknown words like adfdesfdutd'
sentence_words = [word for word in sentence.split() if word in model.vocab]

# convertimos cada palabra en vector usando word2vec
vectors_sequence = [model.wv[word] for word in sentence_words]

# convertimos la secuencia (de largo variable) de vectores en un unico vector.
sentence_vector = np.mean(vectors_sequence, axis=0)

  """


In [0]:
print(sentence_vector.shape)

(300,)


## Eliminación de Stop-Words

In [0]:
from spacy.lang.en.stop_words import STOP_WORDS
print(STOP_WORDS)

{'whoever', 'five', 'becoming', 'am', 'whether', 'yet', 'under', 'none', 'she', 'yours', 'me', 'serious', 'while', 'wherever', 'namely', 'using', "'ll", 'hereupon', 'her', 'meanwhile', 'take', 'each', 'formerly', 'front', 'again', 'almost', 'full', 'yourself', 'be', 'just', 'another', 'become', 'toward', 'himself', 'seemed', 'a', 'thence', 'why', 'besides', 'bottom', 'must', 'now', 'hundred', 'really', 'where', 'it', 'if', 'out', 'down', 'around', 'is', 'that', 'were', 're', 'wherein', 'also', 'however', 'hereby', 'regarding', 'to', 'towards', 'eight', 'you', 'who', 'but', 'perhaps', 'thereupon', 'was', 'whom', 'enough', 'n’t', 'only', 'empty', "'re", 'alone', 'seem', 'nor', "'ve", '’m', 'at', 'then', 'third', 'with', 'yourselves', 'though', 'four', 'done', 'off', 'any', 'here', 'onto', 'them', "'s", '’s', 'first', 'except', 'doing', 'due', 'often', 'twelve', 'anyhow', 'same', "'m", 'very', 'over', 'herself', 'made', 'three', 'ca', 'no', 'will', 'anyway', 'because', 'their', 'has', '’l

In [0]:
text = 'this is a sample text for the AI Course'
clean_text = ' '.join([word for word in text.split() if word.lower() not in STOP_WORDS])
print(clean_text)

sample text AI Course


# Ejemplo del uso

## Split dataset

In [0]:
from sklearn.datasets import load_iris

iris = load_iris()

iris_X = iris.data
iris_y = iris.target

In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=0.2)

In [0]:
X_train.shape

(120, 4)

In [0]:
X_test.shape

(30, 4)

In [0]:
y_train.shape

(120,)

In [0]:
y_test.shape

(30,)

## Entrenar modelo Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
  y_predict = clf.predict(X_test)

In [0]:
print(y_predict)
print(y_test)

[0 1 1 0 1 0 1 0 0 2 0 2 0 0 2 2 1 0 2 0 0 0 1 2 2 2 0 1 0 1]
[0 1 1 0 2 0 1 0 0 2 0 1 0 0 2 2 2 0 2 0 0 0 1 2 2 2 0 1 0 1]


## Cálculo de métricas

In [0]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.9