In [2]:
import torch
import pandas as pd
import spacy
from collections import Counter
import numpy as np
from torch.utils.data import Dataset, DataLoader
import os

In [3]:
#!python3 -m spacy download en_core_web_sm

## Read file into DataFrame

In [4]:
# read the file into DataFrame
df = pd.read_csv('./CAMEO_IDEA_labeled_data.csv')

# separate content and label
text = df['Content']
labels = df['Category Code']

## Tokenize the text

In [5]:
# funtion tokenize sentence
tokenizer = spacy.load("en_core_web_sm")
# tokenize, lemmatize the text and drop punctuations
tokenize = lambda t: [token.lemma_ for token in tokenizer(t) if not token.is_punct]

# only tokenize the text
#tokenize = lambda t: [token.text for token in tokenizer(t)]

In [6]:
# build dictionary <key=word : value=count>
cnt = Counter()
size = text.size
for idx in range(size):
    for word in tokenize(text[idx]):
        cnt[word] += 1 

In [7]:
# filter out low-frequency word
min_threshold = 1
count = {x: count for x, count in cnt.items() if count >= min_threshold}

In [8]:
# filter out high-frequency word
max_threshold = 100
count = {x: count for x, count in cnt.items() if count <= max_threshold}

## Split dataset into train set and test set

In [9]:
from sklearn.model_selection import train_test_split

X = np.array(text)
y = np.array(labels)
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)

## Prepare for word embedding

In [10]:
# download glove dictionary
# def download_glove():
#     ! wget http://nlp.stanford.edu/data/glove.6B.zip
#     ! unzip glove.6B.zip -C data
    
# download_glove()
# ! unzip glove.6B.zip

In [11]:
# load word embedding dictionary (<key=word : value=vector>)
# word2vec dictionary
def load_embedding_dict():
    embeddings_dict = {}
    with open("glove.6B.50d.txt", 'r') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
    return embeddings_dict

glove_dic = load_embedding_dict()

### The following code is implemented by Yudan Su

#### Convert each event  into word vectors by  averaging the word embedding vectors

In [13]:
# Convert all the setences into a big matrix.
# m * d: m is the number of sentences; d is the embed size. 
def get_feature_matrix(X, glove_dic):
    sentences_matrix = []
    for line in X:
        sentences_matrix.append(sentence2vector(line, glove_dic))
    return np.array(sentences_matrix)

    
# x is a setence of words, convert it into an embedding vector of dim size
# by averaging the word embedding vectors (column-wise). 
def sentence2vector(x, glove_dic): 
    word_list = tokenize(x)
    word_matrix = []
    for word in word_list:
        if word in glove_dic:
            word_matrix.append(glove_dic[word])
    word_matrix = np.array(word_matrix)
    sentence_vector = np.mean(word_matrix, axis=0)
    return sentence_vector   

#### Build and evaluate the models

In [18]:
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
import collections

# evaluate MLP model on the dataset
# print metrics of accuracy, precision, and F1-score
def evaluate_model(X, y, model=LogisticRegression(random_state=0)):
    X = get_feature_matrix(X, glove_dic)
    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, stratify=y)
    clf = model.fit(train_x, train_y)
    predict_y = clf.predict(test_x)
    accuracy = accuracy_score(test_y, predict_y)
#     precision = precision_score(test_y, predict_y)
#     f1score = f1_score(test_y, predict_y)
#     recall = recall_score(test_y, predict_y)
    f1_weighted = f1_score(test_y, predict_y, average='weighted')
    print("accuracy: ", accuracy)
    print("f1_weighted: ", f1_weighted)
#     print ("test_y: ",test_y)
#     print ("predict_y; ", predict_y)
        
print (X.shape)
print (collections.Counter(y))
print ("Liner classifier...")
evaluate_model(X, y)
print ("MLP...")
evaluate_model(X, y, model=MLPClassifier(hidden_layer_sizes=(64,),random_state=1, max_iter=1000))
print ("SVM...")
evaluate_model(X, y, model=SVC())


(786,)
Counter({0: 705, 1: 81})
Liner classifier...




accuracy:  0.9113924050632911
f1_weighted:  0.8866062264796443
MLP...




accuracy:  0.9050632911392406
f1_weighted:  0.8929140023489496
SVM...
accuracy:  0.8987341772151899
f1_weighted:  0.8508016877637131


  'precision', 'predicted', average, warn_for)
