In [2]:
import sys, os
# Make the project's Utils directory importable: take sys.path[0], keep the
# part that precedes the first occurrence of 'BecaNLP', and append
# '<that prefix>BecaNLP/Utils' to sys.path.
# NOTE(review): if sys.path[0] does not contain 'BecaNLP', split() returns the
# whole string, so the appended entry becomes '<sys.path[0]>/BecaNLP/Utils'
# (or the relative 'BecaNLP/Utils' when sys.path[0] is empty) — confirm the
# notebook is always launched from inside the BecaNLP tree.
sys.path.append(os.path.join(sys.path[0].split('BecaNLP')[0],'BecaNLP/Utils'))

import NLPUtils as nlp

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

In [None]:
from NLPUtils.datasets.utils import split_dev_kfolds, NgramCountVectorizer
from NLPUtils.classifiers import MultinomialNB, BernoulliNB
from sklearn.metrics import f1_score

# Baseline para Sentiment Analysis binario

## IMDb corpus

In [11]:
from NLPUtils.datasets import imdb

def vectorize_imdb_data(df_all,train_idx,dev_idx):
    """Vectorize one train/dev split of the IMDb frame and binarize its ratings.

    Column 0 of ``df_all`` is used as the document corpus and column 1 as the
    rating. The vectorizer is fitted on the training rows only and then
    applied, unchanged, to the dev rows.

    Args:
        df_all: DataFrame addressed positionally through ``.iloc``.
        train_idx: Positional row indices of the training portion.
        dev_idx: Positional row indices of the dev portion.

    Returns:
        Tuple ``(X_train, y_train, X_dev, y_dev)``. The ``X_*`` values are
        whatever the vectorizer returns; the ``y_*`` values are arrays in
        which ratings below 5 became 0 and ratings above 6 became 1
        (ratings of exactly 5 or 6 are passed through unchanged).
    """
    def binary_labels(rows):
        # Operate on a copy so the ratings stored in df_all stay untouched.
        labels = df_all.iloc[rows, 1].values.copy()
        labels[labels < 5] = 0
        labels[labels > 6] = 1
        return labels

    # Uni- and bigram counts over word tokens, capped at 50000 features.
    settings = dict(
        token_pattern=r'\b\w+\b',
        unk_token=None,
        min_freq=1,
        max_freq=np.inf,
        ngram_range=(1, 2),
        max_features=50000,
    )
    train_docs = df_all.iloc[train_idx, 0]
    dev_docs = df_all.iloc[dev_idx, 0]

    counter = NgramCountVectorizer(**settings)
    X_train = counter.fit_transform(train_docs)
    X_dev = counter.transform(dev_docs)

    return X_train, binary_labels(train_idx), X_dev, binary_labels(dev_idx)

# IMDb baseline: train a multinomial Naive Bayes on one train/dev split and
# report the per-class F1 on the dev portion.
df_all = imdb.train_reader()
# Request 5 splits with a fixed random_state so the run is repeatable
# (presumably each entry is a (train_idx, dev_idx) pair — see the unpacking below).
k_folds = split_dev_kfolds(len(df_all),k_folds=5,random_state=12345)
train_idx, dev_idx = k_folds[0] # TODO: run a real k-fold; only the first split is used
X_train, y_train, X_dev, y_dev = vectorize_imdb_data(df_all,train_idx,dev_idx)
classifier = MultinomialNB()
classifier.fit(X_train,y_train)
y_predict = classifier.predict(X_dev)
# average=None makes f1_score return one score per class instead of a single number.
score = f1_score(y_dev,y_predict,average=None) # TODO: a proper f1_score still needs to be implemented
score

array([0.87377499, 0.86851776])

## SST Corpus

In [45]:
from NLPUtils.datasets import sst

def vectorize_sst_data(corpus_all,train_idx,dev_idx):
    """Vectorize one train/dev split of an SST corpus.

    Each entry of ``corpus_all`` is indexed as ``entry[0]`` (the document
    handed to the vectorizer) and ``entry[1]`` (its label). The vectorizer is
    fitted on the training entries only and then applied to the dev entries.

    Args:
        corpus_all: Indexable collection of ``(document, label)`` entries.
        train_idx: Indices into ``corpus_all`` for the training portion.
        dev_idx: Indices into ``corpus_all`` for the dev portion.

    Returns:
        Tuple ``(X_train, y_train, X_dev, y_dev)``. The ``X_*`` values are
        whatever the vectorizer returns; the ``y_*`` values are NumPy arrays
        of the selected labels.
    """
    def documents(indices):
        # Lazy, single-pass stream of the selected documents.
        return (corpus_all[pos][0] for pos in indices)

    def labels(indices):
        return np.array([corpus_all[pos][1] for pos in indices])

    train_docs, dev_docs = documents(train_idx), documents(dev_idx)

    # Unigram counts only, with no cap on the number of features.
    settings = dict(
        token_pattern=None,
        unk_token=None,
        min_freq=1,
        max_freq=np.inf,
        ngram_range=(1, 1),
        max_features=None,
    )
    counter = NgramCountVectorizer(**settings)
    X_train = counter.fit_transform(train_docs)
    X_dev = counter.transform(dev_docs)

    return X_train, labels(train_idx), X_dev, labels(dev_idx)

def binary_class_func(y):
    """Collapse a string label into a binary class.

    Returns 0 for ``'0'`` or ``'1'``, 1 for ``'3'`` or ``'4'``, and ``None``
    for any other value.
    """
    negative, positive = ('0', '1'), ('3', '4')
    if y in negative:
        return 0
    return 1 if y in positive else None

# SST baseline: train a multinomial Naive Bayes on one train/dev split and
# report the per-class F1 on the dev portion.
# Each corpus entry pairs the leaves of a tree with the label yielded by the reader.
corpus_all = [(tree.leaves(), label) for tree, label in sst.train_reader(class_func=binary_class_func)]
# Request 5 splits with a fixed random_state so the run is repeatable
# (presumably each entry is a (train_idx, dev_idx) pair — see the unpacking below).
k_folds = split_dev_kfolds(len(corpus_all),k_folds=5,random_state=12345)
train_idx, dev_idx = k_folds[0] # TODO: run a real k-fold; only the first split is used
X_train, y_train, X_dev, y_dev = vectorize_sst_data(corpus_all,train_idx,dev_idx)
classifier = MultinomialNB()
classifier.fit(X_train,y_train)
y_predict = classifier.predict(X_dev)
# average=None makes f1_score return one score per class instead of a single number.
score = f1_score(y_dev,y_predict,average=None) # TODO: a proper f1_score still needs to be implemented
score

array([0.75456712, 0.79522863])