In [27]:
import numpy as np
import pandas as pd  
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import  mutual_info_classif
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
#from textvec.vectorizers import TfrfVectorizer
#from textvec.vectorizers import TfIcfVectorizer
import os

from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics.classification import accuracy_score, f1_score
import seaborn as sns
#from textvec import vectorizers
#from textvec.vectorizers import BaseBinaryFitter
import scipy.sparse as sp
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")

# Root folder that holds the dataset CSV files read by the read*Cross() loaders.
# The loaders build paths with plain string concatenation (DIR + filename), so
# the value must end with a path separator; os.path.join(..., '') guarantees it.
# Set the TFIDFCRF_DATA_DIR environment variable to point at another checkout
# instead of editing this cell; the original author's path is the fallback.
# (A module-level `global` statement has no effect, so it was dropped.)
DIR = os.path.join(
    os.environ.get('TFIDFCRF_DATA_DIR', '/Users/gustavo/Downloads/TF-IDFC-RF-master/'),
    '',
)

In [2]:
class BaseBinaryFitter(TransformerMixin, BaseEstimator):
    """Base class for supervised term-weighting transformers (binary labels only).

    Should not be used by itself: it only implements ``fit``, which gathers
    per-term document counts; subclasses provide ``transform``.

    Inheriting from ``BaseEstimator`` gives every subclass ``get_params`` /
    ``set_params`` so the transformers can be cloned and tuned like any other
    scikit-learn estimator.

    Parameters
    ----------
    norm : 'l1', 'l2', 'max' or None, optional
        Norm used to normalize term vectors. None for no normalization.
    smooth_df : boolean or int, default=False
        When truthy, ``int(smooth_df)`` is added to the sample count and to
        each of the four per-term document counts, which prevents zero
        divisions in the subclasses' weighting formulas.
    sublinear_tf : boolean, default=False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).
    """

    def __init__(self, norm='l2', smooth_df=False, sublinear_tf=False):
        self.norm = norm
        self.smooth_df = smooth_df
        self.sublinear_tf = sublinear_tf

    def fit(self, X, y):
        """Collect per-term document counts for the two classes.

        Parameters
        ----------
        X : sparse matrix of shape (n_samples, n_features)
            Term-count matrix. Counts are taken from the ``indices`` of the
            row-masked products, so every *stored* entry counts as an
            occurrence -- assumes a CSR matrix without explicit zeros
            (TODO confirm against callers).
        y : array-like of 0/1 labels, one per row of ``X``.

        Returns
        -------
        self

        Attributes set
        --------------
        _tp : number of positive documents that contain each term
        _fp : number of positive documents that do NOT contain it (n_pos - _tp)
        _tn : number of negative documents that contain each term
        _fn : number of negative documents that do NOT contain it (n_neg - _tn)
        _n_pos, _p : number of positive documents (never smoothed)
        _n_neg, _n : number of negative documents (never smoothed)
        _n_samples, _n_features : shape of ``X`` (``_n_samples`` is smoothed)
        """
        # Accept lists / pandas Series as well as ndarrays.
        labels = np.asarray(y)
        n_samples, n_features = X.shape

        # Class sizes, computed once and reused below.
        n_pos = np.sum(labels)
        n_neg = np.sum(1 - labels)

        # Diagonal 0/1 masks: multiplying on the left keeps only the rows of
        # one class, so the surviving column indices are that class's terms.
        pos_samples = sp.spdiags(labels, 0, n_samples, n_samples)
        neg_samples = sp.spdiags(1 - labels, 0, n_samples, n_samples)
        X_pos = pos_samples * X
        X_neg = neg_samples * X

        tp = np.bincount(X_pos.indices, minlength=n_features)
        fp = n_pos - tp
        tn = np.bincount(X_neg.indices, minlength=n_features)
        fn = n_neg - tn

        self._n_pos = n_pos
        self._n_neg = n_neg
        self._n_samples = n_samples
        self._n_features = n_features

        self._tp = tp
        self._fp = fp
        self._fn = fn
        self._tn = tn
        self._p = n_pos
        self._n = n_neg

        if self.smooth_df:
            smoothing = int(self.smooth_df)
            self._n_samples += smoothing
            self._tp += smoothing
            self._fp += smoothing
            self._fn += smoothing
            self._tn += smoothing
        return self

In [28]:
def readPolarityCross():
    """Load the polarity dataset as (texts, binary labels) for cross-validation.

    Reads ``DIR + 'polarity2.arff.csv'``, whose fields are separated by the
    literal token ``##,##``, and encodes the ``y`` column as integers:
    ``'pos'`` -> 1 and ``'neg'`` -> 0. Any other label is left untouched and
    makes the final integer cast fail.

    Returns
    -------
    X : pandas.Series
        The ``text`` column.
    y : pandas.Series
        The ``y`` column cast to int.
    """
    # A multi-character separator is treated as a regular expression and is
    # only supported by the python parser; requesting it explicitly avoids the
    # silent C-engine fallback.
    data = pd.read_csv(DIR + 'polarity2.arff.csv', sep='##,##', engine='python')
    # Turn the class names into binary integers.
    data.loc[data['y'] == 'pos', 'y'] = 1
    data.loc[data['y'] == 'neg', 'y'] = 0
    final = pd.DataFrame({"text": data['text'], "y": data['y'].astype('int')})
    # The whole corpus is returned: the hold-out split that used to be computed
    # here was never used, since evaluation is done by cross-validation.
    return final['text'], final['y']

In [29]:
def readSarcasmCross():
    """Load the Amazon sarcasm dataset as (texts, binary labels) for cross-validation.

    Reads ``DIR + 'amazon-sarcarsm-limpo3-raw.arff.csv'``, whose fields are
    separated by the literal token ``##,##``, and encodes the ``y`` column as
    integers: ``'ironic'`` -> 1 and ``'regular'`` -> 0. Any other label is
    left untouched and makes the final integer cast fail.

    Returns
    -------
    X : pandas.Series
        The ``text`` column.
    y : pandas.Series
        The ``y`` column cast to int.
    """
    # A multi-character separator is treated as a regular expression and is
    # only supported by the python parser; requesting it explicitly avoids the
    # silent C-engine fallback.
    data = pd.read_csv(DIR + 'amazon-sarcarsm-limpo3-raw.arff.csv', sep='##,##', engine='python')
    # Turn the class names into binary integers.
    data.loc[data['y'] == 'ironic', 'y'] = 1
    data.loc[data['y'] == 'regular', 'y'] = 0
    final = pd.DataFrame({"text": data['text'], "y": data['y'].astype('int')})
    # The whole corpus is returned: the hold-out split that used to be computed
    # here was never used, since evaluation is done by cross-validation.
    return final['text'], final['y']

In [30]:
def readSubjectivityCross():
    """Load the subjectivity dataset as (texts, binary labels) for cross-validation.

    Reads ``DIR + 'subjectivity-raw.arff.csv'``, whose fields are separated by
    the literal token ``##,##``, and encodes the ``y`` column as integers:
    ``'subjetivas'`` -> 1 and ``'objetivas'`` -> 0. Any other label is left
    untouched and makes the final integer cast fail.

    Returns
    -------
    X : pandas.Series
        The ``text`` column.
    y : pandas.Series
        The ``y`` column cast to int.
    """
    # A multi-character separator is treated as a regular expression and is
    # only supported by the python parser; requesting it explicitly avoids the
    # silent C-engine fallback.
    data = pd.read_csv(DIR + 'subjectivity-raw.arff.csv', sep='##,##', engine='python')
    # Turn the class names into binary integers.
    data.loc[data['y'] == 'subjetivas', 'y'] = 1
    data.loc[data['y'] == 'objetivas', 'y'] = 0
    final = pd.DataFrame({"text": data['text'], "y": data['y'].astype('int')})
    # The whole corpus is returned: the hold-out split that used to be computed
    # here was never used, since evaluation is done by cross-validation.
    return final['text'], final['y']

In [31]:
def readMovieReviewCross():
    """Load the movie-review dataset as (texts, binary labels) for cross-validation.

    Reads ``DIR + 'movie-review-raw.arff.csv'``, whose fields are separated by
    the literal token ``##,##``, and encodes the ``y`` column as integers:
    ``'neg'`` -> 1 and ``'pos'`` -> 0. Any other label is left untouched and
    makes the final integer cast fail.

    NOTE(review): this encoding is the opposite of readPolarityCross(), which
    maps 'pos' to 1 -- confirm that the inversion is intended.

    Returns
    -------
    X : pandas.Series
        The ``text`` column.
    y : pandas.Series
        The ``y`` column cast to int.
    """
    # A multi-character separator is treated as a regular expression and is
    # only supported by the python parser; requesting it explicitly avoids the
    # silent C-engine fallback.
    data = pd.read_csv(DIR + 'movie-review-raw.arff.csv', sep='##,##', engine='python')
    # Turn the class names into binary integers.
    data.loc[data['y'] == 'neg', 'y'] = 1
    data.loc[data['y'] == 'pos', 'y'] = 0
    final = pd.DataFrame({"text": data['text'], "y": data['y'].astype('int')})
    # The whole corpus is returned: the hold-out split that used to be computed
    # here was never used, since evaluation is done by cross-validation.
    return final['text'], final['y']

In [7]:
def ensure_sparse_format(array, dtype=np.float64):
    """Return ``array`` as a SciPy sparse matrix whose dtype is ``dtype``.

    Parameters
    ----------
    array : sparse matrix or array-like
        Input data.
    dtype : numpy dtype, default=np.float64
        Required element type of the result.

    Returns
    -------
    sparse matrix
        * non-sparse input is wrapped in a new ``csr_matrix`` of ``dtype``;
        * sparse input of another dtype is converted with ``astype``;
        * sparse input that already has ``dtype`` is returned as the very
          same object (no copy), so in-place edits affect the caller's data.
    """
    if not sp.issparse(array):
        # Dense / array-like input: build the CSR matrix directly in the
        # target dtype.
        return sp.csr_matrix(array, dtype=dtype)
    already_typed = array.dtype == dtype
    return array if already_typed else array.astype(dtype)

In [None]:
class TfidfcrfVectorizer1(BaseBinaryFitter):
    """Square-root term frequency scaled by a class-contrast factor.

    Uses the per-term counts gathered by ``BaseBinaryFitter.fit``:

    * ``A`` (``_tp``) -- positive documents containing the term
    * ``B`` (``_fp``) -- positive documents not containing the term
    * ``C`` (``_tn``) -- negative documents containing the term
    * ``D`` (``_fn``) -- negative documents not containing the term

    and weights every term by::

        k = log2(2 + max(A, C) / (2 + min(A, C)) * sqrt(B + D))
    """

    def transform(self, X):
        """Weight a term-count matrix with the fitted per-term factors.

        Parameters
        ----------
        X : sparse matrix or array-like of shape (n_samples, n_features)
            Term counts; ``n_features`` must match the matrix seen in ``fit``.

        Returns
        -------
        sparse matrix
            ``sqrt(tf) * k`` (with ``tf`` replaced by ``1 + log(tf)`` first
            when ``sublinear_tf`` is set), row-normalized with ``self.norm``
            unless it is falsy.
        """
        tf = ensure_sparse_format(X)
        if tf is X:
            # ensure_sparse_format hands back the caller's own matrix when it
            # is already float64 sparse; copy it so the in-place log / sqrt
            # below cannot corrupt the caller's data.
            tf = tf.copy()
        if self.sublinear_tf:
            np.log(tf.data, tf.data)
            tf.data += 1
        np.sqrt(tf.data, tf.data)

        A = self._tp
        B = self._fp
        C = self._tn
        D = self._fn
        f = self._n_features

        # Experiment log kept from the original notebook (N = tp+fn+tn+fp;
        # trailing numbers are the scores the author noted for each variant):
        #   log(10+A/C) * (D+B) * N * log(10+C/B+D)                 0.8444
        #   log(10+A/C) * (D+B) * N * log(10+C/B)                   0.8435
        #   log(10+A/C) * (D/log(10+A/C)) * (N/log(10+C/B))         0.843
        #   log(10+A/C) * (D/log(10+A/C))                           0.8415
        #   log(10+A/C) * (D/log(10+A/C)) * (N/log(10+B/D))         0.8405
        #   log(10+A/C) * (B/log(10+A/C))                           0.8375
        #   log(10+A/C) * log(10+D**2/B)   ("easy to explain")      0.834
        #   log2(2+A/C) / log2(2+A/C)                               0.8195
        #   log(10+A/C)                                             0.818
        #   log2(2+tp/fn)                                           0.8065
        # NOTE(review): the original notebook stated that "all results were
        # generated with" k = log(10+A/C) * (D+B) * N * log(10+C/B+D), which is
        # NOT the formula active below -- confirm which one the reported
        # numbers correspond to.
        k = np.log2(2 + (np.maximum(A, C) / (2 + np.minimum(A, C))) * (np.sqrt(B + D)))

        # Right-multiplying by diag(k) scales column j (term j) by k[j].
        weighted = tf * sp.spdiags(k, 0, f, f)
        if self.norm:
            weighted = normalize(weighted, norm=self.norm, copy=False)
        return weighted

In [8]:
class TfIGMVectorizer(BaseBinaryFitter):
    """Term frequency scaled by a two-class inverse-gravity-moment factor.

    With ``A`` = positive documents containing the term (``_tp``) and ``C`` =
    negative documents containing it (``_tn``), every term is weighted by::

        IGM = max(A, C) / (1 * max(A, C) + 2 * min(A, C))
        k   = 1 + 7 * IGM

    NOTE(review): this looks like the TF-IGM scheme with its coefficient set
    to 7 -- confirm against the reference the notebook is based on.
    """

    def transform(self, X):
        """Weight a term-count matrix with the fitted per-term factors.

        Parameters
        ----------
        X : sparse matrix or array-like of shape (n_samples, n_features)
            Term counts; ``n_features`` must match the matrix seen in ``fit``.

        Returns
        -------
        sparse matrix
            ``tf * k`` (``tf`` replaced by ``1 + log(tf)`` when
            ``sublinear_tf`` is set), row-normalized with ``self.norm`` unless
            it is falsy.
        """
        # Always coerce to float64 sparse, as the SQRT variant of this class
        # does, so dense input is handled the same way on every code path.
        tf = ensure_sparse_format(X)
        if self.sublinear_tf:
            if tf is X:
                # Same object as the caller's matrix: copy before the
                # in-place log so the caller's data is left untouched.
                tf = tf.copy()
            np.log(tf.data, tf.data)
            tf.data += 1

        A = self._tp
        C = self._tn
        f = self._n_features

        IGM = np.maximum(A, C) / ((np.maximum(A, C) * 1) + (np.minimum(A, C) * 2))
        k = 1 + (7 * IGM)

        # Right-multiplying by diag(k) scales column j (term j) by k[j].
        weighted = tf * sp.spdiags(k, 0, f, f)
        if self.norm:
            weighted = normalize(weighted, norm=self.norm, copy=False)
        return weighted

In [9]:
class SQRTTfIGMVectorizer(BaseBinaryFitter):
    """Square-root term frequency scaled by a two-class IGM factor.

    Same per-term factor as ``TfIGMVectorizer`` -- with ``A`` = positive
    documents containing the term (``_tp``) and ``C`` = negative documents
    containing it (``_tn``)::

        IGM = max(A, C) / (1 * max(A, C) + 2 * min(A, C))
        k   = 1 + 7 * IGM

    but applied to ``sqrt(tf)`` instead of ``tf``.
    """

    def transform(self, X):
        """Weight a term-count matrix with the fitted per-term factors.

        Parameters
        ----------
        X : sparse matrix or array-like of shape (n_samples, n_features)
            Term counts; ``n_features`` must match the matrix seen in ``fit``.

        Returns
        -------
        sparse matrix
            ``sqrt(tf) * k`` (``tf`` replaced by ``1 + log(tf)`` first when
            ``sublinear_tf`` is set), row-normalized with ``self.norm`` unless
            it is falsy.
        """
        tf = ensure_sparse_format(X)
        if tf is X:
            # ensure_sparse_format hands back the caller's own matrix when it
            # is already float64 sparse; copy it so the in-place log / sqrt
            # below cannot corrupt the caller's data.
            tf = tf.copy()
        if self.sublinear_tf:
            np.log(tf.data, tf.data)
            tf.data += 1
        np.sqrt(tf.data, tf.data)

        A = self._tp
        C = self._tn
        f = self._n_features

        IGM = np.maximum(A, C) / ((np.maximum(A, C) * 1) + (np.minimum(A, C) * 2))
        k = 1 + (7 * IGM)

        # Right-multiplying by diag(k) scales column j (term j) by k[j].
        weighted = tf * sp.spdiags(k, 0, f, f)
        if self.norm:
            weighted = normalize(weighted, norm=self.norm, copy=False)
        return weighted

In [10]:
class TfIGMimpVectorizer(BaseBinaryFitter):
    """Term frequency scaled by an "improved" two-class IGM factor.

    With ``A`` = positive documents containing the term (``_tp``), ``C`` =
    negative documents containing it (``_tn``) and ``Dtotal`` = size of the
    class in which the term is most frequent::

        IGM = max(A, C) / (1 * max(A, C) + 2 * min(A, C)
                           + log10(Dtotal / max(A, C)))
        k   = 1 + 7 * IGM
    """

    def transform(self, X):
        """Weight a term-count matrix with the fitted per-term factors.

        Parameters
        ----------
        X : sparse matrix or array-like of shape (n_samples, n_features)
            Term counts; ``n_features`` must match the matrix seen in ``fit``.

        Returns
        -------
        sparse matrix
            ``tf * k`` (``tf`` replaced by ``1 + log(tf)`` when
            ``sublinear_tf`` is set), row-normalized with ``self.norm`` unless
            it is falsy.
        """
        # Always coerce to float64 sparse, as the SQRT variant of this class
        # does, so dense input is handled the same way on every code path.
        tf = ensure_sparse_format(X)
        if self.sublinear_tf:
            if tf is X:
                # Same object as the caller's matrix: copy before the
                # in-place log so the caller's data is left untouched.
                tf = tf.copy()
            np.log(tf.data, tf.data)
            tf.data += 1

        A = self._tp
        C = self._tn
        f = self._n_features

        # Size of the class where the term is most frequent. Ties go to the
        # positive class, exactly as argmax over the stacked [A, C] rows did.
        Dtotal_ti = np.where(A >= C, self._n_pos, self._n_neg)
        IGM = np.maximum(A, C) / (
            (np.maximum(A, C) * 1)
            + (np.minimum(A, C) * 2)
            + np.log10(Dtotal_ti / np.maximum(A, C))
        )
        k = 1 + (7 * IGM)

        # Right-multiplying by diag(k) scales column j (term j) by k[j].
        weighted = tf * sp.spdiags(k, 0, f, f)
        if self.norm:
            weighted = normalize(weighted, norm=self.norm, copy=False)
        return weighted

In [11]:
class SQRTTfIGMimpVectorizer(BaseBinaryFitter):
    """Square-root term frequency scaled by an "improved" two-class IGM factor.

    Same per-term factor as ``TfIGMimpVectorizer`` -- with ``A`` = positive
    documents containing the term (``_tp``), ``C`` = negative documents
    containing it (``_tn``) and ``Dtotal`` = size of the class in which the
    term is most frequent::

        IGM = max(A, C) / (1 * max(A, C) + 2 * min(A, C)
                           + log10(Dtotal / max(A, C)))
        k   = 1 + 7 * IGM

    but applied to ``sqrt(tf)`` instead of ``tf``.
    """

    def transform(self, X):
        """Weight a term-count matrix with the fitted per-term factors.

        Parameters
        ----------
        X : sparse matrix or array-like of shape (n_samples, n_features)
            Term counts; ``n_features`` must match the matrix seen in ``fit``.

        Returns
        -------
        sparse matrix
            ``sqrt(tf) * k`` (``tf`` replaced by ``1 + log(tf)`` first when
            ``sublinear_tf`` is set), row-normalized with ``self.norm`` unless
            it is falsy.
        """
        tf = ensure_sparse_format(X)
        if tf is X:
            # ensure_sparse_format hands back the caller's own matrix when it
            # is already float64 sparse; copy it so the in-place log / sqrt
            # below cannot corrupt the caller's data.
            tf = tf.copy()
        if self.sublinear_tf:
            np.log(tf.data, tf.data)
            tf.data += 1
        np.sqrt(tf.data, tf.data)

        A = self._tp
        C = self._tn
        f = self._n_features

        # Size of the class where the term is most frequent. Ties go to the
        # positive class, exactly as argmax over the stacked [A, C] rows did.
        Dtotal_ti = np.where(A >= C, self._n_pos, self._n_neg)
        IGM = np.maximum(A, C) / (
            (np.maximum(A, C) * 1)
            + (np.minimum(A, C) * 2)
            + np.log10(Dtotal_ti / np.maximum(A, C))
        )
        k = 1 + (7 * IGM)

        # Right-multiplying by diag(k) scales column j (term j) by k[j].
        weighted = tf * sp.spdiags(k, 0, f, f)
        if self.norm:
            weighted = normalize(weighted, norm=self.norm, copy=False)
        return weighted

In [12]:
class TfDELTAidf(BaseBinaryFitter):
    """Term frequency scaled by a smoothed delta-idf style factor.

    With the counts gathered by ``BaseBinaryFitter.fit`` (``A`` = ``_tp``,
    ``B`` = ``_fp``, ``C`` = ``_tn``, ``D`` = ``_fn``), ``NP = A + B`` and
    ``NN = C + D``, every term is weighted by::

        k = log2(2 + (NP + C + 0.5) / (A * NN + 0.5))
    """

    def transform(self, X):
        """Weight a term-count matrix with the fitted per-term factors.

        Parameters
        ----------
        X : sparse matrix or array-like of shape (n_samples, n_features)
            Term counts; ``n_features`` must match the matrix seen in ``fit``.

        Returns
        -------
        sparse matrix
            ``tf * k`` (``tf`` replaced by ``1 + log(tf)`` when
            ``sublinear_tf`` is set), row-normalized with ``self.norm`` unless
            it is falsy.
        """
        # Always coerce to float64 sparse so dense input is handled the same
        # way on every code path.
        tf = ensure_sparse_format(X)
        if self.sublinear_tf:
            if tf is X:
                # Same object as the caller's matrix: copy before the
                # in-place log so the caller's data is left untouched.
                tf = tf.copy()
            np.log(tf.data, tf.data)
            tf.data += 1

        A = self._tp
        B = self._fp
        C = self._tn
        D = self._fn
        NP = A + B  # positive documents with + without the term
        NN = C + D  # negative documents with + without the term
        f = self._n_features

        # NOTE(review): the numerator adds NP and C while the denominator
        # multiplies A by NN; if a symmetric delta-idf ratio was intended this
        # may have been meant as NP * C -- left unchanged, confirm.
        k = np.log2(2 + ((NP + C + 0.5) / (A * NN + 0.5)))

        # Right-multiplying by diag(k) scales column j (term j) by k[j].
        weighted = tf * sp.spdiags(k, 0, f, f)
        if self.norm:
            weighted = normalize(weighted, norm=self.norm, copy=False)
        return weighted

In [13]:
class TfrfVectorizer(BaseBinaryFitter, BaseEstimator):
    """Supervised method (supports ONLY binary classification)
    transform a count matrix to a normalized Tfrf representation
    Tf means term-frequency while RF means relevance frequency.

    Every term is weighted by ``log2(2 + tp / max(1, tn))`` where ``tp`` /
    ``tn`` are the numbers of positive / negative training documents that
    contain the term.

    Parameters
    ----------
    norm : 'l1', 'l2', 'max' or None, optional
        Norm used to normalize term vectors. None for no normalization.
    smooth_df : boolean or int, default=False
        When truthy, ``int(smooth_df)`` is added to the fitted document
        counts, as if extra documents were seen containing every term in the
        collection. Prevents zero divisions.
    sublinear_tf : boolean, default=False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).
    References
    ----------
    .. [M. Lan, C. L. Tan, J. Su, and Y. Lu] `Supervised and traditional
                term weighting methods for automatic text categorization`
    """

    def transform(self, X):
        """Weight a term-count matrix with the fitted relevance frequencies.

        Parameters
        ----------
        X : sparse matrix or array-like of shape (n_samples, n_features)
            Term counts; ``n_features`` must match the matrix seen in ``fit``.

        Returns
        -------
        sparse matrix
            ``tf * rf`` (``tf`` replaced by ``1 + log(tf)`` when
            ``sublinear_tf`` is set), row-normalized with ``self.norm`` unless
            it is falsy.
        """
        tf = ensure_sparse_format(X)
        if self.sublinear_tf:
            if tf is X:
                # Same object as the caller's matrix: copy before the
                # in-place log so the caller's data is left untouched.
                tf = tf.copy()
            np.log(tf.data, tf.data)
            tf.data += 1

        tp = self._tp
        tn = self._tn
        f = self._n_features

        # max(1, tn) keeps the ratio finite for terms that never occur in a
        # negative document.
        k = np.log2(2 + (tp / np.maximum(1, tn)))

        # Right-multiplying by diag(k) scales column j (term j) by k[j].
        weighted = tf * sp.spdiags(k, 0, f, f)
        if self.norm:
            weighted = normalize(weighted, norm=self.norm, copy=False)

        return weighted

In [14]:
class TfIcfVectorizer(BaseBinaryFitter,TransformerMixin, BaseEstimator):
    """Supervised method to transform a count matrix to a normalized
    tf * idf * icf representation.

    Tf means term-frequency, idf is computed from the number of training
    documents containing the term and ICF means inverse category frequency.
    The per-term factor is::

        k = (1 + log(n_docs / df)) * (1 + log(n_classes / cf))

    where ``df`` is the number of documents containing the term and ``cf`` the
    number of classes in which it occurs.

    NOTE(review): the category counts iterate over every distinct label, but
    ``df`` is built from ``y`` and ``1 - y``, so labels are effectively
    expected to be 0/1 -- confirm before using with more than two classes.

    Parameters
    ----------
    norm : 'l1', 'l2', 'max' or None, optional
        Norm used to normalize term vectors. None for no normalization.
    sublinear_tf : boolean, default=False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).
    smooth_df : boolean or int, default=False
        When truthy, ``int(smooth_df)`` is added to the document count and to
        the per-term document counts before the idf part is computed.
    References
    ----------
    .. [0] `https://arxiv.org/pdf/1012.2609.pdf`
    """

    def __init__(self, norm=None, sublinear_tf=False, smooth_df=False):
        self.norm = norm
        self.sublinear_tf = sublinear_tf
        self.smooth_df = smooth_df

    def fit(self, X, y):
        """Compute the per-term weighting factor ``self.k``.

        Parameters
        ----------
        X : sparse matrix of shape (n_samples, n_features)
            Term counts. Occurrences are counted from the ``indices`` of
            row-masked products, so every stored entry counts -- assumes a
            CSR matrix without explicit zeros (TODO confirm).
        y : array-like of integer labels ``0 .. n_classes - 1``.

        Returns
        -------
        self
        """
        labels = np.asarray(y)
        n_samples, n_features = X.shape

        # Diagonal 0/1 masks keep only the rows of one class.
        pos_samples = sp.spdiags(labels, 0, n_samples, n_samples)
        neg_samples = sp.spdiags(1 - labels, 0, n_samples, n_samples)

        tp = np.bincount((pos_samples * X).indices, minlength=n_features)
        fp = np.sum(labels) - tp
        tn = np.bincount((neg_samples * X).indices, minlength=n_features)
        fn = np.sum(1 - labels) - tn

        # Smoothing used to update self._tp & co., which this fit never
        # assigned, so any truthy smooth_df raised AttributeError. It is now
        # applied to the local counts that actually feed the idf term.
        n_docs = n_samples
        if self.smooth_df:
            smoothing = int(self.smooth_df)
            n_docs = n_docs + smoothing
            tp = tp + smoothing
            fp = fp + smoothing
            fn = fn + smoothing
            tn = tn + smoothing

        # Expose the same fitted counts as BaseBinaryFitter.fit does.
        self._n_samples = n_docs
        self._tp = tp
        self._fp = fp
        self._fn = fn
        self._tn = tn

        # Per-class document counts for every term, one row per label value.
        samples = []
        self.number_of_classes = len(np.unique(labels))
        for val in range(self.number_of_classes):
            class_mask = sp.spdiags(labels == val, 0, n_samples, n_samples)
            samples.append(np.bincount(
                (class_mask * X).indices, minlength=n_features))
        samples = np.array(samples)
        # Number of classes in which each term occurs at least once.
        self.corpus_occurence = np.sum(samples != 0, axis=0)
        self.k = (1 + np.log(n_docs / (tp + tn))) * (1 + np.log(self.number_of_classes / self.corpus_occurence))
        self._n_features = n_features
        return self

    def transform(self, X, min_freq=1):
        """Scale a term-count matrix by the fitted factor ``self.k``.

        Parameters
        ----------
        X : sparse matrix of shape (n_samples, n_features)
            Term counts; ``n_features`` must match the matrix seen in ``fit``.
        min_freq : int, default=1
            Accepted for interface compatibility; not used.

        Returns
        -------
        sparse matrix
            ``tf * k`` (``tf`` replaced by ``1 + log(tf)`` when
            ``sublinear_tf`` is set), row-normalized with ``self.norm`` unless
            it is falsy.
        """
        if self.sublinear_tf:
            tf = ensure_sparse_format(X)
            if tf is X:
                # Same object as the caller's matrix: copy before the
                # in-place log so the caller's data is left untouched.
                tf = tf.copy()
            np.log(tf.data, tf.data)
            tf.data += 1
            X = tf
        f = self._n_features
        # Right-multiplying by diag(k) scales column j (term j) by k[j].
        X = X * sp.spdiags(self.k, 0, f, f)
        if self.norm:
            X = normalize(X, norm=self.norm, copy=False)
        return X

In [16]:
def process (num_attributes, vectorizer, cross_val_num, classifier):
    """Cross-validate one weighting scheme / classifier pair.

    Builds the pipeline CountVectorizer -> ``vectorizer`` ->
    SelectKBest(chi2, k=num_attributes) -> ``classifier`` and evaluates it
    with ``cross_val_score`` on the module-level ``X`` and ``y`` (the dataset
    loaded by the calling cell), using 8 parallel jobs.

    Parameters
    ----------
    num_attributes : int
        Number of features kept by the chi-squared selection step.
    vectorizer : transformer
        Term-weighting step applied to the raw counts.
    cross_val_num : int or cross-validation splitter
        Passed as ``cv`` to ``cross_val_score``.
    classifier : estimator
        Final classification step.

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    pipe = Pipeline([('vect', CountVectorizer()),
                     ('vetorizer', vectorizer),
                     ('reduce_dim', SelectKBest(chi2, k=num_attributes)),
                     ('clf', classifier)])
    scores = cross_val_score(pipe, X.values, y.values, cv=cross_val_num, n_jobs=8)
    # Average over the folds actually returned rather than dividing by
    # cross_val_num, which is only correct when cv is a plain integer.
    return np.mean(scores)

In [17]:
def processAll (array_val, vectorizer, cross_val_num, classifier):
    """Run ``process`` once per feature budget and collect the scores.

    Parameters
    ----------
    array_val : iterable of int
        Feature counts to try; each one is passed to ``process`` as
        ``num_attributes``.
    vectorizer, cross_val_num, classifier
        Forwarded unchanged to every ``process`` call.

    Returns
    -------
    list
        The values returned by ``process``, in the order of ``array_val``.
    """
    return [process(n_features, vectorizer, cross_val_num, classifier)
            for n_features in array_val]


In [34]:
# Movie-review experiment: mean 5-fold cross-validation score of every
# weighting scheme for each feature budget in `lst`, first with a linear SVM
# and then with Multinomial Naive Bayes.
# NOTE: process() reads the module-level X and y assigned here, so this cell
# must run before any processAll() call for this dataset.
lst=[500,1000,2000,4000,6000,8000,10000,12000,14000]
X,y = readMovieReviewCross() # alternatives: readSubjectivityCross(), readSarcasmCross(), readPolarityCross()
resultsTFIGM_svm_MR_MR = processAll(lst, TfIGMVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))

resultsSQRTTFIGM_svm_MR_MR = processAll(lst, SQRTTfIGMVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsTFIGMimp_svm_MR_MR = processAll(lst, TfIGMimpVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsSQRTTFIGMimp_svm_MR_MR = processAll(lst, SQRTTfIGMimpVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))

resultsICF_svm_MR_MR = processAll(lst, TfIcfVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsDelta_svm_MR_MR = processAll(lst, TfDELTAidf(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsTFRFICF_svm_MR_MR = processAll(lst, TfidfcrfVectorizer1(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsTFIDF_svm_MR_MR = processAll(lst, TfidfTransformer(sublinear_tf=False, smooth_idf=False), 5, svm.SVC(kernel='linear', C=1))
resultsTF_svm_MR_MR = processAll(lst, TfidfTransformer(use_idf=False, sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsRF_svm_MR = processAll(lst, TfrfVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))

resultsTFIGM_mnb_MR_MR = processAll(lst, TfIGMVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsSQRTTFIGM_mnb_MR_MR = processAll(lst, SQRTTfIGMVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsTFIGMimp_mnb_MR_MR = processAll(lst, TfIGMimpVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsSQRTTFIGMimp_mnb_MR_MR = processAll(lst, SQRTTfIGMimpVectorizer(sublinear_tf=False), 5,MultinomialNB())

resultsICF_mnb_MR_MR = processAll(lst, TfIcfVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsDelta_mnb_MR_MR = processAll(lst, TfDELTAidf(sublinear_tf=False), 5,MultinomialNB())
resultsTFRFICF_mnb_MR_MR = processAll(lst, TfidfcrfVectorizer1(sublinear_tf=False), 5,MultinomialNB())
resultsTFIDF_mnb_MR_MR = processAll(lst, TfidfTransformer(sublinear_tf=False, smooth_idf=False), 5,MultinomialNB())
resultsTF_mnb_MR_MR = processAll(lst, TfidfTransformer(use_idf=False, sublinear_tf=False), 5,MultinomialNB())
resultsRF_mnb_MR_MR = processAll(lst, TfrfVectorizer(sublinear_tf=False), 5,MultinomialNB())

print ("--SVM-- ")
print ("TFIGM - "+ str(resultsTFIGM_svm_MR_MR))
print ("SQRTTFIGM - "+ str(resultsSQRTTFIGM_svm_MR_MR))
print ("TFIGMimp - "+ str(resultsTFIGMimp_svm_MR_MR))
print ("SQRTTFIGMimp - "+ str(resultsSQRTTFIGMimp_svm_MR_MR))
print ("TFICF - "+ str(resultsICF_svm_MR_MR))
print ("TFDelta - "+ str(resultsDelta_svm_MR_MR))
print ("TFRFICF - "+ str(resultsTFRFICF_svm_MR_MR))
print ("TFIDF - "+ str(resultsTFIDF_svm_MR_MR))
print ("TF - "+ str(resultsTF_svm_MR_MR))
print ("TFRF - "+ str(resultsRF_svm_MR))

print ("--NB-- ")
print ("TFIGM - "+ str(resultsTFIGM_mnb_MR_MR))
print ("SQRTTFIGM - "+ str(resultsSQRTTFIGM_mnb_MR_MR))
print ("TFIGMimp - "+ str(resultsTFIGMimp_mnb_MR_MR))
print ("SQRTTFIGMimp - "+ str(resultsSQRTTFIGMimp_mnb_MR_MR))
print ("TFICF - "+ str(resultsICF_mnb_MR_MR))
print ("TFDelta - "+ str(resultsDelta_mnb_MR_MR))
# The last four lines used to print the *_mnb_MR_sub variables, which belong
# to the subjectivity cell further down (NameError on a fresh kernel, or the
# wrong dataset's numbers on a stale one); they now report this cell's results.
print ("TFRFICF - "+ str(resultsTFRFICF_mnb_MR_MR))
print ("TFIDF - "+ str(resultsTFIDF_mnb_MR_MR))
print ("TF - "+ str(resultsTF_mnb_MR_MR))
print ("TFRF - "+ str(resultsRF_mnb_MR_MR))

KeyboardInterrupt: 

In [None]:
# Subjectivity experiment: mean 5-fold cross-validation score of every
# weighting scheme for each feature budget in `lst` (defined in the
# movie-review cell), first with a linear SVM and then with Multinomial NB.
# process() reads the module-level X and y assigned on the next line.
# The result variables keep an "_MR_" infix although the data here is the
# subjectivity corpus; the "_sub" suffix is what identifies the dataset.
X,y = readSubjectivityCross() # alternatives: readSarcasmCross(), readPolarityCross()
resultsTFIGM_svm_MR_sub = processAll(lst, TfIGMVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsSQRTTFIGM_svm_MR_sub = processAll(lst, SQRTTfIGMVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsTFIGMimp_svm_MR_sub = processAll(lst, TfIGMimpVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsSQRTTFIGMimp_svm_MR_sub = processAll(lst, SQRTTfIGMimpVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))

resultsICF_svm_MR_sub = processAll(lst, TfIcfVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsDelta_svm_MR_sub = processAll(lst, TfDELTAidf(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsTFRFICF_svm_MR_sub = processAll(lst, TfidfcrfVectorizer1(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsTFIDF_svm_MR_sub = processAll(lst, TfidfTransformer(sublinear_tf=False, smooth_idf=False), 5, svm.SVC(kernel='linear', C=1))
resultsTF_svm_MR_sub = processAll(lst, TfidfTransformer(use_idf=False, sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsRF_svm_sub = processAll(lst, TfrfVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))

resultsTFIGM_mnb_MR_sub = processAll(lst, TfIGMVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsSQRTTFIGM_mnb_MR_sub = processAll(lst, SQRTTfIGMVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsTFIGMimp_mnb_MR_sub = processAll(lst, TfIGMimpVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsSQRTTFIGMimp_mnb_MR_sub = processAll(lst, SQRTTfIGMimpVectorizer(sublinear_tf=False), 5,MultinomialNB())

resultsICF_mnb_MR_sub = processAll(lst, TfIcfVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsDelta_mnb_MR_sub = processAll(lst, TfDELTAidf(sublinear_tf=False), 5,MultinomialNB())
resultsTFRFICF_mnb_MR_sub = processAll(lst, TfidfcrfVectorizer1(sublinear_tf=False), 5,MultinomialNB())
resultsTFIDF_mnb_MR_sub = processAll(lst, TfidfTransformer(sublinear_tf=False, smooth_idf=False), 5,MultinomialNB())
resultsTF_mnb_MR_sub = processAll(lst, TfidfTransformer(use_idf=False, sublinear_tf=False), 5,MultinomialNB())
resultsRF_mnb_MR_sub = processAll(lst, TfrfVectorizer(sublinear_tf=False), 5,MultinomialNB())

# Report: one list of mean scores per scheme, ordered like `lst`.
print ("--SVM-- ")
print ("TFIGM - "+ str(resultsTFIGM_svm_MR_sub))
print ("SQRTTFIGM - "+ str(resultsSQRTTFIGM_svm_MR_sub))
print ("TFIGMimp - "+ str(resultsTFIGMimp_svm_MR_sub))
print ("SQRTTFIGMimp - "+ str(resultsSQRTTFIGMimp_svm_MR_sub))
print ("TFICF - "+ str(resultsICF_svm_MR_sub))
print ("TFDelta - "+ str(resultsDelta_svm_MR_sub))
print ("TFRFICF - "+ str(resultsTFRFICF_svm_MR_sub))
print ("TFIDF - "+ str(resultsTFIDF_svm_MR_sub))
print ("TF - "+ str(resultsTF_svm_MR_sub))
print ("TFRF - "+ str(resultsRF_svm_sub))

print ("--NB-- ")
print ("TFIGM - "+ str(resultsTFIGM_mnb_MR_sub))
print ("SQRTTFIGM - "+ str(resultsSQRTTFIGM_mnb_MR_sub))
print ("TFIGMimp - "+ str(resultsTFIGMimp_mnb_MR_sub))
print ("SQRTTFIGMimp - "+ str(resultsSQRTTFIGMimp_mnb_MR_sub))
print ("TFICF - "+ str(resultsICF_mnb_MR_sub))
print ("TFDelta - "+ str(resultsDelta_mnb_MR_sub))
print ("TFRFICF - "+ str(resultsTFRFICF_mnb_MR_sub))
print ("TFIDF - "+ str(resultsTFIDF_mnb_MR_sub))
print ("TF - "+ str(resultsTF_mnb_MR_sub))
print ("TFRF - "+ str(resultsRF_mnb_MR_sub))

In [None]:
# Sarcasm experiment: mean 5-fold cross-validation score of every weighting
# scheme for each feature budget in `lst` (defined in the movie-review cell),
# first with a linear SVM and then with Multinomial NB.
# process() reads the module-level X and y assigned on the next line.
# The result variables keep an "_MR_" infix although the data here is the
# sarcasm corpus; the "_sar" suffix is what identifies the dataset.
X,y = readSarcasmCross() # alternative: readPolarityCross()
resultsTFIGM_svm_MR_sar = processAll(lst, TfIGMVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsSQRTTFIGM_svm_MR_sar = processAll(lst, SQRTTfIGMVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsTFIGMimp_svm_MR_sar = processAll(lst, TfIGMimpVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsSQRTTFIGMimp_svm_MR_sar = processAll(lst, SQRTTfIGMimpVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))

resultsICF_svm_MR_sar = processAll(lst, TfIcfVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsDelta_svm_MR_sar = processAll(lst, TfDELTAidf(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsTFRFICF_svm_MR_sar = processAll(lst, TfidfcrfVectorizer1(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsTFIDF_svm_MR_sar = processAll(lst, TfidfTransformer(sublinear_tf=False, smooth_idf=False), 5, svm.SVC(kernel='linear', C=1))
resultsTF_svm_MR_sar = processAll(lst, TfidfTransformer(use_idf=False, sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsRF_svm_sar = processAll(lst, TfrfVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))

resultsTFIGM_mnb_MR_sar = processAll(lst, TfIGMVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsSQRTTFIGM_mnb_MR_sar = processAll(lst, SQRTTfIGMVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsTFIGMimp_mnb_MR_sar = processAll(lst, TfIGMimpVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsSQRTTFIGMimp_mnb_MR_sar = processAll(lst, SQRTTfIGMimpVectorizer(sublinear_tf=False), 5,MultinomialNB())

resultsICF_mnb_MR_sar = processAll(lst, TfIcfVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsDelta_mnb_MR_sar = processAll(lst, TfDELTAidf(sublinear_tf=False), 5,MultinomialNB())
resultsTFRFICF_mnb_MR_sar = processAll(lst, TfidfcrfVectorizer1(sublinear_tf=False), 5,MultinomialNB())
resultsTFIDF_mnb_MR_sar = processAll(lst, TfidfTransformer(sublinear_tf=False, smooth_idf=False), 5,MultinomialNB())
resultsTF_mnb_MR_sar = processAll(lst, TfidfTransformer(use_idf=False, sublinear_tf=False), 5,MultinomialNB())
resultsRF_mnb_MR_sar = processAll(lst, TfrfVectorizer(sublinear_tf=False), 5,MultinomialNB())

# Report: one list of mean scores per scheme, ordered like `lst`.
print ("--SVM-- ")
print ("TFIGM - "+ str(resultsTFIGM_svm_MR_sar))
print ("SQRTTFIGM - "+ str(resultsSQRTTFIGM_svm_MR_sar))
print ("TFIGMimp - "+ str(resultsTFIGMimp_svm_MR_sar))
print ("SQRTTFIGMimp - "+ str(resultsSQRTTFIGMimp_svm_MR_sar))
print ("TFICF - "+ str(resultsICF_svm_MR_sar))
print ("TFDelta - "+ str(resultsDelta_svm_MR_sar))
print ("TFRFICF - "+ str(resultsTFRFICF_svm_MR_sar))
print ("TFIDF - "+ str(resultsTFIDF_svm_MR_sar))
print ("TF - "+ str(resultsTF_svm_MR_sar))
print ("TFRF - "+ str(resultsRF_svm_sar))

print ("--NB-- ")
print ("TFIGM - "+ str(resultsTFIGM_mnb_MR_sar))
print ("SQRTTFIGM - "+ str(resultsSQRTTFIGM_mnb_MR_sar))
print ("TFIGMimp - "+ str(resultsTFIGMimp_mnb_MR_sar))
print ("SQRTTFIGMimp - "+ str(resultsSQRTTFIGMimp_mnb_MR_sar))
print ("TFICF - "+ str(resultsICF_mnb_MR_sar))
print ("TFDelta - "+ str(resultsDelta_mnb_MR_sar))
print ("TFRFICF - "+ str(resultsTFRFICF_mnb_MR_sar))
print ("TFIDF - "+ str(resultsTFIDF_mnb_MR_sar))
print ("TF - "+ str(resultsTF_mnb_MR_sar))
print ("TFRF - "+ str(resultsRF_mnb_MR_sar))

In [None]:
# Polarity experiment: mean 5-fold cross-validation score of every weighting
# scheme for each feature budget in `lst` (defined in the movie-review cell),
# first with a linear SVM and then with Multinomial NB.
# process() reads the module-level X and y assigned on the next line.
# The result variables keep an "_MR_" infix although the data here is the
# polarity corpus; the "_Pol" suffix is what identifies the dataset.
X,y = readPolarityCross()
print ("Polarity dataset")
print (lst)
resultsTFIGM_svm_MR_Pol = processAll(lst, TfIGMVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsSQRTTFIGM_svm_MR_Pol = processAll(lst, SQRTTfIGMVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsTFIGMimp_svm_MR_Pol = processAll(lst, TfIGMimpVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsSQRTTFIGMimp_svm_MR_Pol = processAll(lst, SQRTTfIGMimpVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsICF_svm_MR_Pol = processAll(lst, TfIcfVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsDelta_svm_MR_Pol = processAll(lst, TfDELTAidf(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsTFRFICF_svm_MR_Pol = processAll(lst, TfidfcrfVectorizer1(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsTFIDF_svm_MR_Pol = processAll(lst, TfidfTransformer(sublinear_tf=False, smooth_idf=False), 5, svm.SVC(kernel='linear', C=1))
resultsTF_svm_MR_Pol = processAll(lst, TfidfTransformer(use_idf=False, sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))
resultsRF_svm_Pol = processAll(lst, TfrfVectorizer(sublinear_tf=False), 5, svm.SVC(kernel='linear', C=1))

resultsTFIGM_mnb_MR_Pol = processAll(lst, TfIGMVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsSQRTTFIGM_mnb_MR_Pol = processAll(lst, SQRTTfIGMVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsTFIGMimp_mnb_MR_Pol = processAll(lst, TfIGMimpVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsSQRTTFIGMimp_mnb_MR_Pol = processAll(lst, SQRTTfIGMimpVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsICF_mnb_MR_Pol = processAll(lst, TfIcfVectorizer(sublinear_tf=False), 5,MultinomialNB())
resultsDelta_mnb_MR_Pol = processAll(lst, TfDELTAidf(sublinear_tf=False), 5,MultinomialNB())
resultsTFRFICF_mnb_MR_Pol = processAll(lst, TfidfcrfVectorizer1(sublinear_tf=False), 5,MultinomialNB())
resultsTFIDF_mnb_MR_Pol = processAll(lst, TfidfTransformer(sublinear_tf=False, smooth_idf=False), 5,MultinomialNB())
resultsTF_mnb_MR_Pol = processAll(lst, TfidfTransformer(use_idf=False, sublinear_tf=False), 5,MultinomialNB())
resultsRF_mnb_MR_Pol = processAll(lst, TfrfVectorizer(sublinear_tf=False), 5,MultinomialNB())
# Report: one list of mean scores per scheme, ordered like `lst`.
print ("--SVM-- ")
print ("TFIGM - "+ str(resultsTFIGM_svm_MR_Pol))
print ("SQRTTFIGM - "+ str(resultsSQRTTFIGM_svm_MR_Pol))
print ("TFIGMimp - "+ str(resultsTFIGMimp_svm_MR_Pol))
print ("SQRTTFIGMimp - "+ str(resultsSQRTTFIGMimp_svm_MR_Pol))
print ("TFICF - "+ str(resultsICF_svm_MR_Pol))
print ("TFDelta - "+ str(resultsDelta_svm_MR_Pol))
print ("TFRFICF - "+ str(resultsTFRFICF_svm_MR_Pol))
print ("TFIDF - "+ str(resultsTFIDF_svm_MR_Pol))
print ("TF - "+ str(resultsTF_svm_MR_Pol))
print ("TFRF - "+ str(resultsRF_svm_Pol))

print ("--NB-- ")
print ("TFIGM - "+ str(resultsTFIGM_mnb_MR_Pol))
print ("SQRTTFIGM - "+ str(resultsSQRTTFIGM_mnb_MR_Pol))
print ("TFIGMimp - "+ str(resultsTFIGMimp_mnb_MR_Pol))
print ("SQRTTFIGMimp - "+ str(resultsSQRTTFIGMimp_mnb_MR_Pol))
print ("TFICF - "+ str(resultsICF_mnb_MR_Pol))
print ("TFDelta - "+ str(resultsDelta_mnb_MR_Pol))
print ("TFRFICF - "+ str(resultsTFRFICF_mnb_MR_Pol))
print ("TFIDF - "+ str(resultsTFIDF_mnb_MR_Pol))
print ("TF - "+ str(resultsTF_mnb_MR_Pol))
print ("TFRF - "+ str(resultsRF_mnb_MR_Pol))



Polarity dataset
[500, 1000, 2000, 4000, 6000, 8000, 10000, 12000, 14000]
