In [1]:
import csv
import numpy as np
import gensim.models.word2vec as w2v
import string

#nltk
from nltk import pos_tag
from nltk import map_tag
from nltk import word_tokenize

#sklearn
from sklearn.feature_extraction import text 
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import matplotlib.pyplot as plt
%matplotlib inline

#stats
from statistics import mean
from statistics import stdev

#metrics
import mlc_metrics as metrics

#Define Scorer for Cross-Validation
def my_custom_loss_func(ground_truth, predictions):
    """Score function: Jaccard similarity between true and predicted labels (higher is better)."""
    return jaccard_similarity_score(ground_truth,predictions)
def my_hammloss(ground_truth, predictions):
    """Loss function: Hamming loss as computed by ``metrics.mlc_hamming_loss``."""
    return metrics.mlc_hamming_loss(ground_truth, predictions)
def my_custom_f1(ground_truth, predictions):
    """Score function: F1 as computed by ``metrics.mlc_f1score`` (higher is better)."""
    return metrics.mlc_f1score(ground_truth,predictions)

jaccard  = make_scorer(my_custom_loss_func, greater_is_better=True)
# Hamming loss is a loss: it must be registered with greater_is_better=False,
# otherwise model selection would prefer the configuration with the HIGHEST loss.
# The resulting scorer reports the negated loss, as scikit-learn does for all losses.
hammloss = make_scorer(my_hammloss, greater_is_better=False)
f1score = make_scorer(my_custom_f1,greater_is_better=True)

## Data helpers

In [2]:
# Module-level accumulators filled by the CSV loading loop further down in this cell:
# X collects the lower-cased sentences, Y the matching lists of integer labels.
X = []
Y = []

def collapse_4andabove(stringlabels):
    """Parse a comma-separated label string, collapsing every label other than 1-3 into 4.

    Args:
        stringlabels: text such as ``"1, 3,6"``; whitespace around each entry
            is ignored.

    Returns:
        list of int labels in input order. Labels 1, 2 and 3 are kept as they
        appear; any other integer becomes 4, which is added at most once.
        Empty or whitespace-only entries are reported with a message and
        skipped.

    Raises:
        ValueError: if a non-empty entry is not an integer.
    """
    arraylabels = []
    for raw_label in stringlabels.strip().split(','):
        # Strip BEFORE testing for emptiness: a whitespace-only entry (e.g. the
        # middle one in "1, ,3") used to pass the length check and crash int().
        token = raw_label.strip()
        if not token:
            print("Found data not annotated")
            continue
        label = int(token)
        if label in (1, 2, 3):
            arraylabels.append(label)
        elif 4 not in arraylabels:
            arraylabels.append(4)
    return arraylabels

# Load a multi-label dataset
# newline='' lets the csv module do its own newline handling (required for quoted
# fields that contain line breaks) and matches how the per-fold files are opened.
with open('datafinal/fulldata.csv', newline='', encoding='utf-8') as csvfile:
    next(csvfile, None)  # skip the header line
    csvreader = csv.reader(csvfile, delimiter=',', quotechar='"',quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
    n = 0  # number of data rows read; compared against len(X) in the next cell
    for row in csvreader:
        # Column 2 holds the sentence, column 3 the comma-separated label string.
        sentence = row[2].lower()
        labels = collapse_4andabove(row[3])
        X.append(sentence)
        Y.append(labels)
        n += 1

# Fit the binarizer on the full label set so that every fold is encoded with the same columns.
mlb = MultiLabelBinarizer()
mlb.fit(Y)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [3]:
# Sanity check: number of stored sentences vs. number of CSV rows counted while loading.
len(X),n

(7198, 7198)

In [4]:
import re
# Extra tokens treated as stop words on top of scikit-learn's English list.
additional_stopwords = ["im", "weve"]
# A single union merges the built-in list, the extras above and every
# individual punctuation character (iterating a string yields its characters).
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords, string.punctuation)

def stopword_and_punc_removal(x):
    """Return ``x`` with non-letter characters and stop words removed.

    Every character that is not an ASCII letter is replaced by a space, the
    text is split on whitespace, and the words not found in the module-level
    ``stop_words`` set are joined back with single spaces. The stop-word
    lookup is case-sensitive.
    """
    # The class must be [^a-zA-Z]: the range "A-z" also covers [ \ ] ^ _ ` and
    # therefore left those characters in the text.
    words = re.sub("[^a-zA-Z]", " ", x).split()
    return " ".join(word for word in words if word not in stop_words)

def tag_pos(x):
    """Split ``x`` into letter-only tokens and tag each one with a universal POS tag.

    Returns:
        list of ``(word, tag)`` tuples, where ``tag`` is the Penn Treebank tag
        produced by ``pos_tag`` mapped to the universal tagset via ``map_tag``.
    """
    # The class must be [^a-zA-Z]: the range "A-z" also covers [ \ ] ^ _ ` and
    # therefore left those characters glued to the tokens.
    clean = re.sub("[^a-zA-Z]"," ",x)
    token = clean.split()
    pos = pos_tag(token)
    simplified_tags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in pos]
    return simplified_tags

def tokenize(x):
    """Split ``x`` into tokens made of ASCII letters only.

    Every non-letter character (digits, punctuation, underscores, brackets...)
    acts as a separator. Returns a list of strings; empty input gives ``[]``.
    """
    # The class must be [^a-zA-Z]: the range "A-z" also covers [ \ ] ^ _ ` and
    # therefore produced tokens such as "hello_world" or "[test]".
    clean = re.sub("[^a-zA-Z]"," ",x)
    words = clean.split()
    return words

In [5]:
import os
# print("Loading Amazon pre-trained Word2Vec")
# # Amazon_w2v = w2v.Word2Vec.load(os.path.join("AmazonW2VtrainedLowerNew","AmazonW2VtrainedLowerNew.w2v"))
# Amazon_w2v = w2v.Word2Vec.load(os.path.join("word2vec/NotIncludeDataset","NotIncludeDataset.w2v"))
# print("Word2Vec Loaded!")

def generate_embedding(w2vname,X,stopword=False,verbose=False):
    """Represent every sentence in ``X`` by the mean Word2Vec vector of its content words.

    The model is loaded from ``word2vec/<w2vname>/<w2vname>.w2v`` on every call.
    Each sentence is POS-tagged with ``tag_pos`` (after
    ``stopword_and_punc_removal`` when ``stopword`` is true). Only tokens tagged
    NOUN, VERB, ADV or ADJ are looked up, first as written and then lower-cased.

    Args:
        w2vname: name of the model directory/file under ``word2vec/``.
        X: sequence of sentences (strings).
        stopword: remove stop words and punctuation before tagging.
        verbose: print progress roughly every quarter of the data plus
            vocabulary hit/miss counts at the end.

    Returns:
        list of 1-D numpy arrays, exactly one per input sentence and in the
        same order, so it stays aligned with the label list. A sentence without
        any known content word is represented by the zero vector.
    """
    print("Loading Amazon pre-trained Word2Vec:",w2vname)
    path = 'word2vec/'+w2vname
    filename = w2vname+".w2v"
    w2vmodel = w2v.Word2Vec.load(os.path.join(path,filename))
    print("Word2Vec Loaded!")

    vectors = w2vmodel.wv              # index the KeyedVectors, not the model object
    vocab = vectors.vocab
    dim = w2vmodel.vector_size         # take the dimensionality from the model instead of assuming 300
    pos_filter = ('NOUN','VERB','ADV','ADJ')

    x_embedding = []
    N = len(X)
    # max(1, ...) avoids a modulo-by-zero when fewer than four sentences are given.
    progress_step = max(1, N // 4)
    count_exist = 0
    count_doesnt_exist = 0

    for index, sentence in enumerate(X, start=1):
        if stopword:
            sentence = stopword_and_punc_removal(sentence)

        sent_vector = np.zeros(dim)
        matched = 0                    # number of word vectors summed for this sentence
        for word, tag in tag_pos(sentence):
            if tag not in pos_filter:
                continue
            if word in vocab:
                key = word
            elif word.lower() in vocab:
                # fall back to the lower-cased form in case the model was trained on lower-cased text
                key = word.lower()
            else:
                count_doesnt_exist += 1
                continue
            sent_vector += vectors[key]
            matched += 1
        count_exist += matched

        # True mean: divide by the number of vectors actually summed (the old
        # counter started at 1 and so divided k vectors by k+1).
        if matched:
            sent_vector /= matched

        # Never drop a row: skipping it would shift every following embedding
        # against its label. Neutralise NaN components instead.
        if np.isnan(sent_vector).any():
            print("Sentence",index,"produced NaN components; replaced with 0 to keep X aligned with its labels")
            sent_vector = np.nan_to_num(sent_vector)

        x_embedding.append(sent_vector)

        if verbose and index % progress_step == 0:
            print("Progress: ",index," / ",N)

    if(verbose):
        print("Done!")
        print("Words exist in W2V: ",count_exist)
        print("Words don't exist in W2V: ",count_doesnt_exist)
    print(len(x_embedding))
    return x_embedding

In [17]:
def get_cross_val_score(clf_pipeline,word2vec=False):
    """Evaluate ``clf_pipeline`` on the ten pre-split folds under ``datafinal/10fold``.

    For each fold the train and test CSVs are read (sentence in column 2,
    label string in column 3), labels are binarized with the module-level
    ``mlb``, the pipeline is fitted on the train part and used to predict the
    test part. Predictions are written with ``metrics.writeall`` and six
    metrics are collected per fold; the per-fold values and their
    mean (+/- 2 * stdev) are printed at the end.

    Args:
        clf_pipeline: estimator exposing ``fit`` and ``predict``.
        word2vec: when true, sentences are first converted with
            ``generate_embedding`` and results go to the ``w2v`` result folder
            instead of ``tfidf``.
    """
    def read_fold(csv_path):
        # Returns (lower-cased sentences, collapsed label lists) for one fold file.
        sentences, label_lists = [], []
        with open(csv_path, newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"',quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            for row in reader:
                sentences.append(row[2].lower())
                label_lists.append(collapse_4andabove(row[3]))
        return sentences, label_lists

    Result = {'scores_HL':[],'scores_Acc':[],'scores_EMR':[],'scores_F1':[],'scores_PM':[],'scores_RM':[]}

    for fold in range(1, 11):
        print("Analzying Fold:",fold)
        file_train_name = 'datafinal/10fold/Fold_train'+str(fold)+'.csv'
        file_test_name = 'datafinal/10fold/Fold_test'+str(fold)+'.csv'

        print("Loading for train:",file_train_name)
        X_train, Y_train = read_fold(file_train_name)
        print("Loading for test:",file_test_name)
        X_test, Y_test = read_fold(file_test_name)

        if word2vec:
            w2vname = 'AmazonW2VtrainedLowerNew'
            X_train = generate_embedding(w2vname,X_train,stopword=False,verbose=False)
            X_test = generate_embedding(w2vname,X_test,stopword=False,verbose=False)

        Y_train = mlb.transform(Y_train)
        Y_test = mlb.transform(Y_test)
        clf_pipeline.fit(X_train,Y_train)
        y_predict = clf_pipeline.predict(X_test)

        # The result folder depends on the feature representation in use.
        if word2vec:
            result_prefix = 'Results/10foldTop/w2v/Fold_'
        else:
            result_prefix = 'Results/10foldTop/tfidf/Fold_'
        metrics.writeall(Y_test,y_predict,result_prefix+str(fold))

        Result['scores_HL'].append(metrics.mlc_hamming_loss(Y_test, y_predict))
        Result['scores_Acc'].append(metrics.mlc_accuracy_score(Y_test, y_predict))
        Result['scores_EMR'].append(accuracy_score(Y_test, y_predict))
        Result['scores_F1'].append(metrics.mlc_f1score(Y_test, y_predict))
        Result['scores_PM'].append(precision_score(Y_test, y_predict,average='micro'))
        Result['scores_RM'].append(recall_score(Y_test, y_predict,average='micro'))

    separator = "=="*30
    print(separator)
    print("Result:"+str(Result))
    print(separator)

    # One summary line per metric: mean over the folds and twice the sample standard deviation.
    summary_lines = (
        ("Hamm Loss: %0.3f (+/- %0.3f)", 'scores_HL'),
        ("Accuracy: %0.3f (+/- %0.3f)", 'scores_Acc'),
        ("ExactMatchRatio: %0.3f (+/- %0.3f)", 'scores_EMR'),
        ("F1: %0.3f (+/- %0.3f)", 'scores_F1'),
        ("Precision Micro: %0.3f (+/- %0.3f)", 'scores_PM'),
        ("Recall Micro: %0.3f (+/- %0.3f)", 'scores_RM'),
    )
    for line_format, key in summary_lines:
        fold_scores = Result[key]
        print(line_format % (mean(fold_scores), stdev(fold_scores) * 2))

# Text Classification with TF-IDF using LinearSVC
Best for jaccard and F1 -> C=0.1, class_weight="balanced"
<br>
Best for subset accuracy (exact matching) -> C = 1
<br>
Including stopwords worsens the prediction

In [18]:
#for normal text classification with tf-idf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

text_clf = Pipeline(steps=[
    # Raw term counts, tokenized with the notebook's own `tokenize`; no stop-word filtering.
    ('vect', CountVectorizer(tokenizer=tokenize, stop_words=None)),
    # Re-weight the counts to TF-IDF.
    ('tfidf', TfidfTransformer()),
    # One linear SVM per label.
    # Alternative setting: LinearSVC(C=0.1, class_weight='balanced')
    ('clf', OneVsRestClassifier(LinearSVC(C=1))),
])

### Cross Validation Score

In [19]:
# Evaluate the TF-IDF pipeline on the ten folds (word2vec defaults to False).
get_cross_val_score(text_clf)

Analzying Fold: 1
Loading for train: datafinal/10fold/Fold_train1.csv
Loading for test: datafinal/10fold/Fold_test1.csv
Analzying Fold: 2
Loading for train: datafinal/10fold/Fold_train2.csv
Loading for test: datafinal/10fold/Fold_test2.csv
Analzying Fold: 3
Loading for train: datafinal/10fold/Fold_train3.csv
Loading for test: datafinal/10fold/Fold_test3.csv
Analzying Fold: 4
Loading for train: datafinal/10fold/Fold_train4.csv
Loading for test: datafinal/10fold/Fold_test4.csv
Analzying Fold: 5
Loading for train: datafinal/10fold/Fold_train5.csv
Loading for test: datafinal/10fold/Fold_test5.csv
Analzying Fold: 6
Loading for train: datafinal/10fold/Fold_train6.csv
Loading for test: datafinal/10fold/Fold_test6.csv
Analzying Fold: 7
Loading for train: datafinal/10fold/Fold_train7.csv
Loading for test: datafinal/10fold/Fold_test7.csv
Analzying Fold: 8
Loading for train: datafinal/10fold/Fold_train8.csv
Loading for test: datafinal/10fold/Fold_test8.csv
Analzying Fold: 9
Loading for train: dat

# Text Classification using W2V

In [20]:
pipeline = Pipeline(steps=[
    # The classifier is the only step: this pipeline is evaluated with
    # word2vec=True, so it is fed precomputed sentence embeddings.
    # Alternative setting: LinearSVC(C=0.1, class_weight='balanced')
    ('clf', OneVsRestClassifier(LinearSVC(C=1))),
])

### Cross validation score

In [21]:
# Evaluate the classifier-only pipeline on the ten folds, using Word2Vec sentence embeddings as features.
get_cross_val_score(pipeline,word2vec=True)

Analzying Fold: 1
Loading for train: datafinal/10fold/Fold_train1.csv
Loading for test: datafinal/10fold/Fold_test1.csv
Loading Amazon pre-trained Word2Vec: AmazonW2VtrainedLowerNew
Word2Vec Loaded!
6478
Loading Amazon pre-trained Word2Vec: AmazonW2VtrainedLowerNew
Word2Vec Loaded!
720
Analzying Fold: 2
Loading for train: datafinal/10fold/Fold_train2.csv
Loading for test: datafinal/10fold/Fold_test2.csv
Loading Amazon pre-trained Word2Vec: AmazonW2VtrainedLowerNew
Word2Vec Loaded!
6478
Loading Amazon pre-trained Word2Vec: AmazonW2VtrainedLowerNew
Word2Vec Loaded!
720
Analzying Fold: 3
Loading for train: datafinal/10fold/Fold_train3.csv
Loading for test: datafinal/10fold/Fold_test3.csv
Loading Amazon pre-trained Word2Vec: AmazonW2VtrainedLowerNew
Word2Vec Loaded!
6478
Loading Amazon pre-trained Word2Vec: AmazonW2VtrainedLowerNew
Word2Vec Loaded!
720
Analzying Fold: 4
Loading for train: datafinal/10fold/Fold_train4.csv
Loading for test: datafinal/10fold/Fold_test4.csv
Loading Amazon pre-