In [1]:
import csv
import numpy as np
import gensim.models.word2vec as w2v
import string

#nltk
from nltk import pos_tag
from nltk import map_tag
from nltk import word_tokenize

#sklearn
from sklearn.feature_extraction import text 
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import matplotlib.pyplot as plt
%matplotlib inline

#statistics
from statistics import mean
from statistics import stdev

#metrics
import mlc_metrics as metrics

#Define Scorer for Cross-Validation
def my_custom_loss_func(ground_truth, predictions):
    """Return ``jaccard_similarity_score`` of the predictions against the ground truth.

    Despite the ``loss`` in its name, this forwards a similarity value; the
    arguments are passed through unchanged and in the same order.
    """
    similarity = jaccard_similarity_score(ground_truth, predictions)
    return similarity
def my_hammloss(ground_truth, predictions):
    """Forward both arguments, unchanged, to ``metrics.mlc_hamming_loss``."""
    loss = metrics.mlc_hamming_loss(ground_truth, predictions)
    return loss
def my_custom_f1(ground_truth, predictions):
    """Forward both arguments, unchanged, to ``metrics.mlc_f1score``."""
    f1 = metrics.mlc_f1score(ground_truth, predictions)
    return f1

# Wrap the metric helpers as scikit-learn scorers usable in cross-validation.
# Jaccard similarity and F1 are scores: a higher value means a better model.
jaccard  = make_scorer(my_custom_loss_func, greater_is_better=True)
# Hamming loss is a loss, so it has to be registered with
# greater_is_better=False; make_scorer then sign-flips it so that model
# selection minimises it instead of maximising it.
# NOTE(review): assumes metrics.mlc_hamming_loss is lower-is-better, as its
# name suggests - confirm against mlc_metrics.
hammloss = make_scorer(my_hammloss, greater_is_better=False)
f1score = make_scorer(my_custom_f1,greater_is_better=True)

In [14]:
# Collect the label of every selected row of the full dataset so the
# LabelEncoder below is fitted on the complete set of classes.
allfold_Y = []
# newline='' is what the csv module documentation asks for: it lets the reader
# handle line endings itself (including newlines inside quoted fields).
with open('datafinal/fulldata.csv', newline='', encoding='utf-8') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',', quotechar='"',quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
    # Skip the header as one parsed CSV record rather than one physical line.
    next(csvreader, None)
    for row in csvreader:
        # Only rows whose column 3 contains "2" are used.
        if "2" in row[3]:
            # 'Inquiry' is merged into 'Problem Discovery'.
            allfold_Y.append('Problem Discovery' if row[5] == 'Inquiry' else row[5])
print(len(allfold_Y))
le = LabelEncoder()
le.fit(allfold_Y)

1923


AttributeError: 'LabelEncoder' object has no attribute 'inverse_labels'

## Data helpers

In [3]:
import re
# Extra tokens to drop in addition to scikit-learn's English stop-word list.
additional_stopwords = ["im","weve"]
# Single union: English stop words + the extras + every character of
# string.punctuation (a str is iterated character by character).
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords, string.punctuation)
def stopword_and_punc_removal(x):
    """Return ``x`` with non-letters and stop words removed, words joined by single spaces.

    Every character that is not an ASCII letter is replaced by a space, the
    result is split on whitespace, and words found in the module-level
    ``stop_words`` set are dropped. The membership test is case-sensitive.
    """
    # The class must be "A-Z", not "A-z": the range "A-z" also spans the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept.
    words = re.sub("[^a-zA-Z]", " ", x).split()
    return " ".join(word for word in words if word not in stop_words)
def tag_pos(x):
    """Return ``[(word, tag), ...]`` for the alphabetic words of ``x``.

    Non-letter characters are replaced by spaces, the text is split on
    whitespace, tagged with ``pos_tag``, and each tag is converted with
    ``map_tag('en-ptb', 'universal', tag)``.
    """
    # "A-Z", not "A-z": the latter also matches [ \ ] ^ _ ` and would leave
    # those characters inside the tokens handed to the tagger.
    tokens = re.sub("[^a-zA-Z]", " ", x).split()
    return [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in pos_tag(tokens)]

def tokenize(x):
    """Split ``x`` into its runs of ASCII letters.

    Every character that is not an ASCII letter acts as a separator; an input
    without letters yields an empty list. Case is preserved.
    """
    # "A-Z", not "A-z": the range "A-z" also covers [ \ ] ^ _ ` and would keep
    # tokens such as "foo_bar" or "[x]" intact.
    return re.sub("[^a-zA-Z]", " ", x).split()

In [4]:
import os
def generate_embedding(w2vname,X,stopword=False,verbose=False):
    """Embed each sentence of ``X`` as the mean Word2Vec vector of its content words.

    The model is loaded from ``word2vec/<w2vname>/<w2vname>.w2v``. Each
    sentence is tagged with ``tag_pos``; only NOUN/VERB/ADV/ADJ tokens are
    looked up, first as written and then lower-cased.

    Parameters
    ----------
    w2vname : str
        Directory name and file stem of the model under ``word2vec/``.
    X : sequence of str
        Sentences to embed (indexed positionally).
    stopword : bool, default False
        If true, pass each sentence through ``stopword_and_punc_removal``
        before tagging.
    verbose : bool, default False
        If true, print progress about every quarter of the input plus the
        in-vocabulary / out-of-vocabulary word counts.

    Returns
    -------
    list of numpy.ndarray
        Exactly one vector per input sentence, in input order, so the result
        stays aligned with a parallel label list. A sentence with no
        in-vocabulary content word is represented by the zero vector.
    """
    print("Loading Amazon pre-trained Word2Vec:",w2vname)
    model_path = os.path.join('word2vec/'+w2vname, w2vname+".w2v")
    w2vmodel = w2v.Word2Vec.load(model_path)
    print("Word2Vec Loaded!")
    # Go through the KeyedVectors object: indexing the model itself is
    # deprecated, and `token in vectors` does not depend on the `vocab` dict.
    vectors = w2vmodel.wv
    # Take the dimensionality from the model instead of assuming 300.
    dim = w2vmodel.vector_size
    pos_filter = {'NOUN','VERB','ADV','ADJ'}
    x_embedding = []
    N = len(X)
    # At least 1, so the progress modulo cannot divide by zero when N < 4.
    progress_step = max(1, N // 4)
    count_exist = 0
    count_doesnt_exist = 0
    for i in range(N):
        sentence = stopword_and_punc_removal(X[i]) if stopword else X[i]
        sent_vector = np.zeros(dim)
        n_found = 0
        for token, tag in tag_pos(sentence):
            if tag not in pos_filter:
                continue
            if token in vectors:
                sent_vector += vectors[token]
            elif token.lower() in vectors:
                # Fall back to lower case in case the model was trained on lower-cased text.
                sent_vector += vectors[token.lower()]
            else:
                count_doesnt_exist += 1
                continue
            n_found += 1
        count_exist += n_found

        # True mean over the words actually found (the divisor used to be
        # n_found + 1); with no hit the vector simply stays all zeros.
        if n_found:
            sent_vector /= n_found

        if np.isnan(sent_vector).any():
            print("YES")
            # Keep the row (as zeros) instead of skipping it: dropping it
            # would shift every later vector against its label.
            sent_vector = np.zeros(dim)

        x_embedding.append(sent_vector)

        if verbose and (i + 1) % progress_step == 0:
            print("Progress: ",i + 1," / ",N)
    if(verbose):
        print("Done!")
        print("Words exist in W2V: ",count_exist)
        print("Words don't exist in W2V: ",count_doesnt_exist)
    print(len(x_embedding))
    return x_embedding

In [17]:
#Cross Validation
# from sklearn.model_selection import cross_val_score, KFold
def _load_fold_csv(path):
    """Read one fold CSV and return ``(texts, labels)`` for its selected rows.

    A row is selected when its column 3 contains "2". The text is column 2,
    lower-cased; the label is column 5 with 'Inquiry' replaced by
    'Problem Discovery'. Reading starts at the first line (no header is skipped).
    """
    texts, labels = [], []
    with open(path, newline='', encoding='utf-8') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',', quotechar='"',quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        for row in csvreader:
            if "2" in row[3]:
                texts.append(row[2].lower())
                labels.append('Problem Discovery' if row[5] == 'Inquiry' else row[5])
    return texts, labels


def get_cross_val_score(clf_pipeline,word2vec=False):
    """Fit and evaluate ``clf_pipeline`` on the six pre-split folds and print a summary.

    For fold k (1..6) the train and test sets are read from
    ``datafinal/6fold/Fold_train<k>.csv`` and ``Fold_test<k>.csv``. Labels are
    encoded with the module-level, already fitted ``le``. Predictions are
    written with ``metrics.writemulticlass`` under ``Results/6foldSoftware/``,
    and macro-averaged precision and recall are collected per fold.

    Parameters
    ----------
    clf_pipeline : estimator with ``fit`` and ``predict``
        Refitted from scratch on every fold's training data.
    word2vec : bool, default False
        If true, the texts are first converted with ``generate_embedding``
        using the 'AmazonW2VtrainedLowerNew' model; otherwise the lower-cased
        raw strings are passed to the pipeline.

    Returns
    -------
    None
        Results are only printed: the per-fold scores, then mean and
        two standard deviations of each metric.
    """
    Result = {'scores_PM':[],'scores_RM':[]}
    for fold in range(1, 7):
        print("Analzying Fold:",fold)
        file_train_name = 'datafinal/6fold/Fold_train'+str(fold)+'.csv'
        file_test_name = 'datafinal/6fold/Fold_test'+str(fold)+'.csv'
        print("Loading for train:",file_train_name)
        X_train, Y_train = _load_fold_csv(file_train_name)
        print("Loading for test:",file_test_name)
        X_test, Y_test = _load_fold_csv(file_test_name)
        if(word2vec):
            # Alternative model name used previously: 'NotIncludeDataset'
            w2vname = 'AmazonW2VtrainedLowerNew'
            X_train = generate_embedding(w2vname,X_train,stopword=False,verbose=False)
            X_test = generate_embedding(w2vname,X_test,stopword=False,verbose=False)
        Y_train = le.transform(Y_train)
        Y_test = le.transform(Y_test)
        clf_pipeline.fit(X_train,Y_train)
        y_predict = clf_pipeline.predict(X_test)
        if(word2vec):
            result_path = 'Results/6foldSoftware/w2v/Fold_w2v_'+str(fold)
        else:
            result_path = 'Results/6foldSoftware/tfidf/Fold_'+str(fold)
        metrics.writemulticlass(Y_test,y_predict,result_path)
        Result['scores_PM'].append(precision_score(Y_test, y_predict,average='macro'))
        Result['scores_RM'].append(recall_score(Y_test, y_predict,average='macro'))
    print("=="*30)
    print("Result:"+str(Result))
    print("=="*30)
    # The scores above use average='macro', so the summary must say Macro, not Micro.
    print("Precision Macro: %0.3f (+/- %0.3f)" % (mean(Result['scores_PM']), stdev(Result['scores_PM']) * 2))
    print("Recall Macro: %0.3f (+/- %0.3f)" % (mean(Result['scores_RM']), stdev(Result['scores_RM']) * 2))

# Text Classification with TF-IDF using LinearSVC
Best for Jaccard and F1 -> C=0.1, class_weight="balanced"
<br>
Best for subset accuracy (exact matching) -> C = 1
<br>
Including stopwords worsens the prediction

In [18]:
#for normal text classification with tf-idf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Token counts -> TF-IDF weighting -> linear SVM; every step uses its defaults
# (stop_words=None keeps all tokens).
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words=None)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])

In [19]:
# Evaluate the TF-IDF + LinearSVC pipeline on the six folds; word2vec defaults
# to False, so the pipeline receives the lower-cased raw text.
get_cross_val_score(text_clf)

Analzying Fold: 1
Loading for train: datafinal/6fold/Fold_train1.csv
Loading for test: datafinal/6fold/Fold_test1.csv
Analzying Fold: 2
Loading for train: datafinal/6fold/Fold_train2.csv
Loading for test: datafinal/6fold/Fold_test2.csv
Analzying Fold: 3
Loading for train: datafinal/6fold/Fold_train3.csv
Loading for test: datafinal/6fold/Fold_test3.csv
Analzying Fold: 4
Loading for train: datafinal/6fold/Fold_train4.csv
Loading for test: datafinal/6fold/Fold_test4.csv
Analzying Fold: 5
Loading for train: datafinal/6fold/Fold_train5.csv
Loading for test: datafinal/6fold/Fold_test5.csv
Analzying Fold: 6
Loading for train: datafinal/6fold/Fold_train6.csv
Loading for test: datafinal/6fold/Fold_test6.csv
Result:{'scores_PM': [0.66407187532285883, 0.70823220064724923, 0.5703474372205023, 0.64842832393704486, 0.70272653576347144, 0.69203129250480533], 'scores_RM': [0.6072545947108513, 0.67878895968783615, 0.57175200278648564, 0.6764180389624368, 0.61504476788160023, 0.59775879812353105]}
Preci

# Text Classification using W2V

In [15]:
# Classifier-only pipeline: its input is expected to be numeric feature vectors already.
pipeline = Pipeline(steps=[('clf', LinearSVC())])

In [16]:
# Same six-fold evaluation, but with word2vec=True so each text is first
# converted to a vector by generate_embedding.
get_cross_val_score(pipeline,word2vec=True)

Analzying Fold: 1
Loading for train: datafinal/6fold/Fold_train1.csv
Loading for test: datafinal/6fold/Fold_test1.csv
Loading Amazon pre-trained Word2Vec: AmazonW2VtrainedLowerNew
Word2Vec Loaded!
1521
Loading Amazon pre-trained Word2Vec: AmazonW2VtrainedLowerNew
Word2Vec Loaded!
402
Analzying Fold: 2
Loading for train: datafinal/6fold/Fold_train2.csv
Loading for test: datafinal/6fold/Fold_test2.csv
Loading Amazon pre-trained Word2Vec: AmazonW2VtrainedLowerNew
Word2Vec Loaded!
1748
Loading Amazon pre-trained Word2Vec: AmazonW2VtrainedLowerNew
Word2Vec Loaded!
175
Analzying Fold: 3
Loading for train: datafinal/6fold/Fold_train3.csv
Loading for test: datafinal/6fold/Fold_test3.csv
Loading Amazon pre-trained Word2Vec: AmazonW2VtrainedLowerNew
Word2Vec Loaded!
1829
Loading Amazon pre-trained Word2Vec: AmazonW2VtrainedLowerNew
Word2Vec Loaded!
94
Analzying Fold: 4
Loading for train: datafinal/6fold/Fold_train4.csv
Loading for test: datafinal/6fold/Fold_test4.csv
Loading Amazon pre-trained W