In [1]:
import os
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk import TweetTokenizer
from sklearn.utils import shuffle
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

In [22]:
%%time
# Integer class id assigned to each label string found in the `Label` column.
dic_aggression_level = {
    'NAG': 1,
    'CAG': 2,
    'OAG': 3,
}

# Seed used when shuffling both splits, so the row order is repeatable.
SHUFFLE_SEED = 20


def load_labelled_split(csv_path, seed=SHUFFLE_SEED):
    """Read one CSV split, shuffle it and attach the numeric class column.

    Parameters
    ----------
    csv_path : str
        Path of a CSV file that has at least the columns `Data` and `Label`.
    seed : int
        `random_state` passed to `sklearn.utils.shuffle`.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame with the columns `Data`, `Label` and `Label_num`.

    Raises
    ------
    ValueError
        If a label has no entry in `dic_aggression_level`. `Series.map` would
        otherwise turn such labels into NaN without any warning.
    """
    frame = shuffle(pd.read_csv(csv_path)[['Data', 'Label']], random_state=seed)
    frame['Label_num'] = frame['Label'].map(dic_aggression_level)
    unknown = frame.loc[frame['Label_num'].isna(), 'Label'].unique()
    if len(unknown):
        raise ValueError(f"{csv_path}: labels without a class id: {list(unknown)}")
    return frame


#train data
train_pd = load_labelled_split("train.csv")
y_train = train_pd['Label_num']

#test data (read from valid.csv; the notebook refers to it as test_* throughout)
test_pd = load_labelled_split("valid.csv")
y_test = test_pd['Label_num']

#tokenizer for word2vec
tokenizer = TweetTokenizer()
def create_tokens(x):
    """Lower-case `x` and split it into tokens.

    Parameters
    ----------
    x : object providing `.lower()` (a text string in this notebook)

    Returns
    -------
    Whatever `tokenizer.tokenize` returns for the lower-cased text, where
    `tokenizer` is the module-level tokenizer instance.
    """
    lowered = x.lower()
    return tokenizer.tokenize(lowered)
# Train word embeddings on the lower-cased, tokenised training sentences.
# NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spellings; gensim 4
# renamed them (`vector_size`, `wv.key_to_index`) - confirm the pinned version.
# NOTE(review): per the gensim docs, multi-threaded training (`workers` > 1) is
# not bit-for-bit reproducible between runs.
word2vec_model = Word2Vec(
    [create_tokens(sentence) for sentence in train_pd['Data']],
    size=100,
    window=5,
    min_count=0,
    workers=50,
)
vocabulary = set(word2vec_model.wv.vocab.keys())

# Fit TF-IDF on the same tokens, restricted to the embedding vocabulary, so
# that `tfidf_model.vocabulary_[token]` indexes `tfidf_values` for every token.
tfidf_model = TfidfVectorizer(vocabulary=vocabulary, tokenizer=create_tokens)
tfidf_model.fit(train_pd['Data'])

# Per-word weight = L2-normalised IDF. This is the value the vectoriser gives
# a document that contains every vocabulary word exactly once, computed
# directly from `idf_`. The earlier approach joined the unordered vocabulary
# set into one string and re-tokenised it, which depends on set ordering and
# is not guaranteed to reproduce the original tokens.
tfidf_values = tfidf_model.idf_ / np.linalg.norm(tfidf_model.idf_)

CPU times: user 7.73 s, sys: 136 ms, total: 7.86 s
Wall time: 4.93 s


In [20]:
# Sanity check: show the tokens produced for the training row whose index
# label is 0. `tokenizer` and `create_tokens` are defined once, in the
# data-preparation cell at the top of the notebook, and are deliberately not
# redefined here so there is a single definition to maintain.
print(create_tokens(train_pd['Data'].loc[0]))

['well', 'said', 'sonu', '..', 'you', 'have', 'courage', 'to', 'stand', 'against', 'dadagiri', 'of', 'muslims']


In [None]:
def vectorise_sentence(sentence, method='avg'):
    """Turn a sentence into one fixed-length vector built from its word embeddings.

    The sentence is lower-cased and tokenised with the module-level
    `tokenizer`; tokens that are not in `vocabulary` are ignored.

    Parameters
    ----------
    sentence : str
        Raw text to embed.
    method : {'avg', 'add', 'tfidf'}, default 'avg'
        * 'add'   - sum of the word vectors.
        * 'avg'   - arithmetic mean of the word vectors.
        * 'tfidf' - mean of the word vectors weighted by each word's entry in
          `tfidf_values` (looked up through `tfidf_model.vocabulary_`).

    Returns
    -------
    numpy.ndarray
        Float vector of length `word2vec_model.vector_size`. It is all zeros
        when no token of the sentence is in the vocabulary.

    Raises
    ------
    ValueError
        If `method` is not one of the supported names.
    """
    if method not in ('add', 'avg', 'tfidf'):
        raise ValueError(
            "method must be 'add', 'avg' or 'tfidf', got {!r}".format(method)
        )

    # The embedding vocabulary was built from lower-cased tokens, so the
    # sentence has to be lower-cased too or capitalised words never match.
    tokens = [
        token
        for token in tokenizer.tokenize(sentence.lower())
        if token in vocabulary
    ]

    # Take the dimension from the model itself rather than from a probe word
    # that might not be in the vocabulary; start from float zeros so the
    # result dtype does not depend on whether any token matched.
    vector = np.zeros(word2vec_model.vector_size)
    total_weight = 0.0
    for token in tokens:
        if method == 'tfidf':
            weight = tfidf_values[tfidf_model.vocabulary_[token]]
        else:
            weight = 1.0
        # `.wv[...]` is the supported lookup; indexing the model is deprecated.
        vector += weight * word2vec_model.wv[token]
        total_weight += weight

    # 'add' keeps the raw sum; the other two divide by the accumulated weight
    # (the token count for 'avg'). A zero total leaves the vector untouched.
    if method != 'add' and total_weight != 0:
        vector /= total_weight
    return vector

# Smoke test: TF-IDF weighted embedding of the training row whose index label is 1.
print(vectorise_sentence(sentence=train_pd['Data'][1], method='tfidf'))

In [None]:
%%time
# Linear SVM trained on plain averaged word vectors.
method = 'avg'
svm = SVC(kernel='linear')
svm.fit(
    [vectorise_sentence(text, method) for text in train_pd['Data']],
    train_pd['Label_num'],
)
print("Fitted")

# Predict the held-out split using the same sentence representation.
y_pred = svm.predict(
    [vectorise_sentence(text, method) for text in test_pd['Data']]
)

# Accuracy and weighted F1 on one line, followed by the confusion matrix.
print(
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)
print(confusion_matrix(y_test, y_pred))

In [None]:
%%time
# Linear SVM trained on TF-IDF weighted averages of the word vectors.
method = 'tfidf'
svm = SVC(kernel='linear')
svm.fit(
    [vectorise_sentence(text, method) for text in train_pd['Data']],
    train_pd['Label_num'],
)
print("Fitted")

# Predict the held-out split using the same sentence representation.
y_pred = svm.predict(
    [vectorise_sentence(text, method) for text in test_pd['Data']]
)

# Accuracy and weighted F1 on one line, followed by the confusion matrix.
print(
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)
print(confusion_matrix(y_test, y_pred))