In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import twokenize
from nltk.tokenize import TweetTokenizer
from FeatureFunctions import getfeatures
import sklearn

In [2]:
### Reading train and test files

datafile = "datasets/train/SemEval2018-T3-train-taskA_emoji.txt"
# datafile2 = "datasets/train/SemEval2018-T3-train-taskA_emoji_ironyHashtags.txt"
trainingdata = pd.read_csv(datafile, sep = "\t", header=0)

type(trainingdata['Tweet text'])
trainingdata = trainingdata[['Label','Tweet text']]
train_tweets = trainingdata['Tweet text']

testfile = 'datasets/goldtest_TaskA/SemEval2018-T3_gold_test_taskA_emoji.txt'
testdata = pd.read_csv(testfile, sep="\t", header=0)
testdata = testdata[['Label','Tweet text']]


In [3]:
### Creating 50-50% balance
## Train
amount_nonirony_train = sum(trainingdata["Label"] == 0)
amount_irony_train = sum(trainingdata["Label"] == 1)
amount_train_amount = min(amount_nonirony_train, amount_irony_train)

resulting_nonirony_train = trainingdata[trainingdata["Label"] == 0].sample(amount_train_amount)
resulting_irony_train = trainingdata[trainingdata["Label"] == 1].sample(amount_train_amount)

trainingdata = resulting_nonirony_train.append(resulting_irony_train, ignore_index=True)

## Test
amount_nonirony_test = sum(testdata["Label"] == 0)
amount_irony_test = sum(testdata["Label"] == 1)
amount_test_amount = min(amount_nonirony_test, amount_irony_test)

resulting_nonirony_test = testdata[testdata["Label"] == 0].sample(amount_test_amount)
resulting_irony_test = testdata[testdata["Label"] == 1].sample(amount_test_amount)

testdata = resulting_nonirony_test.append(resulting_irony_test, ignore_index=True)

In [13]:
### Get lexical features
# training_features
lexical_training_features, unicount_vect, bicount_vect, tricount_vect, fourcount_vect = getfeatures.getlexical(trainingdata, 'Tweet text')
x_small = lexical_training_features[['PunctuationFlood', 'CharFlood', 'CapitalizedCount', 'HashtagCount', 'Hashtag2WordRatio', 'TweetCharLength', 'TweetWordLength', 'EmojiCount', 'FinalPunctuation']].values
x_lexical = np.array(lexical_training_features.apply(lambda row: sum([row['CharFourgramVector'], row['CharTrigramVector'],row['BigramVector']], []), axis=1).values.tolist())
x_lexical = np.hstack((x_small, x_lexical))

# train_bow = lexical_training_features['UnigramVector'].values.tolist()

# test_features
lexical_test_features, unicount_vect, bicount_vect, tricount_vect, fourcount_vect = getfeatures.getlexical(testdata, 'Tweet text', unicount_vect, bicount_vect, tricount_vect, fourcount_vect)
test_x_small = lexical_test_features[['PunctuationFlood', 'CharFlood', 'CapitalizedCount', 'HashtagCount', 'Hashtag2WordRatio', 'TweetCharLength', 'TweetWordLength', 'EmojiCount', 'FinalPunctuation']].values
test_lexical_x = np.array(lexical_test_features.apply(lambda row: sum([row['CharFourgramVector'], row['CharTrigramVector'],row['BigramVector']], []), axis=1).values.tolist())
test_lexical_x = np.hstack((test_x_small, test_lexical_x))




In [25]:
train_bow = np.array(lexical_training_features['UnigramVector'].values.tolist())
test_bow = np.array(lexical_test_features['UnigramVector'].values.tolist())


array([[4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=object)

In [5]:
### Get sentiment features
train_sentiment_x = getfeatures.getaffinfeats(trainingdata['Tweet text'])
test_sentiment_x = getfeatures.getaffinfeats(testdata['Tweet text'])


In [6]:
### Combining features

whichtype = input("Which type to test: ")

if whichtype == 1:
    final_train_x = train_bow
    final_test_x = test_bow

elif whichtype == 6
    final_train_x = np.hstack((train_sentiment_x, x_lexical))
    final_test_x = np.hstack((test_sentiment_x, test_lexical_x))


In [7]:
### Import the classifier
import sys
sys.path.insert(0, 'libsvm')

from svmutil import *

In [8]:
### Train and get train error
y = trainingdata['Label'].tolist()
prob  = svm_problem(y, final_train_x)
param = svm_parameter('-t 2 -c 8 -g ' + str(2**-11))
m = svm_train(prob, param)
p_label, p_acc, p_val = svm_predict(y, final_train_x, m)
ACC, MSE, SCC = evaluations(y, p_label)

Accuracy = 92.3987% (3513/3802) (classification)


In [9]:
### Get the test
test_y = testdata['Label'].tolist()

test_p_label, test_p_acc, test_p_val = svm_predict(test_y, final_test_x, m)
test_ACC, test_MSE, test_SCC = evaluations(test_y, test_p_label)
print(sklearn.metrics.classification_report(test_y, test_p_label, target_names=["Non-irony", "Irony"], digits=4))

Accuracy = 66.2379% (412/622) (classification)
              precision    recall  f1-score   support

   Non-irony     0.6563    0.6817    0.6688       311
       Irony     0.6689    0.6431    0.6557       311

   micro avg     0.6624    0.6624    0.6624       622
   macro avg     0.6626    0.6624    0.6623       622
weighted avg     0.6626    0.6624    0.6623       622

