In [1]:
import pandas as pd
import re
from nltk.tokenize import sent_tokenize, word_tokenize
import lexApr as lex
import CSR
import subset
import random

In [3]:
#Open the training dataset
training_set = "../Data/train_data_red.csv"
train_data = pd.read_csv(training_set, engine='python')
    
#dropna drops missing values(not available)
train_data = train_data.dropna(axis=0)
print (train_data.sentiment.unique())

#GET THE RAW FEATURES

X = train_data.content
#change the value of sentiment from string to int
y = pd.Categorical(pd.factorize(train_data.sentiment)[0])
print (y.unique())

#Clean the data and replace each seperator with a single fullstop
#also we use POS tagging for emoticons, urls , @-mentions and hashtags
print ("POS Tagging and cleaning...")

X = [re.sub(r'[^\x00-\x7f]',r' ',s) for s in X]             #remove non-ascii characters
X = [re.sub(r'https?:\/\/[^ ]*',r'URL',s) for s in X]       #replace urls
#replace the negative or positive emoticons with tags
pos_regex = '[:;]-?[)Dp]+|<3'
neg_regex = ':-?\'?[(/Oo]+'
X = [re.sub(pos_regex, ' posE ',s) for s in X]
X = [re.sub(neg_regex, ' negE ',s) for s in X]

X = [re.sub(r'[.,!;?:]+',r'. ',s) for s in X]             #replace seperators for tokenization
X = [re.sub(r'#[^ ]*',r'HASHTAG', s) for s in X]          #replace hashtags
X = [re.sub(r'@[^ ]*',r'AT_MENTION', s) for s in X]       #replace @-mentions
X = [re.sub("[^A-Za-z_.' ]+",r' ', s) for s in X]

#Tokenize each microblog text into sentences
print ("Sentence tokenization...")
X = [word_tokenize(s) for s in X]

#find is sentences contain conjuction words
#if they do split them and save the position of the conjuction word

#open conjuctions.txt and save each word to list
word_list = [line.rstrip('\n') for line in open("../Data/conjunctions.txt")]

XX = []
for tweet in X:
    tmp = []
    s = ''
    for word in tweet:
        if word in word_list:
            if s != '':
                tmp.append(s)
            tmp.append(word)
            s = word
        elif word == '.':
            if s != '':
                tmp.append(s)
            s = ''
        else:
            if s == '':
                s += word
            else:
                s += " " + word
    if s != '':
        tmp.append(s)
    XX.append(tmp)

#-----------------PART 1-----------------#
#lexicon-based method
emo = lex.lex(XX)

ruleitems = []
Xvals = []
for i,v in enumerate(XX):
    tmp = []
    for j,sent in enumerate(v):
        if sent in word_list:
            tmp.append(sent)
        else:
            if emo[i][j][0] == emo[i][j][1]:
                tmp.append(str(emo[i][j][0]))
            else:
                tmp.append((str(emo[i][j][0]), str(emo[i][j][1])))
    Xvals.append(tmp)
    ruleitems.append([tmp,y[i]])


['empty' 'sadness' 'neutral' 'surprise' 'happiness' 'anger']
[0, 1, 2, 3, 4, 5]
Categories (6, int64): [0, 1, 2, 3, 4, 5]
POS Tagging and cleaning...
Sentence tokenization...
LEXICON-BASED METHOD WITH LEXICON: NRC-Emotion-Lexicon-v0.92-English.csv
Opening the lexicon...
Tokenizing...
Creating the tuple...
Finding the sentiment...


In [4]:
#-----------------PART 2-----------------#
#Minning Class Sequential Rules
csrs = CSR.CSR_apriori(ruleitems[:100], 0.0001, 0.005)
print(csrs)
Xtrain = []
for x in Xvals:
    Xtrain.append([1 if subset.subset(i,x) else 0 for i in csrs])

Running CSR-apriori on data with minsup 0.01% and minconf 0.5%...
[('5', '0'), 'either', '0', ('4', '0'), 'for', ('5', '1'), ('5', '4'), ('1', '0'), 'so', ('4', '0'), 'if', 'but', 'so', ('5', '1'), '0', 'or', ('1', '5'), 'cause', ('4', '1'), ('4', '3'), 'for', ('1', '5'), 'when', 'if', 'for', 'so', ('5', '1'), ('1', '3'), 'for', '0', ('4', '3'), ('3', '0'), 'til', ('4', '0'), 'til', ('5', '0'), ('1', '0'), 'because', '0', 'but', ('4', '1'), ('5', '4'), '0', (('1', '0'), '0'), (('1', '0'), 'but'), (('1', '0'), 'either'), (('1', '5'), ('4', '0')), (('1', '5'), '0'), (('1', '5'), 'so'), (('4', '0'), ('5', '0')), (('4', '0'), '0'), (('4', '0'), 'but'), (('4', '0'), 'til'), (('4', '0'), '0'), (('4', '0'), 'so'), (('4', '0'), '0'), (('4', '1'), '0'), (('4', '3'), '0'), (('4', '3'), 'but'), (('4', '3'), 'for'), (('4', '3'), '0'), (('5', '0'), '0'), (('5', '0'), '0'), (('5', '0'), 'for'), (('5', '0'), 'or'), (('5', '0'), 'when'), (('5', '1'), '0'), (('5', '1'), 'so'), (('5', '1'), '0'), (('5',

In [11]:
 #SVM on all data
import numpy
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn import svm

#split the training data in order to get a cross validation set
xtr, xcv, ytr, ycv = train_test_split(Xtrain, y, random_state=42, test_size=0.2)
print(len(xtr),len(ytr),len(xcv),len(ycv))

TypeError: take_nd() got an unexpected keyword argument 'axis'

In [None]:
#fit the svm
#using linear kernel
svc = svm.SVC(kernel='linear', C=1, probability=True)
svc = svc.fit(xtr, ytr)