In [1]:
import logging
# Configure the root logger for the notebook: records at INFO and above,
# rendered as "<asctime> : <levelname> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
from utils.dataset import Dataset
# Build the Dataset from the 'data' path and split it into two parts,
# unpacked here as the training and testing datasets.
dataset = Dataset(
    path='data',
)
(training_dataset, testing_dataset) = dataset.generate_hold_out_split()

# Keep each split's `.data` attribute under a raw_* name; these are the
# inputs handed to preprocess_dataframe in the next cell.
raw_training_data = training_dataset.data
raw_testing_data = testing_dataset.data

In [3]:
from utils.preprocessing import preprocess_dataframe, extract_labels, decipher_labels
# Preprocess each raw split. The second argument is a string tag for the split
# (its purpose is not visible here -- presumably a cache/name key; TODO confirm).
training_data = preprocess_dataframe(
    raw_training_data,
    'training_data',
)
testing_data = preprocess_dataframe(
    raw_testing_data,
    'testing_data',
)

# Labels are extracted from the preprocessed splits, not from the raw ones.
training_labels = extract_labels(
    training_data,
)
testing_labels = extract_labels(
    testing_data,
)

In [4]:
from features.vectorizer_features import BoWVectorizer, TfidfVectorizer, LSIVectorizer, RPVectorizer, LDAVectorizer, WordVectorCentroidVectorizer, Doc2VecVectorizer
from features.similarity_features import CosineSimilarity, WMDSimilarity, WordOverlapSimilarity
from features.sentiment_features import PolarityScorer, EmotionScorer

2017-04-08 21:35:56,344 : INFO : 'pattern' package not found; tag filters are not available for English


In [5]:
# Apply BoWVectorizer.transform to the preprocessed training and testing splits.
training_bow = BoWVectorizer.transform(
    training_data,
    'training_data',
)
testing_bow = BoWVectorizer.transform(
    testing_data,
    'testing_data',
)

In [6]:
# Apply TfidfVectorizer.transform to the preprocessed training and testing splits.
training_tfidf = TfidfVectorizer.transform(
    training_data,
    'training_data',
)
testing_tfidf = TfidfVectorizer.transform(
    testing_data,
    'testing_data',
)

In [7]:
# Apply LSIVectorizer.transform to the preprocessed training and testing splits.
training_lsi = LSIVectorizer.transform(
    training_data,
    'training_data',
)
testing_lsi = LSIVectorizer.transform(
    testing_data,
    'testing_data',
)

In [8]:
# Apply RPVectorizer.transform to the preprocessed training and testing splits.
training_rp = RPVectorizer.transform(
    training_data,
    'training_data',
)
testing_rp = RPVectorizer.transform(
    testing_data,
    'testing_data',
)

In [9]:
# Apply LDAVectorizer.transform to the preprocessed training and testing splits.
training_lda = LDAVectorizer.transform(
    training_data,
    'training_data',
)
testing_lda = LDAVectorizer.transform(
    testing_data,
    'testing_data',
)

In [10]:
# Apply WordVectorCentroidVectorizer.transform to the preprocessed
# training and testing splits.
training_w2v = WordVectorCentroidVectorizer.transform(
    training_data,
    'training_data',
)
testing_w2v = WordVectorCentroidVectorizer.transform(
    testing_data,
    'testing_data',
)

In [11]:
# Apply Doc2VecVectorizer.transform to the preprocessed training and testing splits.
training_d2v = Doc2VecVectorizer.transform(
    training_data,
    'training_data',
)
testing_d2v = Doc2VecVectorizer.transform(
    testing_data,
    'testing_data',
)

In [12]:
# Feed the BoWVectorizer outputs through CosineSimilarity.transform,
# tagging each call with the name of the input it was computed from.
training_bow_cos = CosineSimilarity.transform(
    training_bow,
    'training_bow',
)
testing_bow_cos = CosineSimilarity.transform(
    testing_bow,
    'testing_bow',
)

In [13]:
# Feed the TfidfVectorizer outputs through CosineSimilarity.transform,
# tagging each call with the name of the input it was computed from.
training_tfidf_cos = CosineSimilarity.transform(
    training_tfidf,
    'training_tfidf',
)
testing_tfidf_cos = CosineSimilarity.transform(
    testing_tfidf,
    'testing_tfidf',
)

In [14]:
# Feed the LSIVectorizer outputs through CosineSimilarity.transform,
# tagging each call with the name of the input it was computed from.
training_lsi_cos = CosineSimilarity.transform(
    training_lsi,
    'training_lsi',
)
testing_lsi_cos = CosineSimilarity.transform(
    testing_lsi,
    'testing_lsi',
)

In [15]:
# Feed the RPVectorizer outputs through CosineSimilarity.transform,
# tagging each call with the name of the input it was computed from.
training_rp_cos = CosineSimilarity.transform(
    training_rp,
    'training_rp',
)
testing_rp_cos = CosineSimilarity.transform(
    testing_rp,
    'testing_rp',
)

In [16]:
# Feed the LDAVectorizer outputs through CosineSimilarity.transform,
# tagging each call with the name of the input it was computed from.
training_lda_cos = CosineSimilarity.transform(
    training_lda,
    'training_lda',
)
testing_lda_cos = CosineSimilarity.transform(
    testing_lda,
    'testing_lda',
)

In [17]:
# Feed the WordVectorCentroidVectorizer outputs through CosineSimilarity.transform,
# tagging each call with the name of the input it was computed from.
training_w2v_cos = CosineSimilarity.transform(
    training_w2v,
    'training_w2v',
)
testing_w2v_cos = CosineSimilarity.transform(
    testing_w2v,
    'testing_w2v',
)

In [18]:
# Feed the Doc2VecVectorizer outputs through CosineSimilarity.transform,
# tagging each call with the name of the input it was computed from.
training_d2v_cos = CosineSimilarity.transform(
    training_d2v,
    'training_d2v',
)
testing_d2v_cos = CosineSimilarity.transform(
    testing_d2v,
    'testing_d2v',
)

In [19]:
# WMDSimilarity works directly on the preprocessed splits rather than on
# the output of one of the vectorizers.
training_wmd = WMDSimilarity.transform(
    training_data,
    'training_data',
)
testing_wmd = WMDSimilarity.transform(
    testing_data,
    'testing_data',
)

In [20]:
# WordOverlapSimilarity works directly on the preprocessed splits rather
# than on the output of one of the vectorizers.
training_overlap = WordOverlapSimilarity.transform(
    training_data,
    'training_data',
)
testing_overlap = WordOverlapSimilarity.transform(
    testing_data,
    'testing_data',
)

In [21]:
# PolarityScorer is given the RAW (un-preprocessed) splits, unlike the other
# feature extractors in this notebook.
# NOTE(review): the tags are still 'training_data'/'testing_data', the same
# strings used for the preprocessed frames -- confirm this is intentional if
# the tag is used as a cache key.
training_polarities = PolarityScorer.transform(
    raw_training_data,
    'training_data',
)
testing_polarities = PolarityScorer.transform(
    raw_testing_data,
    'testing_data',
)

In [22]:
# Apply EmotionScorer.transform to the preprocessed training and testing splits.
training_emotion = EmotionScorer.transform(
    training_data,
    'training_data',
)
testing_emotion = EmotionScorer.transform(
    testing_data,
    'testing_data',
)

In [23]:
from features import extract_features, features2matrix
# Build the feature set for each split from both its preprocessed and its raw
# form; note the tags here are 'training'/'testing', not '*_data'.
training_features = extract_features(
    training_data,
    raw_training_data,
    'training',
)
testing_features = extract_features(
    testing_data,
    raw_testing_data,
    'testing',
)

# Convert each feature set with features2matrix; the np_* results are what the
# (currently commented-out) classifier cells below consume.
np_training_features = features2matrix(
    training_features,
)
np_testing_features = features2matrix(
    testing_features,
)

In [24]:
# from sklearn import tree
# classifier = tree.DecisionTreeClassifier()
# classifier.fit(np_training_features.getA(), training_labels)

In [25]:
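# NOTE(review): fit() in the previous cell is given np_training_features.getA();
# confirm whether predict() should likewise receive np_testing_features.getA()
# before re-enabling this cell.
# predictions = classifier.predict(np_testing_features)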

In [26]:
# from sklearn.metrics import confusion_matrix, f1_score
# print(confusion_matrix(testing_labels, predictions))
# print(f1_score(testing_labels, predictions, average=None))

In [27]:
# test_labels = raw_testing_data.copy()
# test_labels['Stance'] = decipher_labels(predictions, index=raw_testing_data.index)

In [28]:
# from utils.scoring import evaluate_submission
# evaluate_submission(raw_testing_data, test_labels)