In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

import numpy as np
import itertools
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
#function to get data from excel file
def get_all_data():
    data = pd.read_csv('EmotionWheel3.csv')
    return data

In [3]:
#get rid of uppercase, new lines, non-alphabetical characters
def preprocess_data(data):
    for i in range(len(data)):
        data['Lyrics'][i] = data['Lyrics'][i].lower()
        data['Lyrics'][i] = data['Lyrics'][i].replace('\n',' ')
        data['Lyrics'][i] = ''.join(c for c in data['Lyrics'][i] if c.isalpha() | (c == ' '))
        #print(data['Lyrics'][i])
    return data 

In [4]:
#train model
def training_step(training_data, vectorizer):
    training_text = []
    training_result = []
    #separate features and labels
    for i in range(len(training_data)):
        training_text.append(training_data[i][0])
        training_result.append(training_data[i][1])
    training_text = vectorizer.fit_transform(training_text)   
    classifier = MultinomialNB().fit(training_text, training_result) #remove toarray for bernoulli/multi
#     print("Classifier: " + str(classifier))
#     print("Classifier log prior: " + str(classifier.class_log_prior_))
#     print("Feature log prob: " + str(classifier.feature_log_prob_))
    classifier.get_params()
    return classifier

In [5]:
#return block of text and its classifier prediction
def analyze_text(classifier, vectorizer, text):
#     print("Classifier prediction: " + str(classifier.predict(vectorizer.transform([text]))))
    return text, classifier.predict(vectorizer.transform([text]))

In [6]:
#evaluate model performance
def evaluate(evaluation_data):
    evaluation_text = []
    actual_result = []
    #separate features/labels
    for i in range(len(evaluation_data)):
        evaluation_text.append(evaluation_data[i][0])
        actual_result.append(int(evaluation_data[i][1]))
    #check accuracy of classifier on evaluation data
    total = len(evaluation_text)
    corrects = 0
    for i in range(0, total):
        result = classifier.predict(vectorizer.transform([evaluation_text[i]])) #remove toarray
        text = evaluation_text[i]
        corrects += 1 if result[0] == actual_result[i] else 0
    print("Accuracy: ", corrects * 100 / total)
    return "Accuracy: ", corrects * 100 / total

In [7]:
data = preprocess_data(get_all_data())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [74]:
#cross validation
cv = KFold(n_splits=5)
#check which vectorizer works best
v = [CountVectorizer(binary = 'true'), TfidfVectorizer(binary = 'true'), TfidfVectorizer(), CountVectorizer()]
for train_index, test_index in cv.split(data): 
    training_data = []
    evaluation_data = []
    #check for performance on specific emotion
    for indice in range(len(data)): 
        if indice in train_index:
            training_data.append([data['Lyrics'][indice],data['Surprise'][indice]])
        else: 
            evaluation_data.append([data['Lyrics'][indice],data['Disgust'][indice]])
    for vectorizer in v: 
        classifier = training_step(training_data, vectorizer)
        evaluate(evaluation_data)

Accuracy:  82.85714285714286
Accuracy:  80.0
Accuracy:  80.0
Accuracy:  82.85714285714286
Accuracy:  81.9047619047619
Accuracy:  78.0952380952381
Accuracy:  78.0952380952381
Accuracy:  77.14285714285714
Accuracy:  81.9047619047619
Accuracy:  75.23809523809524
Accuracy:  75.23809523809524
Accuracy:  81.9047619047619
Accuracy:  81.9047619047619
Accuracy:  79.04761904761905
Accuracy:  79.04761904761905
Accuracy:  81.9047619047619
Accuracy:  83.65384615384616
Accuracy:  77.88461538461539
Accuracy:  77.88461538461539
Accuracy:  81.73076923076923


In [75]:
print(classifier.feature_count_)

[[28.  2.  1. ...  0.  4.  2.]
 [ 0.  0.  0. ...  1.  5.  0.]]


In [76]:
def create_confusion_matrix(evaluation_data):
    evaluation_text     = [evaluation_data[0] for evaluation_data in evaluation_data]
    actual_result       = [int(evaluation_data[1]) for evaluation_data in evaluation_data]
    prediction_result   = []
    print(actual_result)
    for text in evaluation_text:
        analysis_result = analyze_text(classifier, vectorizer, text)
        prediction_result.append(int(analysis_result[1][0]))
    print(prediction_result)
#     print(evaluation_text[9])
    matrix = confusion_matrix(actual_result, prediction_result)
    return matrix

In [77]:
result = create_confusion_matrix(evaluation_data)
pd.DataFrame(
    result, 
    columns=["Negatives", "Positives"],
    index=["Negatives", "Positives"])

[0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
[0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


Unnamed: 0,Negatives,Positives
Negatives,75,6
Positives,13,10


In [12]:
def print_result(result):
    text, analysis_result = result
    print(analysis_result)
    print_text = "Positive" if analysis_result[0] == 1 else "Negative"
    print(text, ":", print_text)

In [13]:
print_result(analyze_text(classifier, vectorizer, "you are all i need coming from a guy thats on his knees at least say its fine but honestly i cant believe it that you aint leavin the world keeps testing me what can we expect i am only  you say i am alright but i am far too weak i cant believe it that you aint leavin i know that ive hurt you believe when i say i never meant too treat you this way deserve much better im such a cliché what does it matter and did it crush your mind cause i see a strange look in your eyes and every time its there i paralyze my heart stops with beating afraid that you leavin i know that ive hurt you believe when i say i never meant too treat you this way deserve much better im such a cliche what does it matter i know that ive hurt you believe when i say i never meant too treat you this way deserve much better im such a cliché what does it matter do you think youre better off alone do you think youre better off alone you think youre better off alone you think youre better off alone i know that ive hurt you believe when i say i never meant too treat you this way deserve much better im such a cliché what does it matter do you think youre better off alone do you think youre better off alone"))
print_result(analyze_text(classifier, vectorizer, "said he tried to phone me but i never have time he said that i never listen but i dont even try i got a new place in cali but im gone every night so i fill it with strangers so they keep em alive she said she told you she knows me but the face isnt right she asked if i recognized her and i told her i might see everywhere i go i got a million different people tryna kick it but im still alone in my mind i know youre dying to meet me but i can just tell you this baby as soon as you meet me youll wish that you never did youll wish that you never did i stayed a night out in paris where they dont know my name and i got into some trouble with that drink in my veins i got a problem with parties cause its loud in my brain and i can never say sorry cause i wont take the blame i know i always go missing and youre lying awake but if you ask why im distant oh im runnin away you know that everywhere i go i got a million different people tryna kick it but im still alone in my mind i know youre dying to meet me but i can just tell you this baby as soon as you meet me youll wish that you never did youll wish that you never did i i know you wanna i i know you wanna i i know you wanna slip under my armor i i know you wanna i i know you wanna i i know you wanna slip under my armor see everywhere i go i got a million different people tryna hit it but im still alone in my mind i know youre dying to meet me but i can just tell you this baby as soon as you meet me youll wish that you never did youll wish that you never did yeah yeah youll wish that you never did i know you wanna slip under my armor oh i know you wanna i i know you wanna youll wish that you never did"))
print_result(analyze_text(classifier, vectorizer, "kick mind dying to me just tell you as soon stayed paris they my name never say sorry lying awake why"))
print_result(analyze_text(classifier, vectorizer, "paris"))
print_result(analyze_text(classifier, vectorizer, "lying awake why"))
#correlation between ad cost and positive/negative reactions

[0]
you are all i need coming from a guy thats on his knees at least say its fine but honestly i cant believe it that you aint leavin the world keeps testing me what can we expect i am only  you say i am alright but i am far too weak i cant believe it that you aint leavin i know that ive hurt you believe when i say i never meant too treat you this way deserve much better im such a cliché what does it matter and did it crush your mind cause i see a strange look in your eyes and every time its there i paralyze my heart stops with beating afraid that you leavin i know that ive hurt you believe when i say i never meant too treat you this way deserve much better im such a cliche what does it matter i know that ive hurt you believe when i say i never meant too treat you this way deserve much better im such a cliché what does it matter do you think youre better off alone do you think youre better off alone you think youre better off alone you think youre better off alone i know that ive hurt 

In [14]:
print_result(analyze_text(classifier, vectorizer, "i cant wait"))

[0]
i cant wait : Negative


In [15]:
lexicon = pd.read_csv('EmotionLexicon.txt', sep="\t", header=None)

In [16]:
lexicon.columns=["Word", "Emotion", "Presence"]

In [17]:
lexicon = lexicon[(lexicon.Emotion != 'negative') & (lexicon.Emotion != 'positive')].reset_index(drop=True)

In [18]:
lexicon

Unnamed: 0,Word,Emotion,Presence
0,aback,anger,0
1,aback,anticipation,0
2,aback,disgust,0
3,aback,fear,0
4,aback,joy,0
5,aback,sadness,0
6,aback,surprise,0
7,aback,trust,0
8,abacus,anger,0
9,abacus,anticipation,0


In [19]:
def changeWeights(emotionArray, newWord):
    temp = lexicon[lexicon['Word'].values == newWord].reset_index(drop=True)
    for i in range(8): 
        emotionArray[i] += temp['Presence'][i]
    emotionArray[8] += 1
    return emotionArray

In [20]:
def lexiconize(words): 
    emotionArray = [0,0,0,0,0,0,0,0,0]
    for word in words.split(): 
        if word in lexicon.values: 
            emotionPresence = changeWeights(emotionArray, word)
    return emotionArray

In [21]:
#now need to feed in data, create random forest model for one emotion that takes emotionArray as input 

In [22]:
def manipulateData(training_data, evaluation_data): 
    print(training_data)

In [23]:
X_train = []
X_eval = []
Y_train_anger = []
Y_train_anticipation = []
Y_train_joy = []
Y_train_trust = []
Y_train_fear = []
Y_train_surprise = []
Y_train_sadness = []
Y_train_disgust = []
Y_eval_anger = []
Y_eval_anticipation = []
Y_eval_joy = []
Y_eval_trust = []
Y_eval_fear = []
Y_eval_surprise = []
Y_eval_sadness = []
Y_eval_disgust = []

# print(training_data)
# print(evaluation_data)
#check for performance on specific emotion
for indice in range(len(data)): 
    print(indice)
    if indice < len(data)*0.8:         
        X_train.append(lexiconize(data['Lyrics'][indice]))
        Y_train_anger.append(data['Anger'][indice])
        Y_train_anticipation.append(data['Anticipation'][indice])
        Y_train_joy.append(data['Joy'][indice])
        Y_train_trust.append(data['Trust'][indice])
        Y_train_fear.append(data['Fear'][indice])
        Y_train_surprise.append(data['Surprise'][indice])
        Y_train_sadness.append(data['Sadness'][indice])
        Y_train_disgust.append(data['Disgust'][indice])
    else: 
        X_eval.append(lexiconize(data['Lyrics'][indice]))
        Y_eval_anger.append(data['Anger'][indice])
        Y_eval_anticipation.append(data['Anticipation'][indice])
        Y_eval_joy.append(data['Joy'][indice])
        Y_eval_trust.append(data['Trust'][indice])
        Y_eval_fear.append(data['Fear'][indice])
        Y_eval_surprise.append(data['Surprise'][indice])
        Y_eval_sadness.append(data['Sadness'][indice])
        Y_eval_disgust.append(data['Disgust'][indice])


#manipulate data for random forest 
#new_train_data, new_eval_data = manipulateData(training_data, evaluation_data)
#do random forest 
    


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [86]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

def create_and_evaluate_rf(X_train, X_eval, Y_train, Y_eval): 
    clf = RandomForestClassifier(n_estimators=5)
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_eval)
    print("Accuracy: ", metrics.accuracy_score(Y_eval, y_pred))
    matrix = confusion_matrix(Y_eval, y_pred)
    print(matrix)

In [87]:
yTrains = [Y_train_anger, Y_train_anticipation, Y_train_joy, Y_train_trust, Y_train_fear, 
           Y_train_surprise, Y_train_sadness, Y_train_disgust]
yEvals = [Y_eval_anger, Y_eval_anticipation, Y_eval_joy, Y_eval_trust, Y_eval_fear, 
           Y_eval_surprise, Y_eval_sadness, Y_eval_disgust]
for i in range(len(yTrains)): 
    create_and_evaluate_rf(X_train, X_eval, yTrains[i], yEvals[i])

Accuracy:  0.8076923076923077
[[81  7]
 [13  3]]
Accuracy:  0.5384615384615384
[[30 28]
 [20 26]]
Accuracy:  0.5288461538461539
[[40 12]
 [37 15]]
Accuracy:  0.6346153846153846
[[22 18]
 [20 44]]
Accuracy:  0.7403846153846154
[[74 11]
 [16  3]]
Accuracy:  0.7692307692307693
[[80  2]
 [22  0]]
Accuracy:  0.5865384615384616
[[51 15]
 [28 10]]
Accuracy:  0.7307692307692307
[[66 15]
 [13 10]]
