# Discrete Emotional Model for Digital Scholarship

![Baylor Libraries Banner](https://github.com/Josh-Been/Sentiment-Per-Line/blob/master/Capture.PNG?raw=true "Baylor University Libraries")

This Jupyter Notebook will guide researchers through the process of applying the Valence, Arousal, and Dominance (VAD) scoring system to scholarly literature.

This Jupyter Notebook was developed by the Baylor University Libraries - Digital Scholarship Program

http://blogs.baylor.edu/digitalscholarship/

02/16/2018

## First, Ensure the Formic Library Is Installed

In [None]:
# Prefix shown in front of the notebook's progress messages
cursor='  >>  '
print cursor, 'installing formic'
# formic allows for simple subdirectory iterations
# (the leading "!" is an IPython shell escape, so pip runs from inside the notebook)
!pip install formic
# Import Python Libraries
# NOTE(review): "included in Pandas" presumably refers to the Python
# distribution in use rather than the pandas package itself -- confirm
import os, string, formic     # formic is the package installed by pip above
import pandas as pd
import numpy as np
# Tkinter / tkFileDialog supply the file-picker dialog used by browse_file
from Tkinter import *
from tkFileDialog import askopenfilename
# IFrame embeds the generated HTML charts; clear_output resets progress output
from IPython.display import IFrame
from IPython.display import display, clear_output
from ipywidgets import widgets
from wordcloud import WordCloud
import matplotlib.pyplot as plt
print '\n', cursor, 'libraries fully loaded'

## Second, Browse for Zotero CSV Export & Visualize Counts Per Year

In [None]:
# Function to strip punctuation around words, drop newlines and commas, and remove all non-ASCII characters
def clean(item):
    """Reduce a text string to plain, space-separated ASCII words.

    Each whitespace-separated token has its leading and trailing
    punctuation stripped and the tokens are re-joined with single spaces.
    Any remaining newline, carriage-return and comma characters are then
    removed, and finally only characters whose code point lies strictly
    between 0 and 127 are kept.

    item -- the text to normalise
    Returns the normalised string.
    """
    tokens = [token.strip(string.punctuation) for token in item.split()]
    text = ' '.join(tokens)
    for unwanted in ('\n', '\r', ','):
        text = text.replace(unwanted, '')
    return ''.join(ch for ch in text if 0 < ord(ch) < 127)

# function to browse for file
def browse_file():
    """Show a file-open dialog and return whatever path it reports.

    A Tk root window is created for the duration of the dialog and is
    destroyed again before the function returns.
    """
    window = Tk()
    chosen = askopenfilename()
    # Let Tk process its pending events before the root window is torn down
    window.update()
    window.destroy()
    return chosen

# function to write Google Graphs bar chart
def bar_chart(d, name):
    """Write a bar-chart HTML page from the 'bar.html' template.

    The template is read from the directory of the module-level ``zotero``
    path (with every 'csv' in that directory string removed) and is split
    on the marker '// HERE'. The page is written next to the template
    under ``name``: the text before the marker, one '["key", value],' row
    per entry of ``d`` in sorted key order, then the text after the marker.

    d    -- mapping of category (e.g. year) to count
    name -- file name of the page to create
    Returns ``name`` unchanged so it can be handed straight to IFrame.
    Raises IndexError when the template does not contain the marker.
    """
    base = os.path.dirname(zotero).replace('csv', '')

    # Context managers guarantee both files are closed even if reading,
    # splitting or writing fails part-way through.
    with open(base + 'bar.html', 'r') as template:
        html = template.read().split('// HERE')

    rows = ''.join('["' + str(k) + '", ' + str(v) + '],'
                   for k, v in sorted(d.items()))

    with open(base + name, 'w') as page:
        page.write(html[0])
        page.write(rows)
        page.write(html[1])
    return name
    
# Enable all fields to view
pd.set_option('display.max_columns', None)

# Read the CSV picked in the file dialog into a dataframe
zotero = browse_file()
df = pd.read_csv(zotero)

# Add field totalling documents per year to dataframe
df['Year_Counts'] = df.groupby(['Publication Year'])['Key'].transform('count')

# Create dictionary of counts per year.
# Rows without a publication year (NaN) are skipped: int() cannot convert
# NaN, so a single undated record would otherwise abort the whole cell.
year_counts = {}
for year, count in zip(df['Publication Year'], df['Year_Counts']):
    if pd.notnull(year) and pd.notnull(count):
        year_counts[int(year)] = int(count)

# Create visualization (last expression of the cell, so the IFrame is rendered)
IFrame(bar_chart(year_counts, 'bar_years.html'), width=1100, height=700)


## Third, Browse for Stop Words, Load Publication Text into Memory & Word Clouds

In [None]:
def remove_stop(document,stopwords):
    """Lower-case and tokenise ``document``, dropping unwanted words.

    document  -- text string; it is lower-cased and split on whitespace
    stopwords -- iterable of stop-word strings. Whitespace around each
                 entry (for instance the newline left on lines read from
                 a file) is ignored so such entries still match.

    Returns the list of words, in document order, that are not stop
    words, are not made up solely of digits, do not contain 'http' and
    are longer than three characters.
    """
    # A set gives constant-time membership tests instead of scanning a
    # list for every word of the document.
    stops = set(entry.strip() for entry in stopwords)
    return [word for word in document.lower().split()
            if word not in stops
            and not word.isdigit()
            and 'http' not in word
            and len(word) > 3]

# Browse for stop word list
stopwords = browse_file()

# Create new field in dataframe
df['full_text']=''

# Create (and clear, to be safe) new lists
full_text=[]
full_text[:]=[]

pre_cr_text=[]
pre_cr_text[:]=[]

pre_sbc_text=[]
pre_sbc_text[:]=[]

post_sbc_text=[]
post_sbc_text[:]=[]

# Path to files directory containing the text files
path=os.path.dirname(zotero).replace('csv','files')
print cursor, 'Reading full text into memory'
i=0
fileset = formic.FileSet(include='**/*.txt', directory=path)
for text in fileset:
    i+=1
    f=open(text,'r')
    for row in df['File Attachments']:
        if clean(os.path.basename(text).replace('.txt','.pdf')) in clean(row):
            ftext=f.read()
            cleaned = clean(ftext)
            full_text.append(remove_stop(cleaned,stopwords))
            if df['Publication Year'][i-1] <= 1970:
                pre_cr_text.append(remove_stop(cleaned,stopwords))
            elif df['Publication Year'][i-1] <= 1990:
                pre_sbc_text.append(remove_stop(cleaned,stopwords))
            else:
                post_sbc_text.append(remove_stop(cleaned,stopwords))
            print cursor, i, '/', len(df)
            if i%50==0:
                clear_output(wait=True)
                print cursor, 'Reading full text into memory'
            break
f.close()

all_words = ' '.join(str(r) for v in full_text for r in v)
pre_cr_text_words = ' '.join(str(r) for v in pre_cr_text for r in v)
pre_sbc_text_words = ' '.join(str(r) for v in pre_sbc_text for r in v)
post_sbc_text_words = ' '.join(str(r) for v in post_sbc_text for r in v)

print '\n', cursor, 'Building Word Cloud'
df['full_text'] = full_text

print '\n', cursor, 'Entire Corpus'
print cursor, len(full_text), 'documents'
plt.rcParams["figure.figsize"] = [25,15]
plt.rcParams.update({'font.size': 22})
wordcloud = WordCloud().generate(all_words)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

print '\n', cursor, '1970 and Earlier (before effects of Civil Rights Movement)'
print cursor, len(pre_cr_text), 'documents'
wordcloud = WordCloud().generate(pre_cr_text_words)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

print '\n', cursor, '1980 - 1990 (After Civil Rights Movement - Before SBC Resolution)'
print cursor, len(pre_sbc_text), 'documents'
wordcloud = WordCloud().generate(pre_sbc_text_words)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

print '\n', cursor, '1991 - 2000 (After SBC Resolution)'
print cursor, len(post_sbc_text), 'documents'
wordcloud = WordCloud().generate(post_sbc_text_words)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()


## Fourth, Enter Words and Phrases to Search

In [None]:
def on_button_clicked(b):
    """Click handler for the CLEAR button.

    Wipes the cell output, empties the shared phrase_search list in place
    and shows the text box and the button again.

    b -- the clicked widget (unused)
    """
    clear_output()
    del phrase_search[:]
    for widget in (text, button):
        display(widget)

def handle_submit(sender):
    phrase_search.append(text.value)
    print text.value

print 'Enter word or phrase and ENTER'
phrase_search=[]
phrase_search[:]=[]

text=widgets.Text()
display(text)
text.on_submit(handle_submit)

button=widgets.Button(description='CLEAR')
display(button)
button.on_click(on_button_clicked)

## Fifth, Browse for Discrete Emotional Ratings & Emotional Bubble Chart

In [None]:
# Browse for emotional discrete ratings list
df_emotions = pd.read_csv(browse_file())

measurables=[]
measurables[:]=[]

proximity_words=[]
proximity_words[:]=[]

aval=[]
aval[:]=[]

vval=[]
vval[:]=[]

dval=[]
dval[:]=[]

a_score=[]
a_score[:]=[]

v_score=[]
v_score[:]=[]

d_score=[]
d_score[:]=[]

p=1

print cursor, 'Processing text surrounding search word(s)'

for row in df['full_text']:
    measurables[:]=[]
    a_score[:]=[]
    v_score[:]=[]
    d_score[:]=[]
    for word in phrase_search:        
        if word in row:
            for i in range(0,len(row)-1):
                if len(row)>0:
                    if word in row[i]:
                        for a in range(1,4):
                            try:
                                measurables.append(row[i-a])     
                                n=0
                                for item in df_emotions['Word']:
                                    if row[i-a] == item:
                                        a_score.append(df_emotions['A.Mean.Sum'][n])
                                        v_score.append(df_emotions['V.Mean.Sum'][n])
                                        d_score.append(df_emotions['D.Mean.Sum'][n])
                                        break
                                    n+=1
                            except:
                                pass
                            try:
                                measurables.append(row[i+a])
                                n=0
                                for item in df_emotions['Word']:
                                    if row[i+a] == item:
                                        a_score.append(df_emotions['A.Mean.Sum'][n])
                                        v_score.append(df_emotions['V.Mean.Sum'][n])
                                        d_score.append(df_emotions['D.Mean.Sum'][n])
                                        break
                                    n+=1
                            except:
                                pass
    print cursor, p, '/', len(df)
    if p%50==0:
        clear_output(wait=True)
        print cursor, 'Processing text surrounding search word(s)'
    p+=1
    temp_list=measurables[:]
    t_a=a_score[:]
    t_v=v_score[:]
    t_d=d_score[:]
    a_temp=np.mean(t_a)
    v_temp=np.mean(t_v)
    d_temp=np.mean(t_d)
    aval.append(a_temp)
    vval.append(v_temp)
    dval.append(d_temp)
    proximity_words.append(temp_list)

# print aval
    
df['proximity']=proximity_words
df['a']=aval
df['v']=vval
df['d']=dval

df_ratings=df.groupby('Publication Year').mean()

f=open('bubble.html','r')
fb=open('line.html','r')
web=f.read()
web_b=fb.read()
f.close()
fb.close()
html = web.split('!!! HERE')
htmlb = web_b.split('!!! HERE')
f=open('bubble_emotions.html','w')
fb=open('line_emotions.html','w')
f.write(html[0])
fb.write(htmlb[0])
m=0
for year in df_ratings['Date']:
    if df_ratings['v'].iloc[m]>=0:
        if int(year)<= 1970:
            period = '1970 & earlier'
        elif int(year)<=1990:
            period = '1971 - 1990'
        else:
            period = '1991 - 2000'
        f.write('[\''+str(year)+'\', '+str(df_ratings['v'].iloc[m])+', '+str(df_ratings['a'].iloc[m])+', \''+period+'\', '+str(df_ratings['d'].iloc[m])+'],\n')
        fb.write('[\''+str(year)+'\', '+str(df_ratings['v'].iloc[m])+', '+str(df_ratings['a'].iloc[m])+', '+str(df_ratings['d'].iloc[m])+'],\n')
    m+=1
f.write(html[1])
fb.write(htmlb[1])
f.close()
fb.close()

# Create visualization
IFrame('bubble_emotions.html', width=1100, height=700) 


## Sixth, Create a Line Chart by Year

In [None]:
# Embed the line chart page; line_emotions.html is written by the previous cell
IFrame('line_emotions.html', width=1100, height=700)