In [1]:
import csv
import pandas as pd
import numpy as np
import os

from clean_text import *

In [2]:
#spreadsheet holding the positive and negative word lists, obtained from:
#http://www.wjh.harvard.edu/~inquirer/homecat.htm
pos_neg_file = 'inquirerbasic.xls'

#directory (relative to the notebook) that a later cell switches into to read the text files
with_i_text_dir = './Part/cleaned_with_i'

In [3]:
#load the whole spreadsheet into a DataFrame: one row per entry, one column per category tag
#the individual word lists are pulled out of it in later cells
df = pd.read_excel(pos_neg_file)

#preview the first rows to see how the category columns are laid out
df.head()

Unnamed: 0,Entry,Source,Positiv,Negativ,Pstv,Affil,Ngtv,Hostile,Strong,Power,...,Anomie,NegAff,PosAff,SureLw,If,NotLw,TimeSpc,FormLw,Othtags,Defined
0,A,H4Lvd,,,,,,,,,...,,,,,,,,,DET ART,| article: Indefinite singular article--some o...
1,ABANDON,H4Lvd,,Negativ,,,Ngtv,,,,...,,,,,,,,,SUPV,|
2,ABANDONMENT,H4,,Negativ,,,,,,,...,,,,,,,,,Noun,|
3,ABATE,H4Lvd,,Negativ,,,,,,,...,,,,,,,,,SUPV,|
4,ABATEMENT,Lvd,,,,,,,,,...,,,,,,,,,Noun,


In [4]:
#(rows, columns) of the full spreadsheet
print(df.shape)

(11788, 186)


In [5]:
#the spreadsheet carries every category; keep only the entry text plus the four sentiment tags
sentiment_cols = ['Entry', 'Positiv', 'Negativ', 'PosAff', 'NegAff']
df = df.loc[:, sentiment_cols]

df.head()

Unnamed: 0,Entry,Positiv,Negativ,PosAff,NegAff
0,A,,,,
1,ABANDON,,Negativ,,
2,ABANDONMENT,,Negativ,,
3,ABATE,,Negativ,,
4,ABATEMENT,,,,


In [6]:
def _entries_tagged(frame, tag):
    """Return the lowercase entries of `frame` that are explicitly labeled with `tag`.

    A row counts as labeled only when its `tag` column contains the tag name
    itself (e.g. the 'Positiv' column holds the string 'Positiv'); blank (NaN)
    cells never match, so unlabeled rows are excluded.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain an 'Entry' column and a column named `tag`.
    tag : str
        Column name, which is also the marker value inside that column.

    Returns
    -------
    list
        The matching 'Entry' values, lowercased, in row order.
    """
    tagged = frame.loc[frame[tag] == tag, 'Entry']
    #dropna() returns a new Series instead of mutating the selection in place,
    #so the step is safe to re-run and does not depend on copy/view semantics
    return tagged.dropna().str.lower().tolist()


#only take explicitly labeled data, lowercased and converted from a Series to a list
positive = _entries_tagged(df, 'Positiv')
negative = _entries_tagged(df, 'Negativ')
pos_aff = _entries_tagged(df, 'PosAff')
neg_aff = _entries_tagged(df, 'NegAff')

In [7]:
#sanity check on list sizes: the website reports 1915 pos and 2291 neg entries
len_pos, len_neg = len(positive), len(negative)

pos_msg = 'There are {} positive words in the positive word list.'
neg_msg = 'There are {} negative words in the negative word list.'
print(pos_msg.format(len_pos))
print(neg_msg.format(len_neg), '\n')

#the affect lists are expected to hold 126 pos_aff and 193 neg_aff words
pos_aff_msg = 'There are {} positive words in the positive affect word list.'
neg_aff_msg = 'There are {} negative words in the negative affect word list.'
print(pos_aff_msg.format(len(pos_aff)))
print(neg_aff_msg.format(len(neg_aff)), '\n')

There are 1915 positive words in the positive word list.
There are 2291 negative words in the negative word list. 

There are 126 positive words in the positive affect word list.
There are 193 negative words in the negative affect word list. 



In [8]:
#look at first five words of each list
print(positive[:5], '\n')
print(negative[:5])

['abide', 'ability', 'able', 'abound', 'absolve'] 

['abandon', 'abandonment', 'abate', 'abdicate', 'abhor']


In [9]:
#combine the general list with its affect list (positive + pos_aff, negative + neg_aff)
#and drop duplicates.
#dict.fromkeys keeps the first occurrence of every word in its original position, so the
#resulting order is the same on every run (a plain set() would reorder the words
#differently from one kernel session to the next). Building new lists instead of
#extending in place also means the inputs are not mutated.
positive = list(dict.fromkeys(positive + pos_aff))
negative = list(dict.fromkeys(negative + neg_aff))

In [10]:
print('There are {} more positive words in the positive word list now.'.format(len(positive) - len_pos))
print('There are {} more negative words in the negative word list now.'.format(len(negative) - len_neg), '\n')

There are 24 more positive words in the positive word list now.
There are 27 more negative words in the negative word list now. 



It looks like we acquired a few more words in each category, slightly more in the negative one. Now let's check for common positive and negative words.

In [11]:
#are some everyday positive words covered by the positive list?
for word in ['good', 'great', 'happy', 'love']:
    print(word in positive)
#last positive check carries the blank line that separates the two groups in the output
print('excellent' in positive, '\n')

#same question for everyday negative words
common_negative = ['bad', 'terrible', 'sad', 'depressed', 'tired', 'bored', 'alone']
for word in common_negative:
    print(word in negative)

False
False
False
False
True 

True
True
True
False
False
False
False


In [12]:
#everyday words to add by hand to each list
extra_positive = ['good', 'great', 'love', 'happy', 'like']
extra_negative = ['depressed', 'tired', 'bored', 'alone', 'annoying',
                  'irritate', 'irritated', 'bother', 'bothered']

#only add a word when it is not already present: several of these were never checked
#against the lists, and the guard also keeps the cell safe to re-run without
#piling up duplicate entries
for word in extra_positive:
    if word not in positive:
        positive.append(word)

for word in extra_negative:
    if word not in negative:
        negative.append(word)

### Create positive and negative count columns

For this analysis, I am choosing to create a total count of positive and negative words used by each participant. Another notebook has shown that a frequency-based approach (with large, sparse vectors) did not work well in predicting PHQ-8 scores. Thus, I am trying to create my own feature that can hopefully distinguish either the binary labels or the high and low raw scores from each other.

In [13]:
#move into the text directory: the reading loop opens files by bare name and the
#final cell steps back out with a relative path, so the working directory matters.
#NOTE: the path is relative, so re-running this cell without restarting the kernel
#will most likely fail (the path would be resolved from inside the new directory).
#os.chdir returns None, so its result is not kept.
os.chdir(with_i_text_dir)

#list the directory contents in sorted order so that every per-participant
#list below is filled in the same order
this_dir = os.getcwd()
filenames = sorted(os.listdir(this_dir))

#participant numbers, one per file, filled in by the reading loop
participant_nums = []

#document list for inspection later on
docs = []

#per-participant results, filled in by the reading loop:
#pos / neg           -> positive / negative word counts
#pos_freq / neg_freq -> positive / negative word frequencies
pos, neg = [], []
pos_freq, neg_freq = [], []

In [14]:
#iterate through the documents and record, for each participant, how many positive
#and negative words they used

#the word lists do not change inside the loop, so build the lookup sets once
positive_set = set(positive)
negative_set = set(negative)

for filename in filenames:
    if not filename.endswith('.txt'):
        continue

    #read in file contents; the context manager closes the file even if read() raises
    with open(filename, 'rt') as transcript:
        text = transcript.read()

    #the first three characters of the filename are parsed as the participant number
    participant_nums.append(int(filename[:3]))

    #tokenize comes from clean_text -- presumably returns a list of word strings (TODO confirm)
    tokens = tokenize(text)

    #append tokens to document list
    docs.append(tokens)

    #distinct positive / negative words used: the *intersection* of the
    #participant's vocabulary with each word list
    token_set = set(tokens)
    pos_in_tokens = positive_set & token_set
    neg_in_tokens = negative_set & token_set
    pos.append(len(pos_in_tokens))
    neg.append(len(neg_in_tokens))

    #total occurrences (every repeat counts), as opposed to the distinct counts above;
    #membership tests against a set keep this linear in the number of tokens
    count_pos = sum(1 for word in tokens if word in pos_in_tokens)
    count_neg = sum(1 for word in tokens if word in neg_in_tokens)

    pos_freq.append(count_pos)
    neg_freq.append(count_neg)

In [15]:
#check out what the set-intersection method above yields for one example document:
#the distinct words of docs[1] that also appear in each word list
#(the order of the printed words is not meaningful, since sets are unordered)
print(list(set(docs[1]) & set(positive)), '\n')
print(list(set(docs[1]) & set(negative)))

['okay', 'outgoing', 'assistant', 'accomplish', 'creative', 'accomplishment', 'good', 'able', 'like', 'friend', 'memorable', 'best', 'positive', 'great', 'love', 'law', 'happy'] 

['prod', 'horrible', 'negative', 'difficult', 'shove', 'annoy', 'dull', 'problem', 'provoke', 'alone', 'angry', 'bad']


In [16]:
#gather the per-participant lists under their column names, then build the
#dataframe that is saved as a csv afterwards
count_columns = {
    'part_num': participant_nums,
    'num_pos': pos,
    'num_neg': neg,
    'pos_freq': pos_freq,
    'neg_freq': neg_freq,
}
dataframe = pd.DataFrame(count_columns)

In [18]:
#change to main directory, save as a csv
#the relative path assumes the working directory is still the two-level-deep
#text directory entered earlier ('./Part/cleaned_with_i')
os.chdir('../../')
#with the default arguments the row index is written as the first, unnamed column
dataframe.to_csv('pos_neg_counts.csv')