*****************************************************
### Sentence-level sentiment
### Stanford CoreNLP

In [1]:
from pandas import Series, DataFrame
import pandas as pd
from numpy.random import randn
import numpy as np
import re
from pycorenlp import StanfordCoreNLP

In [2]:
def _read_hs_df(csv_path):
    """Load one tab-separated table and index it by ``home_id``.

    The ``"Unnamed: 0"`` column stored in the file is dropped before the
    index is set.
    """
    raw = pd.read_csv(csv_path, sep='\t', encoding="utf-8")
    return raw.drop("Unnamed: 0", axis=1).set_index(['home_id'])


# Tables read from the clean_hs_df_* files, one per score from 7 to 10.
clean_score7_hs_df = _read_hs_df("./Data/clean_hs_df_7.csv")
clean_score8_hs_df = _read_hs_df("./Data/clean_hs_df_8.csv")
clean_score9_hs_df = _read_hs_df("./Data/clean_hs_df_9.csv")
clean_score10_hs_df = _read_hs_df("./Data/clean_hs_df_10.csv")
# Tables read from the loc_hs_df_* files, one per score from 7 to 10.
loc_score7_hs_df = _read_hs_df("./Data/loc_hs_df_7.csv")
loc_score8_hs_df = _read_hs_df("./Data/loc_hs_df_8.csv")
loc_score9_hs_df = _read_hs_df("./Data/loc_hs_df_9.csv")
loc_score10_hs_df = _read_hs_df("./Data/loc_hs_df_10.csv")

In [3]:
# Shared client for the Stanford CoreNLP server; the address is hard-coded,
# so a server has to be reachable at localhost:9000 when sentences are
# annotated.
nlp = StanfordCoreNLP('http://localhost:9000')

def stf_nlp_sentiment(text):
    """Return the CoreNLP sentiment value of the first sentence in *text*.

    Args:
        text: The text to annotate. Only the first sentence found by the
            server contributes to the result.

    Returns:
        The ``sentimentValue`` field of the first sentence in the server's
        JSON reply, or ``None`` when there is nothing to classify: *text*
        is empty or whitespace-only, or the reply contains no sentences.

    Raises:
        RuntimeError: If the reply is not a parsed JSON object. pycorenlp
            hands back the raw response text when it cannot decode JSON
            (for example a server-side error or timeout message).
    """
    if not text:
        return None
    # Whitespace-only input has no sentence to score; skip the round trip.
    if isinstance(text, str) and not text.strip():
        return None

    res = nlp.annotate(text,
                       properties={
                           'annotators': 'sentiment',
                           'outputFormat': 'json'
                       })

    if not isinstance(res, dict):
        # Surface the server's message instead of failing later with an
        # opaque "string indices must be integers" TypeError.
        raise RuntimeError(
            "CoreNLP did not return JSON for %r: %r"
            % (text[:80], str(res)[:200])
        )

    sentences = res.get("sentences")
    if not sentences:
        return None
    return sentences[0]["sentimentValue"]

def sentiment_summary(hs_df):
    """Build a per-row table of sentiment counts for the three sents columns.

    For every index label of *hs_df* the sentences stored in
    ``'sents_0.5'``, ``'sents_0.6'`` and ``'sents_0.7'`` are classified via
    ``count_sentiment`` and the resulting counts are written to one row.

    Args:
        hs_df: DataFrame whose ``sents_*`` cells are iterable collections of
            sentences and which has a ``'num_of_sents'`` column.

    Returns:
        A DataFrame sharing ``hs_df.index`` with 16 columns: ``num_sents``
        followed by five counts (neg, neut, pos, very_pos, very_neg) for
        each of the 0.5, 0.6 and 0.7 columns.
    """
    columns = ['num_sents',
               'num_neg_0.5', 'num_neut_0.5', 'num_pos_0.5', 'num_very_pos_0.5', 'num_very_neg_0.5',
               'num_neg_0.6', 'num_neut_0.6', 'num_pos_0.6', 'num_very_pos_0.6', 'num_very_neg_0.6',
               'num_neg_0.7', 'num_neut_0.7', 'num_pos_0.7', 'num_very_pos_0.7', 'num_very_neg_0.7']
    sentiment_summary_df = pd.DataFrame(index=hs_df.index, columns=columns)

    for pos, idx in enumerate(hs_df.index):
        # Assign through a single .iloc call. The chained form
        # ``df.loc[idx][:6] = ...`` writes to a temporary row object and is
        # silently lost when pandas returns a copy (always the case under
        # Copy-on-Write).
        sentiment_summary_df.iloc[pos, :6] = count_sentiment(hs_df, idx, 'sents_0.5')
        sentiment_summary_df.iloc[pos, 6:11] = count_sentiment(hs_df, idx, 'sents_0.6')
        sentiment_summary_df.iloc[pos, 11:] = count_sentiment(hs_df, idx, 'sents_0.7')

    return sentiment_summary_df

def count_sentiment(hs_df, idx, col):
    """Tally the sentiment classes of the sentences in one cell.

    Every sentence in ``hs_df.loc[idx][col]`` is scored with
    ``stf_nlp_sentiment``; sentences without a score are ignored.

    Args:
        hs_df: DataFrame holding the sentence collections.
        idx: Index label of the row to process.
        col: Name of the column holding the sentences.

    Returns:
        ``[num_neg, num_neut, num_pos, num_very_pos, num_very_neg]``. When
        *col* ends with ``'5'`` the row's ``'num_of_sents'`` value is put in
        front of those five counts.
    """
    row = hs_df.loc[idx]

    # Position of each score inside the result; any other non-empty score
    # (presumably '0') lands in the last, "very negative" slot.
    slot_of = {'1': 0, '2': 1, '3': 2, '4': 3}
    very_neg_slot = 4
    tallies = [0, 0, 0, 0, 0]

    for sentence in row[col]:
        score = stf_nlp_sentiment(sentence)
        if not score:
            continue
        tallies[slot_of.get(score, very_neg_slot)] += 1

    if not col.endswith('5'):
        return tallies
    return [row['num_of_sents']] + tallies

def sim_str_to_list(hs_df, col):
    """Turn the stringified lists in ``hs_df[col]`` into real lists, in place.

    A string cell is parsed by removing the surrounding square brackets,
    splitting on a quote character followed by ``", "`` and trimming
    leftover backslashes, spaces and quotes from each piece. Cells that are
    not strings (for instance lists produced by an earlier call) are kept
    as they are, so running the conversion twice is harmless.

    Args:
        hs_df: DataFrame holding the column; it is modified in place.
        col: Name of the column to convert.
    """
    # Pieces are separated by a closing quote (single or double), a comma
    # and a space.
    separator = re.compile("', |\", ")

    def parse(cell):
        if not isinstance(cell, str):
            return cell
        inner = cell.strip('[').strip(']')
        # Backslash is written as "\\": the former '\ ' spelling is an
        # invalid escape sequence and warns on current Python versions.
        return [piece.strip("\\ '").strip('"') for piece in separator.split(inner)]

    hs_df[col] = hs_df[col].map(parse)

def get_sentiment_summary(hs_df):
    """Parse the three sents columns of *hs_df*, then summarise them.

    The ``'sents_0.5'``, ``'sents_0.6'`` and ``'sents_0.7'`` columns are
    converted with ``sim_str_to_list`` (which changes *hs_df* itself) and
    the converted frame is handed to ``sentiment_summary``.

    Returns:
        Whatever ``sentiment_summary`` returns for the converted frame.
    """
    for sents_col in ('sents_0.5', 'sents_0.6', 'sents_0.7'):
        sim_str_to_list(hs_df, sents_col)

    summary = sentiment_summary(hs_df)
    return summary

In [4]:
# Compute the sentiment-count table for every loaded frame.
# Each call converts the frame's sents_0.5/0.6/0.7 columns in place and
# sends every sentence to the CoreNLP server, so the statements are kept
# separate: results already computed survive if a later call fails.
clean_score7_sents_sentiment_df = get_sentiment_summary(clean_score7_hs_df)
clean_score8_sents_sentiment_df = get_sentiment_summary(clean_score8_hs_df)
clean_score9_sents_sentiment_df = get_sentiment_summary(clean_score9_hs_df)
clean_score10_sents_sentiment_df = get_sentiment_summary(clean_score10_hs_df)

# Same computation for the loc_* frames.
loc_score7_sents_sentiment_df = get_sentiment_summary(loc_score7_hs_df)
loc_score8_sents_sentiment_df = get_sentiment_summary(loc_score8_hs_df)
loc_score9_sents_sentiment_df = get_sentiment_summary(loc_score9_hs_df)
loc_score10_sents_sentiment_df = get_sentiment_summary(loc_score10_hs_df)

In [21]:
def _attach_counts(hs_df, counts_df):
    """Place *counts_df* next to *hs_df* column-wise, without ``num_sents``."""
    side_by_side = pd.concat([hs_df, counts_df], axis=1)
    return side_by_side.drop(['num_sents'], axis=1)


# Each summary variable is rebound to its source frame plus the counts.
clean_score7_sents_sentiment_df = _attach_counts(clean_score7_hs_df, clean_score7_sents_sentiment_df)
clean_score8_sents_sentiment_df = _attach_counts(clean_score8_hs_df, clean_score8_sents_sentiment_df)
clean_score9_sents_sentiment_df = _attach_counts(clean_score9_hs_df, clean_score9_sents_sentiment_df)
clean_score10_sents_sentiment_df = _attach_counts(clean_score10_hs_df, clean_score10_sents_sentiment_df)

loc_score7_sents_sentiment_df = _attach_counts(loc_score7_hs_df, loc_score7_sents_sentiment_df)
loc_score8_sents_sentiment_df = _attach_counts(loc_score8_hs_df, loc_score8_sents_sentiment_df)
loc_score9_sents_sentiment_df = _attach_counts(loc_score9_hs_df, loc_score9_sents_sentiment_df)
loc_score10_sents_sentiment_df = _attach_counts(loc_score10_hs_df, loc_score10_sents_sentiment_df)

In [22]:
def _write_tsv(frame, path):
    """Write *frame* to *path* as a tab-separated, UTF-8 encoded file."""
    frame.to_csv(path, sep='\t', encoding='utf-8')


# Persist the combined tables under ./Data/sentiment_counts/.
_write_tsv(clean_score7_sents_sentiment_df, './Data/sentiment_counts/clean_sents_stm_df_7.csv')
_write_tsv(clean_score8_sents_sentiment_df, './Data/sentiment_counts/clean_sents_stm_df_8.csv')
_write_tsv(clean_score9_sents_sentiment_df, './Data/sentiment_counts/clean_sents_stm_df_9.csv')
_write_tsv(clean_score10_sents_sentiment_df, './Data/sentiment_counts/clean_sents_stm_df_10.csv')
_write_tsv(loc_score7_sents_sentiment_df, './Data/sentiment_counts/loc_sents_stm_df_7.csv')
_write_tsv(loc_score8_sents_sentiment_df, './Data/sentiment_counts/loc_sents_stm_df_8.csv')
_write_tsv(loc_score9_sents_sentiment_df, './Data/sentiment_counts/loc_sents_stm_df_9.csv')
_write_tsv(loc_score10_sents_sentiment_df, './Data/sentiment_counts/loc_sents_stm_df_10.csv')