In [None]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from __future__ import division

import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import pandas as pd
import math
import numpy as np
from datetime import datetime
import csv
import ast
from datetime import timedelta
from pyhanlp import *
import re
from preprocess import load_stopwords
from getSentimentFromTencent import get_content
from gen_21dim_vector import get_emotion_value
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

In [None]:
"""Load Hand-Crafted Features"""
def load_features_name(path="../model/manual_feature.txt"):
    """Load the names of the hand-crafted features.

    Reads ``path`` line by line and returns every non-empty line with
    surrounding whitespace stripped, in file order.

    Args:
        path: Text file holding one feature name per line. Defaults to the
            location this notebook has always read from.

    Returns:
        list of str: The feature names.

    Raises:
        IOError / OSError: If ``path`` cannot be opened.
    """
    # `with` guarantees the handle is closed even if reading fails part-way.
    with open(path, "r") as f:
        stripped = (line.strip() for line in f)
        # Skip blank lines (e.g. a trailing newline at the end of the file):
        # they would otherwise turn into a feature literally named ''.
        return [name for name in stripped if name]

## Word Segmentation

In [None]:
def hanlp_segment(all_data, segmenter=None):
    """Segment every post and attach the tokens and their POS tags.

    Three columns are written to ``all_data`` (the frame is modified in place
    and also returned):

    * ``content`` - ``weibo_content`` with missing values replaced by the
      placeholder ``'_##_'``;
    * ``seg``     - the tokens of the cleaned text, joined by single spaces;
    * ``pos``     - ``str(term.nature)`` of each token, joined by single spaces.

    Before segmentation, all ``' '`` and ``'/'`` characters are removed from
    the text.

    Args:
        all_data: DataFrame with a ``weibo_content`` column.
        segmenter: Optional callable taking a string and returning an iterable
            of terms exposing ``.word`` and ``.nature``. Defaults to
            ``HanLP.segment``.

    Returns:
        The same DataFrame object, with the three columns above added.
    """
    if segmenter is None:
        segmenter = HanLP.segment

    all_data['content'] = all_data['weibo_content'].fillna('_##_')

    seg_column = []
    pos_column = []
    # Iterate over the *filled* column. The previous version read the raw
    # 'weibo_content' column, so a single missing post raised inside a bare
    # `except` whose `break` silently left every later row unsegmented.
    for content in all_data['content']:
        # str() keeps non-string cells (e.g. a post parsed as a number) from
        # failing on .replace.
        cleaned = str(content).replace(' ', '').replace('/', '')
        terms = list(segmenter(cleaned))
        seg_column.append(" ".join(term.word for term in terms))
        pos_column.append(" ".join(str(term.nature) for term in terms))

    # Assign whole columns at once instead of one .loc write per cell.
    all_data['seg'] = seg_column
    all_data['pos'] = pos_column

    return all_data

In [None]:
def cal_faces(content):
    """Return the emoji count of ``content``.

    Every character outside the Basic Multilingual Plane (U+10000 to
    U+10FFFF) is counted as one emoji. If the regex engine rejects that
    character class, a surrogate-pair pattern is used instead.
    """
    try:
        astral = re.compile(u'[\U00010000-\U0010ffff]')
    except re.error:
        astral = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
    return sum(1 for _ in astral.finditer(content))

def cal_urls(content):
    """Return 1 if ``content`` contains the substring 'http', otherwise 0.

    The result is a presence flag, not a count of URLs.
    """
    found = re.search(u'http', content)
    if found is None:
        return 0
    return 1

In [None]:
def feature(all_data):
    """Compute the low-level hand-crafted text features for every post.

    For each row the function reads ``weibo_content`` (raw text), ``seg``
    (space-joined tokens) and ``pos`` (space-joined POS tags), counts tag- and
    lexicon-based features, derives a few length / readability ratios and
    writes all of them back as columns.

    Args:
        all_data: DataFrame with ``weibo_content``, ``seg`` and ``pos``
            columns (as produced by ``hanlp_segment``).

    Returns:
        A new DataFrame: ``all_data`` plus one column per feature, with the
        index named ``'weiboId'``.

    Raises:
        KeyError: If a counter incremented below (e.g. ``'numerals'``) is not
            among the names returned by ``load_features_name()``.
    """
    # --- lexicons and POS tag groups -------------------------------------
    first_pronoun = ['我', '我们']
    second_pronoun = ['你', '你们']
    third_pronoun = ['他', '他们', '她', '她们', '它', '它们']
    adversative = ['但', '但是', '然而', '却', '而', '偏偏', '只是', '不过', '至于', '致', '不料', '岂知', '可是']
    function_pos = ['d', 'p', 'c', 'u', 'e', 'o']
    question_mark = ['？', '?']
    exclamation_mark = ['！', '!']
    broken_point = ['，', '；', '、', '：', ',', ';', ':']
    sentence_end = ['】', '。', '？', '！', '...', '.', '?', '!', ']']
    modal_particle = ['难道', '决', '岂', '反正', '也许', '大约', '大概', '果然', '居然', '竟然', '究竟', '幸而', '幸亏', '偏偏', '明明', '恰恰', '未免', '只好', '不妨', '索性', '简直', '就', '可', '难怪', '反倒', '何尝', '何必']
    adv_of_degree = ['很', '非常', '极', '十分', '最', '顶', '太', '更', '挺', '极其', '格外', '分外', '更加', '越', '越发', '有点儿', '稍', '稍微', '略微', '几乎', '过于', '尤其']
    official_speech = ['通报', '称']
    uncertain_words = ['可能', '也许', '似乎', '大概', '或许']
    forward_reference = ['他', '他们', '她', '她们', '它', '它们', '那', '那些', '这', '这些']
    professional_words_pos = ['g', 'gb', 'gbc', 'gc', 'gg', 'gi', 'gm', 'gp']

    high_features = ['interactivity', 'interestingness', 'moving', 'persuasive', 'logic', 'readability', 'formality', 'Integrity1']
    # De-duplicate while keeping file order, so the column order of the result
    # does not depend on set iteration order.
    text_features = [name for name in dict.fromkeys(load_features_name())
                     if name not in high_features]

    new_data = pd.concat([all_data, pd.DataFrame(columns=text_features)])
    new_data.index.name = 'weiboId'
    for weiboId in all_data.index:

        feature_temp = dict.fromkeys(text_features, 0)
        feature_temp['sentences'] = 1  # a post is at least one sentence

        # Missing / non-string cells are treated as empty text instead of
        # crashing the regex helpers or .split below.
        original_content = new_data.loc[weiboId, 'weibo_content']
        original_content = '' if pd.isna(original_content) else str(original_content)
        feature_temp['face_num'] = cal_faces(original_content)
        feature_temp['hasUrl'] = cal_urls(original_content)

        segs = new_data.loc[weiboId, 'seg']
        poses = new_data.loc[weiboId, 'pos']
        segs = ('' if pd.isna(segs) else str(segs)).split(' ')
        poses = ('' if pd.isna(poses) else str(poses)).split(' ')

        feature_temp['sentiment_score'] = get_emotion_value(segs)

        # --- POS-tag based counts ----------------------------------------
        # The first chain is mutually exclusive and order-sensitive: tags that
        # are exactly 'd' or 'p' are taken by the function-word branch and so
        # never reach the adverb / preposition branches further down.
        for pos in poses:
            if pos == 'm':
                feature_temp['numerals'] += 1
            elif pos in function_pos:
                feature_temp['function_words'] += 1
            elif pos == 't':
                feature_temp['time_num'] += 1
            elif 'ns' in pos:
                feature_temp['place_num'] += 1
            elif 'nr' in pos:
                feature_temp['object_num'] += 1
            elif pos == 'a':
                feature_temp['adj_num'] += 1
            elif pos == 'y':
                feature_temp['modal_particle_num'] += 1
            elif pos == 'cc':
                feature_temp['conj_num'] += 1
            elif 'ry' in pos:
                feature_temp['Interrogative_pron_num'] += 1
            elif pos == 'i':
                feature_temp['idiom_num'] += 1
            elif pos.startswith('p'):
                feature_temp['prep_num'] += 1
            elif pos.startswith('v'):
                feature_temp['verb_num'] += 1
            elif pos.startswith('d'):
                feature_temp['adv_num'] += 1
            # These three are independent of the chain above.
            if pos in professional_words_pos:
                feature_temp['professional_words_num'] += 1
            if pos.startswith('n'):
                feature_temp['noun_num'] += 1
            if pos.startswith('r'):
                feature_temp['pron_num'] += 1

        characters = 0
        words = 0
        broken_nums = 0
        LW = 0  # tokens longer than two characters

        # --- token / lexicon based counts --------------------------------
        # Again the elif chain is order-sensitive: a token listed in several
        # lexicons (e.g. '偏偏') is only counted for the first matching one.
        for word in segs:
            characters = characters + len(word)
            words = words + 1
            if word == '#':
                feature_temp['tags'] += 1
                feature_temp['hasTag'] = 1
            elif word == '@':
                feature_temp['@'] += 1
                feature_temp['hasAt'] = 1
            elif word in exclamation_mark:
                feature_temp['exclamation_mark_num'] += 1
            elif word in question_mark:
                feature_temp['question_mark_num'] += 1
            elif word in first_pronoun:
                feature_temp['first_pronoun_num'] += 1
            elif word in second_pronoun:
                feature_temp['second_pronoun_num'] += 1
            elif word in third_pronoun:
                feature_temp['third_pronoun_num'] += 1
            elif word in adversative:
                feature_temp['adversative_num'] += 1
            elif word in modal_particle:
                feature_temp['modal_particle_num'] += 1
            elif word == '”' or word == '“':
                feature_temp['rhetoric_num'] += 1
            elif word in adv_of_degree:
                feature_temp['adv_of_degree_num'] += 1
            elif word in official_speech:
                feature_temp['official_speech_num'] += 1
            elif word in uncertain_words:
                feature_temp['uncertainty'] += 1
            elif word == '【':
                feature_temp['hasHead'] = 1
            if word in forward_reference:
                feature_temp['forward_reference_num'] += 1
            if len(word) > 2:
                LW += 1
            if word in sentence_end:
                feature_temp['sentences'] += 1
            elif word in broken_point:
                broken_nums += 1

        # --- derived ratios ----------------------------------------------
        # `words` is >= 1 (str.split always yields at least one item) and
        # `sentences` starts at 1, so none of the divisions can hit zero.
        feature_temp['LW'] = LW
        feature_temp['RIX'] = LW / feature_temp['sentences']
        feature_temp['LIX'] = words / feature_temp['sentences'] + (100 * LW) / words

        # Opening and closing quotes were counted separately; halve to pairs.
        feature_temp['rhetoric_num'] = feature_temp['rhetoric_num'] / 2
        feature_temp['sub_sentences'] = broken_nums + feature_temp['sentences']
        feature_temp['sentence_broken'] = feature_temp['sub_sentences'] / feature_temp['sentences']
        feature_temp['characters'] = characters
        feature_temp['words'] = words
        feature_temp['average_word_length'] = characters / words

        for fea in feature_temp.keys():
            new_data.loc[weiboId, fea] = feature_temp[fea]
        # Was `weiboIdId`, an undefined name that raised NameError on the
        # first processed row.
        print(weiboId, 'basic feature finished')

    return new_data

In [None]:
def scale(all_data, text_features):
    """Standardize the given columns with ``sklearn.preprocessing.scale``.

    The columns named in ``text_features`` are passed to
    ``preprocessing.scale`` with its default arguments and overwritten with
    the result. ``all_data`` is modified in place and returned.
    """
    standardized = preprocessing.scale(all_data[text_features])
    all_data[text_features] = standardized
    return all_data

def normlization(all_data, text_features):
    """L2-normalize the given columns with ``sklearn.preprocessing.normalize``.

    The columns named in ``text_features`` are passed to
    ``preprocessing.normalize(..., norm='l2')`` and overwritten with the
    result. ``all_data`` is modified in place and returned.
    """
    selected = all_data[text_features]
    all_data[text_features] = preprocessing.normalize(selected, norm='l2')
    return all_data

In [None]:
def cal_high_features(all_data):
    """Calculate the eight high-level features from the low-level ones.

    Each high-level feature is a signed sum of existing columns:

    * ``interactivity``   - question marks, 1st/2nd person and interrogative pronouns
    * ``interestingness`` - quotes, exclamation marks, emoji, adjectives, idioms, adversatives
    * ``moving``          - sentiment score, 1st/2nd person pronouns, ``!``/``?``, degree adverbs, modal particles
    * ``persuasive``      - numerals, ``@``, official speech, time/place/object mentions, minus uncertainty
    * ``logic``           - forward references and conjunctions
    * ``readability``     - the negated sum of the length / complexity measures
    * ``formality``       - nouns + adjectives + prepositions, minus pronouns, verbs, adverbs and sentence_broken
    * ``Integrity1``      - 2*hasHead + 2*hasTag + hasAt + hasUrl

    The sums are computed column-wise (one vectorised expression per feature)
    rather than cell by cell; operands are added in the same order as before.

    Args:
        all_data: DataFrame containing every low-level column referenced below.

    Returns:
        A copy of ``all_data`` with the eight columns added (or overwritten);
        the input frame is left untouched.

    Raises:
        KeyError: If a required low-level column is missing.
    """
    data = all_data.copy()

    data['interactivity'] = (
        data['question_mark_num'] + data['first_pronoun_num']
        + data['second_pronoun_num'] + data['Interrogative_pron_num']
    )
    data['interestingness'] = (
        data['rhetoric_num'] + data['exclamation_mark_num'] + data['face_num']
        + data['adj_num'] + data['idiom_num'] + data['adversative_num']
    )
    data['moving'] = (
        data['sentiment_score'] + data['first_pronoun_num'] + data['second_pronoun_num']
        + data['exclamation_mark_num'] + data['question_mark_num']
        + data['adv_of_degree_num'] + data['modal_particle_num']
    )
    data['persuasive'] = (
        data['numerals'] + data['@'] + data['official_speech_num'] + data['time_num']
        + data['place_num'] + data['object_num'] - data['uncertainty']
    )
    data['logic'] = data['forward_reference_num'] + data['conj_num']
    data['readability'] = -(
        data['sentence_broken'] + data['characters'] + data['words']
        + data['average_word_length'] + data['sentences'] + data['sub_sentences']
        + data['professional_words_num'] + data['RIX'] + data['LIX'] + data['LW']
    )
    data['formality'] = (
        data['noun_num'] + data['adj_num'] + data['prep_num']
        - data['pron_num'] - data['verb_num'] - data['adv_num'] - data['sentence_broken']
    )
    data['Integrity1'] = (
        2 * data['hasHead'] + 2 * data['hasTag'] + data['hasAt'] + data['hasUrl']
    )

    return data

## Load Data

In [None]:
# Name of the dataset split to process; it is embedded in the input CSV path
# here and in the output CSV paths written by the later cells.
data_type = 'val'
data = pd.read_csv('../data/ictmcg_' + data_type + '.csv', header=0)
# Quick sanity check: number of rows loaded.
print(len(data))

In [None]:
# Segmentation: hanlp_segment writes its columns into `data` itself and
# returns that same frame, so `data_new` and `data` refer to one object.
data_new = hanlp_segment(data)

In [None]:
# Compute the hand-crafted low-level features, then save them to the
# '<split>_NQ.csv' file before any scaling is applied.
data_new = feature(data_new)
data_new.to_csv('../data/ictmcg_' + data_type + '_NQ.csv', index=None, sep=',', encoding='utf-8')

In [None]:
# Normalization of the low-level features: every name in the feature list
# that is not one of the eight high-level features gets scaled.
high_features = ['interactivity', 'interestingness', 'moving', 'persuasive', 'logic', 'readability', 'formality', 'Integrity1']
text_features = list(set(load_features_name()) - set(high_features))

data_new = scale(data_new, text_features)

In [None]:
# Calculate high level features (built from the already-scaled low-level columns)
data_new = cal_high_features(data_new)

# Normalization of the high-level columns themselves; `high_features` is the
# list defined in the previous cell.
data_new = scale(data_new, high_features)

In [None]:
# Save the scaled low-level and high-level features to the '<split>_NQ_scale.csv' file.
data_new.to_csv('../data/ictmcg_' + data_type + '_NQ_scale.csv', index=None, sep=',', encoding='utf-8')