In [None]:
import pandas as pd
import re

In [None]:
# Load the text/label CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("data/text_label_data.csv")
# Preview the first rows as the cell output.
data.head()

#### Merge Label

In [None]:
# Flag rows whose label is one of the two compound tags, or is missing.
unwanted_labels = ['angry#non', 'non#sad']
label_col = data['label']
mask = label_col.isin(unwanted_labels) | label_col.isna()

# Keep only the rows the mask does not flag.
data = data.loc[~mask]

In [None]:
# Summarise the filtered frame: index, columns, non-null counts, dtypes and memory use.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3588 entries, 0 to 3590
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      3588 non-null   int64 
 1   text    3588 non-null   object
 2   label   3588 non-null   object
 3   text2   3588 non-null   object
 4   clean   3588 non-null   object
dtypes: int64(1), object(4)
memory usage: 168.2+ KB


In [None]:
from collections import Counter
# Tally how many rows carry each value of the `label` column.
Counter(data["label"])

Counter({'sad': 1417,
         'happy': 855,
         'angry': 315,
         'non': 794,
         'fear': 110,
         'surprise': 38,
         'disgust': 56})

### Feature Extraction

#### Sentiment

In [None]:
# sentiment
# Load the lexicon table (comma-separated).
LIWC = pd.read_csv("LIWC.csv", sep=',')

# Category codes to look up in the lexicon.
# NOTE(review): 31/32 are presumably the LIWC positive-/negative-emotion codes - confirm
# against the dictionary header.
pos = [31]
neg = [32]

# Keep every lexicon row in which at least one cell equals one of the requested codes.
# `axis=1` is passed by keyword because pandas 2.x no longer accepts it positionally.
# Passing the whole code list to `isin` also means extra codes added to `pos`/`neg`
# are all honoured, instead of only the last one surviving a loop.
pos_df = LIWC[LIWC.isin(pos).any(axis=1)].reset_index(drop=True)
neg_df = LIWC[LIWC.isin(neg).any(axis=1)].reset_index(drop=True)

In [None]:
# Entries of the '%' column for the positive lexicon rows, in row order.
# NOTE(review): '%' is presumably the word column of the LIWC file - confirm.
# Built straight from the column so that no loop variable overwrites the `pos`
# code list defined in the lexicon cell.
posemo_list = list(pos_df['%'])

In [None]:
# Entries of the '%' column for the negative lexicon rows, in row order.
# NOTE(review): '%' is presumably the word column of the LIWC file - confirm.
# Built straight from the column so that no loop variable overwrites the `neg`
# code list defined in the lexicon cell.
negemo_list = list(neg_df['%'])

In [None]:
# Per-document counts of tokens found in the positive / negative lexicon lists.
positive = []
negative = []
neutral = []  # never populated; kept only so the name stays defined

# Set copies give constant-time membership tests; the lists themselves are not modified.
posemo_lookup = set(posemo_list)
negemo_lookup = set(negemo_list)

for text in data["clean"]:
    # Tokenise once (single-space split) and reuse the tokens for both tallies.
    tokens = str(text).split(" ")
    positive.append(sum(1 for tok in tokens if tok in posemo_lookup))
    negative.append(sum(1 for tok in tokens if tok in negemo_lookup))

#### Negation

In [None]:
# Per document: number of matches of the negation pattern in the cleaned text.
negation_pattern = re.compile(r"^[沒無不]|\s[沒無不]| *並*未 | 並*未 *")
negation = [len(negation_pattern.findall(str(doc))) for doc in data["clean"]]

#### Length

In [None]:
# Per document: number of pieces produced by splitting the cleaned text on single spaces.
length = [len(str(doc).split(' ')) for doc in data["clean"]]

#### Hypothetical Words

In [None]:
# Per document: number of occurrences of any of the listed conditional markers.
hyper_pattern = re.compile(r"萬一|如果|[假倘]若|假如|要是|真要")
hyper = [len(hyper_pattern.findall(str(doc))) for doc in data["clean"]]

#### Escape Word

In [None]:
# Per document: number of occurrences of the character 逃 anywhere in the cleaned text.
escape_pattern = re.compile(r"逃")
escape = [len(escape_pattern.findall(str(doc))) for doc in data["clean"]]

#### Entropy

In [None]:
from math import log
from collections import Counter
def shannon(list):
    """Return the Shannon entropy, in bits, of the items in `list`.

    Args:
        list: A sequence of hashable items, or an already-built mapping of
            item -> count (for example a ``Counter``). The parameter name
            shadows the builtin; it is kept so keyword callers keep working.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct item, or 0 when there is nothing to count.
    """
    counts = Counter(list)
    # Normalise by the total number of occurrences, not len(list): for a
    # mapping, len() is the number of distinct keys, which would make the
    # "probabilities" sum to more than 1 and inflate the result.
    total = sum(counts.values())
    if total == 0:
        return 0
    # Skip non-positive counts (possible in a hand-built Counter); log(0) is undefined.
    return -sum((c / total) * log(c / total, 2) for c in counts.values() if c > 0)

In [None]:
# Shannon entropy (bits) of each document's token distribution.
# shannon() expects the raw token sequence, so the split list is passed as-is
# rather than a pre-built Counter (whose len() is the number of distinct
# tokens, not the total number of tokens).
# str() guards against non-string cells, as the other feature cells do.
entropy = [shannon(str(doc).split(" ")) for doc in data["clean"]]

#### Pronoun

In [None]:
# Per document: number of whitespace-delimited tokens that are exactly 我.
# The lookarounds require whitespace or a string edge on both sides, so 我們 is
# no longer counted as 我, and a text consisting only of the pronoun is counted.
first_single_pattern = re.compile(r"(?<!\S)我(?!\S)")
first_single = [len(first_single_pattern.findall(str(doc))) for doc in data["clean"]]

In [None]:
# Per document: number of whitespace-delimited tokens that are exactly 我們.
# The lookarounds require whitespace or a string edge on both sides, so longer
# tokens that merely start or end with 我們 are not counted.
first_plural_pattern = re.compile(r"(?<!\S)我們(?!\S)")
first_plural = [len(first_plural_pattern.findall(str(doc))) for doc in data["clean"]]

In [None]:
# Per document: number of whitespace-delimited tokens that are exactly 你 or 妳.
# The lookarounds require whitespace or a string edge on both sides, so 你們 / 妳們
# are no longer counted as singular pronouns.
second_single_pattern = re.compile(r"(?<!\S)[你妳](?!\S)")
second_single = [len(second_single_pattern.findall(str(doc))) for doc in data["clean"]]

In [None]:
# Per document: number of whitespace-delimited tokens that are exactly 祢.
# The lookarounds require whitespace or a string edge on both sides, so longer
# tokens that merely start or end with 祢 are not counted.
second_single_god_pattern = re.compile(r"(?<!\S)祢(?!\S)")
second_single_god = [len(second_single_god_pattern.findall(str(doc))) for doc in data["clean"]]

In [None]:
# Per document: number of whitespace-delimited tokens that are exactly 你們 or 妳們.
# The lookarounds require whitespace or a string edge on both sides, so longer
# tokens that merely start or end with the pronoun are not counted.
second_plural_pattern = re.compile(r"(?<!\S)[你妳]們(?!\S)")
second_plural = [len(second_plural_pattern.findall(str(doc))) for doc in data["clean"]]

In [None]:
# Per document: number of whitespace-delimited tokens that are exactly 他, 她, 它 or 牠.
# The lookarounds require whitespace or a string edge on both sides, so the plural
# forms (他們 etc.) are no longer counted as singular pronouns.
third_single_pattern = re.compile(r"(?<!\S)[他她它牠](?!\S)")
third_single = [len(third_single_pattern.findall(str(doc))) for doc in data["clean"]]

In [None]:
# Per document: number of whitespace-delimited tokens that are exactly 他們, 她們, 它們 or 牠們.
# The lookarounds require whitespace or a string edge on both sides, so longer
# tokens that merely start or end with the pronoun are not counted.
third_plural_pattern = re.compile(r"(?<!\S)[他她它牠]們(?!\S)")
third_plural = [len(third_plural_pattern.findall(str(doc))) for doc in data["clean"]]

#### Code Switch

In [None]:
# Per document: number of runs of ASCII letters in the cleaned text.
# The class is [A-Za-z], not [A-z]: the range A-z also covers the ASCII
# punctuation that sits between 'Z' and 'a' ([ \ ] ^ _ `), which would be
# counted as letters.
latin_run_pattern = re.compile(r"[A-Za-z]+")
code_switch = [len(latin_run_pattern.findall(str(doc))) for doc in data["clean"]]

### 統整

In [None]:
# Gather every per-document feature list under its output column name.
# Keyword order is preserved, so the columns come out in the order written here.
feature_dict = dict(
    positive=positive,
    negative=negative,
    negation=negation,
    hyper=hyper,
    escape=escape,
    length=length,
    entropy=entropy,
    first_single=first_single,
    first_plural=first_plural,
    second_single=second_single,
    second_single_god=second_single_god,
    second_plural=second_plural,
    third_single=third_single,
    third_plural=third_plural,
    code_switch=code_switch,
)

# One row per document, one column per feature; preview the first rows.
feature_df = pd.DataFrame(data=feature_dict)
feature_df.head()

Unnamed: 0,positive,negative,negation,hyper,escape,length,entropy,first_single,first_plural,second_single,second_single_god,second_plural,third_single,third_plural,code_switch
0,5,13,8,1,0,174,9.194461,0,0,1,0,0,0,0,0
1,8,3,4,0,0,135,8.16986,7,0,11,0,0,0,0,0
2,3,1,1,0,0,109,7.943818,10,0,8,0,0,1,0,2
3,2,7,18,0,0,203,13.000401,12,0,19,0,0,0,0,0
4,38,7,7,2,0,341,14.63102,25,6,11,0,0,0,0,1


In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scaler with its default settings.
scaler = MinMaxScaler()

# fit_transform returns a bare array, so wrap it back into a DataFrame.
# The column labels are taken from the input frame instead of a second hand-typed
# list, so the header cannot drift out of sync when a feature is added or reordered.
feature_scaled = pd.DataFrame(scaler.fit_transform(feature_df), columns=feature_df.columns)

In [None]:
# Display the scaled feature table as the cell output.
feature_scaled

Unnamed: 0,positive,negative,negation,hyper,escape,length,entropy,first_single,first_plural,second_single,second_single_god,second_plural,third_single,third_plural,code_switch
0,0.070423,0.209677,0.135593,0.05,0.0,0.221364,0.454060,0.000000,0.0,0.009091,0.0,0.0,0.000000,0.0,0.000000
1,0.112676,0.048387,0.067797,0.00,0.0,0.171171,0.412501,0.073684,0.0,0.100000,0.0,0.0,0.000000,0.0,0.000000
2,0.042254,0.016129,0.016949,0.00,0.0,0.137709,0.403333,0.105263,0.0,0.072727,0.0,0.0,0.013889,0.0,0.005510
3,0.028169,0.112903,0.305085,0.00,0.0,0.258687,0.608433,0.126316,0.0,0.172727,0.0,0.0,0.000000,0.0,0.000000
4,0.535211,0.112903,0.118644,0.10,0.0,0.436293,0.674573,0.263158,0.3,0.100000,0.0,0.0,0.000000,0.0,0.002755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3580,0.098592,0.177419,0.271186,0.00,0.0,0.238095,0.526221,0.105263,0.0,0.154545,0.0,0.0,0.041667,0.0,0.000000
3581,0.084507,0.000000,0.084746,0.00,0.0,0.230373,0.589003,0.021053,0.0,0.036364,0.0,0.0,0.000000,0.0,0.000000
3582,0.070423,0.048387,0.050847,0.00,0.0,0.114543,0.384123,0.042105,0.1,0.027273,0.0,0.0,0.000000,0.0,0.000000
3583,0.084507,0.306452,0.220339,0.20,0.0,0.364221,0.621845,0.347368,0.1,0.127273,0.0,0.0,0.125000,0.0,0.000000


In [None]:
# Write the scaled features to features.csv in the working directory, without the index column.
feature_scaled.to_csv("features.csv", index=False)