## 读取数据

In [1]:
# Read source.md in full as UTF-8 text; the following cells split this
# string into individual answers.
with open("source.md", "r", encoding="utf-8") as fp:
    text = fp.read()

## 匹配答案

In [2]:
import re

# Every answer in the dump is introduced by a header line shaped like:
# #### (1) [[眼中钉](https://www.zhihu.com/people/yan-zhong-ding-98)] [赞同: 47939]
# The pattern is a raw string so that `\(`, `\)` and `\[` reach the regex
# engine untouched instead of being parsed as (invalid) string escapes.
answerPtn = re.compile(r"#### \([0-9]{1,5}\) .*? \[赞同: [0-9]{1,8}]")

# Splitting on the header yields the answer bodies. Element 0 is whatever
# precedes the first header, so it is dropped.
answers = answerPtn.split(text)[1:]

## 数据分析

### 1. 为数据标记：男/女

In [3]:
# Number of answers leaning towards each pronoun.
male, female = 0, 0
# Total occurrences of each pronoun across all answers.
male_total, female_total = 0, 0

# Walk through the answers, count pronouns and attach a label to each one.
answers_labeled = []
for i, answer in enumerate(answers):

    # Occurrences of "他" (he) and "她" (she) in this single answer.
    male_count = answer.count("他")
    female_count = answer.count("她")

    # Accumulate the corpus-wide pronoun totals.
    male_total += male_count
    female_total += female_count

    # Count the answer towards one side.
    # NOTE(review): ties (including answers containing neither pronoun) are
    # counted as female here, while the label below assigns ties to 0 (male).
    # Left unchanged so the printed statistics stay comparable -- confirm
    # which treatment of ties is intended.
    if male_count > female_count:
        male += 1
    else:
        female += 1

    # Label: 0 = male, 1 = female. The answer index comes first because
    # getRes() unpacks each entry as (i, answer, label) and stores i in the
    # database as the resume marker.
    answers_labeled.append([i, answer, int(male_count < female_count)])

print(male, female, male/(male+female))
print(male_total, female_total, male_total/(male_total+female_total))

4637 18696 0.1987314104487207
59296 200068 0.22862078006199782


### 2. 调用百度API，对内容进行情感分析

In [None]:
import time
import requests
import json
import pymysql

token = "*********************" # Register a (free) Baidu AI Cloud account to obtain your own token.
# Sentiment-classification endpoint with the access token embedded in the query string.
url = f'https://aip.baidubce.com/rpc/2.0/nlp/v1/sentiment_classify?charset=UTF-8&access_token={token}'
# NOTE(review): sleepTime is not referenced by the code visible in this cell
# (getRes sleeps for a hard-coded 0.5 s) -- confirm whether it is still needed.
sleepTime = 1

# The API is limited to 2 QPS (two requests per second), so collecting the sentiment of all 18000+ answers takes a long time.
# Results are therefore written to a database together with the answer index, so that an interrupted run can resume from the answers that have not been requested yet.

# Database: user root, password 123, database name zhihu, table name zhihu.
# Store one sentiment result together with the index of its answer.
def write_into_database(i, n, label):
    """Insert one row (i, neg, label) into table ``zhihu``.

    Args:
        i: Index of the answer the result belongs to.
        n: Negative probability reported by the sentiment API.
        label: Gender label of the answer (0 = male, 1 = female).

    A new connection is opened for every call and is always closed again,
    even when the insert or the commit fails.
    """
    connect = pymysql.connect(host="localhost", user="root", password="123", port=3306, db="zhihu")
    try:
        with connect.cursor() as cursor:
            # Let the driver bind the values instead of formatting them into
            # the SQL text: n originates from an external API response.
            cursor.execute(
                "insert into zhihu (i, neg, label) values (%s, %s, %s)",
                (i, n, label),
            )
        connect.commit()
    finally:
        connect.close()

# Work out where an interrupted run should resume.
def get_process():
    """Return the index of the first answer that still has to be requested.

    Reads the largest answer index stored in table ``zhihu``.

    Returns:
        0 when the table is empty, otherwise ``max(i) + 1``. The caller uses
        the value as a slice start, so returning ``max(i)`` itself would make
        it request -- and insert -- the last stored answer a second time.
    """
    connect = pymysql.connect(host="localhost", user="root", password="123", port=3306, db="zhihu")
    try:
        with connect.cursor() as cursor:
            cursor.execute("select max(i) from zhihu")
            # An aggregate without GROUP BY always yields exactly one row.
            row = cursor.fetchone()
    finally:
        connect.close()

    last_stored = row[0]
    if last_stored is None:
        return 0
    return last_stored + 1

# Send one answer to the Baidu sentiment API and store confidently negative results.
def getRes(answer_labeled):
    """Request the sentiment of one labeled answer and persist it if it qualifies.

    Args:
        answer_labeled: Sequence ``(i, answer, label)`` -- answer index,
            answer text and gender label.

    Returns:
        None. A row is written through ``write_into_database`` only when the
        API reports confidence >= 0.9 and negative probability >= 0.8.
        Answers shorter than 20 characters are skipped without a request.
    """
    i, answer, label = answer_labeled

    # Too short to be worth a request.
    if len(answer) < 20:
        return
    # Overlong answers are cut down before being sent.
    if len(answer) > 1000:
        answer = answer[:999]

    # Pause before every request to stay below the API rate limit.
    time.sleep(0.5)

    payload = json.dumps({'text': answer})
    try:
        resp = requests.post(url, data=payload)
    except requests.RequestException as err:
        # No response object exists on this path, so report the error itself.
        print(f"{i} {err}")
        return

    try:
        res = json.loads(resp.text)["items"][0]
    except (ValueError, KeyError, IndexError, TypeError):
        # Body is not JSON or has no "items" entry (e.g. an API error reply).
        print(f"{i} {resp.text}")
        return

    n, c = res["negative_prob"], res["confidence"]

    # Keep only confident, clearly negative results.
    if c < 0.9 or n < 0.8:
        return

    # Persist the result together with the answer index.
    print(i, n, label)
    write_into_database(i, n, label)

# Entry point: continue the sentiment requests from where earlier runs stopped.
if __name__ == "__main__":
    # get_process() reads the progress marker from the database.
    current = get_process()
    print(f"从第{current}个开始。")
    # Skip every answer before the marker and process the rest in order.
    for answer_labeled in answers_labeled[current:]:
        getRes(answer_labeled)
        
# 获取数据完成后，将在Ana_Sentiment.ipynb文件中进行数据可视化

0.999995 1
0.999855 1
0.999181 0
0.999472 1
0.991788 0
0.999992 1
0.998083 0
0.999581 0
0.99401 1
0.966849 1
0.988343 1
0.999921 0
0.996428 1
0.996699 0
0.99926 0
0.991598 1
0.956613 1
0.998797 1
0.998011 1
0.999771 0
0.99952 1
0.971047 0
0.999886 1
0.999974 1
0.988913 1
0.986465 1
0.99007 0
0.999718 1
0.999938 0
0.997017 0
0.994897 0
0.996841 1
0.974761 1
0.991397 1
0.999976 1
0.999436 1
0.999698 1
0.999636 1
0.96314 0
0.998583 1
0.999996 1
0.99979 0
0.993715 1
0.997079 0
0.978055 1
0.991217 0
0.999937 0
0.958184 0
0.981572 1
0.999329 1
0.999991 0
0.999611 1
0.994186 0
0.969023 0
0.985629 1
0.995664 1
0.999648 0
0.997913 0
0.957445 1
0.998426 0
0.998547 0
0.97238 1
0.999869 1
0.99983 1
0.997164 1
0.999994 1
0.999722 1
0.959826 0
0.9999190.998699  0
1
0.999953 1
0.999997 1
0.999677 1
0.99747 1
