In [1]:
import pandas as pd
import numpy as np
import random

In [45]:
train_raw = pd.read_csv("train_dataset_v2.tsv", sep='\t')
test_raw = pd.read_csv("test_dataset.tsv", sep='\t')
train_raw.head(10)

Unnamed: 0,id,content,character,emotions
0,1171_0001_A_1,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,o2,0
1,1171_0001_A_2,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,c1,0
2,1171_0001_A_3,o2一手拿着一个行李，一路小跑着把c1带到了文工团门口。,o2,0
3,1171_0001_A_4,o2一手拿着一个行李，一路小跑着把c1带到了文工团门口。,c1,0
4,1171_0001_A_5,o2停下来接过c1手里的行李：你妈妈交待我了，等领了军装一定要照张相寄回去，让街坊邻居都知道...,o2,0
5,1171_0001_A_6,o2停下来接过c1手里的行李：你妈妈交待我了，等领了军装一定要照张相寄回去，让街坊邻居都知道...,c1,0
6,1171_0001_A_7,c1开心地点了点头。,c1,10000
7,1171_0001_A_8,o2凑近c1小声：办入伍证审的时候，派出所的民警跟我说，你的亲生父亲还在劳改，但是你跟他划清...,o2,0
8,1171_0001_A_9,o2凑近c1小声：办入伍证审的时候，派出所的民警跟我说，你的亲生父亲还在劳改，但是你跟他划清...,c1,0
9,1171_0001_A_10,c1再次微笑着点头，然后举手敬礼，但是手的形状却是弯的。,c1,10000


In [46]:
def get_script_scene(train_raw):
    train_raw["script"] = train_raw["id"].str.split("_").str[0].astype(int)
    train_raw["scene"] = train_raw["id"].str.split("_").str[1].astype(int)
    train_raw["sent_id"] = train_raw["id"].str.split("_").str[3].astype(int)
    train_raw["script_scene"] = train_raw["id"].str.split("_").str[0] + "_" + train_raw["id"].str.split("_").str[1] + "_" + train_raw["id"].str.split("_").str[2]
    train_raw.sort_values(by=["script", "scene", "sent_id"], inplace=True)
    train_raw.reset_index(inplace=True)
    return train_raw

def get_content_map(train_raw):
    script_scene_content_dict = dict()
    script_scene_id_dict = dict()
    for script_scene, item in train_raw.groupby(by=["script_scene"]):
        sent_list = item["content"].unique().tolist()
        sent_id_dict = {sent: i for i, sent in enumerate(sent_list)}
        script_scene_content_dict[script_scene] = sent_list
        script_scene_id_dict[script_scene] = sent_id_dict
    return script_scene_content_dict, script_scene_id_dict

def get_content_pre(train_raw):
    index_list = list()
    pre_content_list = list()
    pre_pre_content_list = list()
    script_scene_content_dict, script_scene_id_dict = get_content_map(train_raw)
    for script_scene, group_item in train_raw.groupby(by=["script_scene"]):
        sent_list = script_scene_content_dict[script_scene]
        sent_id_dict = script_scene_id_dict[script_scene]
        for i in group_item.index:
            sent_id = group_item.loc[i, "sent_id"]
            content = group_item.loc[i, "content"]

            sent_unid = sent_id_dict[content]

            if sent_unid < 1:
                pre_pre_content = "[START]"
                pre_content = "[START]"
            elif sent_unid < 2:
                pre_pre_content = "[START]"
                pre_content = sent_list[sent_unid-1]
            else:
                pre_pre_content = sent_list[sent_unid-2]
                pre_content = sent_list[sent_unid-1]

            index_list.append(i)
            pre_content_list.append(pre_content)
            pre_pre_content_list.append(pre_pre_content)
    train_raw.loc[index_list, "pre_content"] = pre_content_list
    train_raw.loc[index_list, "pre_pre_content"] = pre_pre_content_list
    return train_raw

In [47]:
train_raw = get_script_scene(train_raw)
train_raw = get_content_pre(train_raw)

In [48]:
train_raw = train_raw[["id", "script_scene", "content", "character", "emotions", "pre_content", "pre_pre_content"]]
train_raw.to_csv("train_dataset_scene_v1.tsv", sep='\t', index=False)
train_raw.head(10)

Unnamed: 0,id,script_scene,content,character,emotions,pre_content,pre_pre_content
0,1171_0001_A_1,1171_0001_A,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,o2,0,[START],[START]
1,1171_0001_A_2,1171_0001_A,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,c1,0,[START],[START]
2,1171_0001_A_3,1171_0001_A,o2一手拿着一个行李，一路小跑着把c1带到了文工团门口。,o2,0,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,[START]
3,1171_0001_A_4,1171_0001_A,o2一手拿着一个行李，一路小跑着把c1带到了文工团门口。,c1,0,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,[START]
4,1171_0001_A_5,1171_0001_A,o2停下来接过c1手里的行李：你妈妈交待我了，等领了军装一定要照张相寄回去，让街坊邻居都知道...,o2,0,o2一手拿着一个行李，一路小跑着把c1带到了文工团门口。,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。
5,1171_0001_A_6,1171_0001_A,o2停下来接过c1手里的行李：你妈妈交待我了，等领了军装一定要照张相寄回去，让街坊邻居都知道...,c1,0,o2一手拿着一个行李，一路小跑着把c1带到了文工团门口。,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。
6,1171_0001_A_7,1171_0001_A,c1开心地点了点头。,c1,10000,o2停下来接过c1手里的行李：你妈妈交待我了，等领了军装一定要照张相寄回去，让街坊邻居都知道...,o2一手拿着一个行李，一路小跑着把c1带到了文工团门口。
7,1171_0001_A_8,1171_0001_A,o2凑近c1小声：办入伍证审的时候，派出所的民警跟我说，你的亲生父亲还在劳改，但是你跟他划清...,o2,0,c1开心地点了点头。,o2停下来接过c1手里的行李：你妈妈交待我了，等领了军装一定要照张相寄回去，让街坊邻居都知道...
8,1171_0001_A_9,1171_0001_A,o2凑近c1小声：办入伍证审的时候，派出所的民警跟我说，你的亲生父亲还在劳改，但是你跟他划清...,c1,0,c1开心地点了点头。,o2停下来接过c1手里的行李：你妈妈交待我了，等领了军装一定要照张相寄回去，让街坊邻居都知道...
9,1171_0001_A_10,1171_0001_A,c1再次微笑着点头，然后举手敬礼，但是手的形状却是弯的。,c1,10000,o2凑近c1小声：办入伍证审的时候，派出所的民警跟我说，你的亲生父亲还在劳改，但是你跟他划清...,c1开心地点了点头。


In [55]:
train_raw = train_raw[~train_raw["emotions"].isna()].reset_index(drop=True)
indexes = train_raw.index.tolist()
random.seed(10)
random.shuffle(indexes)
train_inds = indexes[:30000]
valid_inds = indexes[30000:]
train_raw.loc[train_inds].to_csv("train_dataset_scene_v1_train.csv", index=False)
train_raw.loc[valid_inds].to_csv("train_dataset_scene_v1_valid.csv", index=False)

In [61]:
test_raw = get_script_scene(test_raw)
test_raw = get_content_pre(test_raw)
test_raw = test_raw[["id", "script_scene", "content", "character", "pre_content", "pre_pre_content"]]
test_raw.to_csv("test_dataset_scene_v1.tsv", sep='\t', index=False)
test_raw.to_csv("test_dataset_scene_v1.csv", index=False)
test_raw.head(10)

Unnamed: 0,id,script_scene,content,character,pre_content,pre_pre_content
0,1597_0001_A_1,1597_0001_A,会议室的投影仪上播放着h2在实验室做油霸实验的画面。,h2,[START],[START]
1,1597_0001_A_2,1597_0001_A,h2坐在录像机前戴上眼镜，调整了一下摄像机的位置。,h2,会议室的投影仪上播放着h2在实验室做油霸实验的画面。,[START]
2,1597_0001_A_3,1597_0001_A,h2：油霸实验，一月十五号，第五百七十三次，第六组。,h2,h2坐在录像机前戴上眼镜，调整了一下摄像机的位置。,会议室的投影仪上播放着h2在实验室做油霸实验的画面。
3,1597_0001_A_4,1597_0001_A,h2用滴管将几滴黑色的液体滴到了一个装着半杯油的杯子里，然后转身离开了，不一会儿，黑色的液体...,h2,h2：油霸实验，一月十五号，第五百七十三次，第六组。,h2坐在录像机前戴上眼镜，调整了一下摄像机的位置。
4,1597_0001_A_5,1597_0001_A,一个实验人员走过来看了看：油霸成功了。,,h2用滴管将几滴黑色的液体滴到了一个装着半杯油的杯子里，然后转身离开了，不一会儿，黑色的液体...,h2：油霸实验，一月十五号，第五百七十三次，第六组。
5,1597_0001_A_6,1597_0001_A,h2将视频暂停，转身面对会议桌，桌子最那头，坐着h2的妻子k2，偌大的会议室只有他们俩。,h2,一个实验人员走过来看了看：油霸成功了。,h2用滴管将几滴黑色的液体滴到了一个装着半杯油的杯子里，然后转身离开了，不一会儿，黑色的液体...
6,1597_0001_A_7,1597_0001_A,h2将视频暂停，转身面对会议桌，桌子最那头，坐着h2的妻子k2，偌大的会议室只有他们俩。,k2,一个实验人员走过来看了看：油霸成功了。,h2用滴管将几滴黑色的液体滴到了一个装着半杯油的杯子里，然后转身离开了，不一会儿，黑色的液体...
7,1597_0001_A_8,1597_0001_A,h2：油霸成功了，再生能源一旦上市你知道意味着什么吗？我跟纯度风投公司已经全部都谈好了，星期...,h2,h2将视频暂停，转身面对会议桌，桌子最那头，坐着h2的妻子k2，偌大的会议室只有他们俩。,一个实验人员走过来看了看：油霸成功了。
8,1597_0001_A_9,1597_0001_A,k2有些不高兴：h2，我们俩在一起十二年了，在我跟你提出离婚的时候，你第一个反应居然是不要影...,k2,h2：油霸成功了，再生能源一旦上市你知道意味着什么吗？我跟纯度风投公司已经全部都谈好了，星期...,h2将视频暂停，转身面对会议桌，桌子最那头，坐着h2的妻子k2，偌大的会议室只有他们俩。
9,1597_0001_A_10,1597_0001_A,h2：你也知道我为这个项目付出了多少。,h2,k2有些不高兴：h2，我们俩在一起十二年了，在我跟你提出离婚的时候，你第一个反应居然是不要影...,h2：油霸成功了，再生能源一旦上市你知道意味着什么吗？我跟纯度风投公司已经全部都谈好了，星期...


In [62]:
# "love", "joy", "fright", "anger", "fear", "sorrow"
train_raw["love"] = train_raw["emotions"].str.split(",").str[0].astype(float)
train_raw["joy"] = train_raw["emotions"].str.split(",").str[1].astype(float)
train_raw["fright"] = train_raw["emotions"].str.split(",").str[2].astype(float)
train_raw["anger"] = train_raw["emotions"].str.split(",").str[3].astype(float)
train_raw["fear"] = train_raw["emotions"].str.split(",").str[4].astype(float)
train_raw["sorrow"] = train_raw["emotions"].str.split(",").str[5].astype(float)
train_raw.head()

Unnamed: 0,id,script_scene,content,character,emotions,pre_content,pre_pre_content,love,joy,fright,anger,fear,sorrow
0,1171_0001_A_1,1171_0001_A,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,o2,0,[START],[START],0.0,0.0,0.0,0.0,0.0,0.0
1,1171_0001_A_2,1171_0001_A,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,c1,0,[START],[START],0.0,0.0,0.0,0.0,0.0,0.0
2,1171_0001_A_3,1171_0001_A,o2一手拿着一个行李，一路小跑着把c1带到了文工团门口。,o2,0,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,[START],0.0,0.0,0.0,0.0,0.0,0.0
3,1171_0001_A_4,1171_0001_A,o2一手拿着一个行李，一路小跑着把c1带到了文工团门口。,c1,0,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,[START],0.0,0.0,0.0,0.0,0.0,0.0
4,1171_0001_A_5,1171_0001_A,o2停下来接过c1手里的行李：你妈妈交待我了，等领了军装一定要照张相寄回去，让街坊邻居都知道...,o2,0,o2一手拿着一个行李，一路小跑着把c1带到了文工团门口。,天空下着暴雨，o2正在给c1穿雨衣，他自己却只穿着单薄的军装，完全暴露在大雨之中。,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
train_raw[train_raw["love"] > 0].shape

(1282, 15)

In [20]:
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [55]:
stop_words = set()
with open("cn_stopwords.txt", 'r', encoding="utf8") as f:
    for word in f.readlines():
        stop_words.add(word[:-1])
stop_words

{'$',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '?',
 '_',
 '“',
 '”',
 '、',
 '。',
 '《',
 '》',
 '一',
 '一些',
 '一何',
 '一切',
 '一则',
 '一方面',
 '一旦',
 '一来',
 '一样',
 '一般',
 '一转眼',
 '万一',
 '上',
 '上下',
 '下',
 '不',
 '不仅',
 '不但',
 '不光',
 '不单',
 '不只',
 '不外乎',
 '不如',
 '不妨',
 '不尽',
 '不尽然',
 '不得',
 '不怕',
 '不惟',
 '不成',
 '不拘',
 '不料',
 '不是',
 '不比',
 '不然',
 '不特',
 '不独',
 '不管',
 '不至于',
 '不若',
 '不论',
 '不过',
 '不问',
 '与',
 '与其',
 '与其说',
 '与否',
 '与此同时',
 '且',
 '且不说',
 '且说',
 '两者',
 '个',
 '个别',
 '临',
 '为',
 '为了',
 '为什么',
 '为何',
 '为止',
 '为此',
 '为着',
 '乃',
 '乃至',
 '乃至于',
 '么',
 '之',
 '之一',
 '之所以',
 '之类',
 '乌乎',
 '乎',
 '乘',
 '也',
 '也好',
 '也罢',
 '了',
 '二来',
 '于',
 '于是',
 '于是乎',
 '云云',
 '云尔',
 '些',
 '亦',
 '人',
 '人们',
 '人家',
 '什么',
 '什么样',
 '今',
 '介于',
 '仍',
 '仍旧',
 '从',
 '从此',
 '从而',
 '他',
 '他人',
 '他们',
 '以',
 '以上',
 '以为',
 '以便',
 '以免',
 '以及',
 '以故',
 '以期',
 '以来',
 '以至',
 '以至于',
 '以致',
 '们',
 '任',
 '任何',
 '任凭',
 '似的',
 '但',
 '但凡',
 '但是',
 '何',
 '何以',
 '何况',
 '何处',
 '何时',
 '余外',
 '作为',
 '你',
 '你们'

In [59]:
content_cut_list = list()
for content in love_cont["content"]:
    content_cut = jieba.lcut(content)
    content_cut = [cont for cont in content_cut if cont not in stop_words]
    content_cut_list.append(" ".join(content_cut))
love_cont["content_cut"] = content_cut_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [67]:
tfidf_vec = TfidfVectorizer()
tfidf_mat = tfidf_vec.fit_transform(love_cont["content_cut"])

In [71]:
vocab_dict

{'另一边': 984,
 '一个': 103,
 'u2': 87,
 '号子': 1009,
 '练习': 3188,
 'b1': 18,
 '压腿': 915,
 '眼睛': 2987,
 '时不时': 2319,
 '一眼': 188,
 '离开': 3069,
 '西红柿': 3447,
 '幸福': 1657,
 '背影': 3275,
 'f1': 32,
 '下车': 264,
 'o2': 69,
 '赶紧': 3570,
 '帮忙': 1626,
 '你别': 560,
 'd1': 25,
 '伸出': 536,
 '起床号': 3572,
 '递给': 3762,
 '一块儿': 132,
 '糖果': 3159,
 '搓手': 2187,
 '握住': 2180,
 '取暖': 962,
 '我爸': 1910,
 '是不是': 2334,
 '特瘦': 2797,
 '头发': 1366,
 '点头': 2716,
 '哭泣': 1120,
 '快关': 1780,
 '十年': 874,
 '记不得': 3479,
 '他长': 496,
 '正在': 2496,
 '练舞': 3189,
 'c1': 20,
 '几个': 744,
 '不想': 297,
 '这有': 3714,
 '小林': 1549,
 '其实': 684,
 '一直': 186,
 '现在': 2820,
 '喜欢': 1139,
 '以前': 503,
 '影响': 1719,
 '进步': 3729,
 '昨天': 2332,
 '预备期': 3963,
 '顺利': 3958,
 '我头': 1899,
 '一次': 175,
 '遇见': 3774,
 '一条': 168,
 '大河': 1322,
 '手里': 1962,
 '毛线': 2543,
 '很长': 1741,
 '折返': 2028,
 '回来': 1182,
 '上前': 231,
 '几步': 756,
 '一把': 154,
 '怀里': 1795,
 '听到': 1063,
 '觉得': 3462,
 '本能': 2398,
 '起身': 3578,
 '面前': 3938,
 '我能': 1915,
 '点儿': 2715,
 '仰头': 508,
 '冲着': 722,


In [73]:
feat

array([[  87,  984,  915, ..., 1362, 1349, 4049],
       [  87, 3447, 1657, ..., 1358, 1345, 4049],
       [1626,  560,  264, ..., 1359, 1346, 4049],
       ...,
       [3106, 3966, 3658, ..., 1363, 1350, 4049],
       [3106, 3966, 3658, ..., 1363, 1350, 4049],
       [3106, 3966, 3658, ..., 1363, 1350, 4049]], dtype=int64)

In [70]:
vocab_dict =tfidf_vec.vocabulary_#获得所有文本的关键字和其位置的dict
weight = tfidf_mat.toarray()
feat = np.argsort(-weight)#降序排序
total_key_word = []
for l in range(len(love_cont["content_cut"])):
    values_word = []
    for j in range(5):#获取每类文本的5个关键字
        values_word.append([k for k,v in vocab_dict.items() if v ==feat[l,j]])
    total_key_word.append(values_word)

In [74]:
total_key_word

[[['u2'], ['另一边'], ['压腿'], ['练习'], ['号子']],
 [['u2'], ['西红柿'], ['幸福'], ['背影'], ['离开']],
 [['帮忙'], ['你别'], ['下车'], ['赶紧'], ['f1']],
 [['伸出'], ['下车'], ['u2'], ['d1'], ['b1']],
 [['伸出'], ['下车'], ['u2'], ['d1'], ['b1']],
 [['起床号'], ['b1'], ['11'], ['漫画'], ['漱口']],
 [['u2'], ['取暖'], ['b1'], ['搓手'], ['糖果']],
 [['u2'], ['取暖'], ['b1'], ['搓手'], ['糖果']],
 [['记不得'], ['快关'], ['十年'], ['他长'], ['特瘦']],
 [['练舞'], ['糖果'], ['几个'], ['正在'], ['递给']],
 [['不想'], ['离开'], ['o2'], ['11'], ['漱口']],
 [['这有'], ['o2'], ['11'], ['漱口'], ['潜伏']],
 [['小林'], ['其实'], ['一直'], ['o2'], ['11']],
 [['现在'], ['o2'], ['11'], ['漱口'], ['潜伏']],
 [['喜欢'], ['我头'], ['顺利'], ['预备期'], ['进步']],
 [['毛线'], ['折返'], ['很长'], ['几步'], ['握住']],
 [['怀里'], ['一把'], ['f1'], ['o2'], ['漱口']],
 [['听到'], ['觉得'], ['o2'], ['11'], ['潜伏']],
 [['本能'], ['我能'], ['点儿'], ['起身'], ['面前']],
 [['c1'], ['11'], ['漱口'], ['潜伏'], ['潜力']],
 [['仰头'], ['冲着'], ['大吼'], ['宿舍'], ['明天']],
 [['一个'], ['人送'], ['来到'], ['门口'], ['没有']],
 [['标准'], ['姿势'], ['敬礼'], ['c1'], ['11']],
 [['目送

In [11]:
love_cont = train_raw[train_raw["love"] > 0]
love_cont.head(10)

Unnamed: 0,index,id,content,character,emotions,script,scene,sent_id,script_scene,love,joy,fright,anger,fear,sorrow
39,39,1171_0003_A_45,另一边，一个叫u2的号子手还在练习着，b1还在压腿，眼睛时不时往u2那边看一眼。,b1,100000,1171,3,45,1171_0003,1.0,0.0,0.0,0.0,0.0,0.0
174,181,1171_0015_A_186,u2离开，b1吃了口西红柿，幸福的望着u2的背影。,b1,110000,1171,15,186,1171_0015,1.0,1.0,0.0,0.0,0.0,0.0
238,234,1171_0022_A_251,f1要下车，o2赶紧帮忙：你别跳，你脚上还有泡呢！,o2,100000,1171,22,251,1171_0022,1.0,0.0,0.0,0.0,0.0,0.0
240,240,1171_0022_A_253,u2下车，d1和b1同时伸出了手。,b1,100000,1171,22,253,1171_0022,1.0,0.0,0.0,0.0,0.0,0.0
241,241,1171_0022_A_254,u2下车，d1和b1同时伸出了手。,d1,100000,1171,22,254,1171_0022,1.0,0.0,0.0,0.0,0.0,0.0
330,331,1171_0029_A_344,b1：我想听你吹起床号。,b1,100000,1171,29,344,1171_0029,1.0,0.0,0.0,0.0,0.0,0.0
337,342,1171_0030_A_351,b1递给了u2一块儿糖果，u2搓了搓手然后握住b1的手让她取暖。,b1,130000,1171,30,351,1171_0030,1.0,3.0,0.0,0.0,0.0,0.0
338,338,1171_0030_A_352,b1递给了u2一块儿糖果，u2搓了搓手然后握住b1的手让她取暖。,u2,100000,1171,30,352,1171_0030,1.0,0.0,0.0,0.0,0.0,0.0
393,397,1171_0037_A_412,b1：我爸是不是特瘦？头发都白了吧！（o2点头，哭泣）他都快关了十年了，我都快记不得他长什么样了。,b1,330000,1171,37,412,1171_0037,3.0,3.0,0.0,0.0,0.0,0.0
397,400,1171_0037_A_416,b1拿着糖果递给了正在练舞的c1几个。,b1,100000,1171,37,416,1171_0037,1.0,0.0,0.0,0.0,0.0,0.0


In [39]:
train_raw.pivot_table(index=["sorrow"], aggfunc = lambda x: len(x.unique()))

Unnamed: 0_level_0,anger,character,content,emotions,fear,fright,id,index,joy,love,scene,script,script_scene,sent_id
sorrow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0.0,4.0,70,25097,85,4.0,4.0,31004,31004,4.0,4.0,174,31,3006,2239
1.0,4.0,62,2691,28,4.0,4.0,2821,2821,4.0,4.0,159,31,982,1506
2.0,4.0,56,1835,41,4.0,4.0,1983,1983,4.0,4.0,164,31,691,1239
3.0,4.0,47,890,21,4.0,4.0,974,974,4.0,3.0,134,28,301,775


In [40]:
print("love:", 35500 / train_raw.shape[0])
print("joy:", 34014 / train_raw.shape[0])
print("fright:", 34568 / train_raw.shape[0])
print("anger:", 33009 / train_raw.shape[0])
print("fear:", 33821 / train_raw.shape[0])
print("sorrow:", 31004 / train_raw.shape[0])

love: 0.8296330918438888
joy: 0.794905351717691
fright: 0.8078523019397056
anger: 0.7714185557373218
fear: 0.7903949520916101
sorrow: 0.7245618135078289


In [33]:
train_raw.shape[0] 

42790

In [24]:
train_raw[train_raw["emotions"]=='1,0,0,0,0,0'].head(10)

Unnamed: 0,index,id,content,character,emotions,script,scene,sent_id,script_scene
39,39,1171_0003_A_45,另一边，一个叫u2的号子手还在练习着，b1还在压腿，眼睛时不时往u2那边看一眼。,b1,100000,1171,3,45,1171_0003
238,234,1171_0022_A_251,f1要下车，o2赶紧帮忙：你别跳，你脚上还有泡呢！,o2,100000,1171,22,251,1171_0022
240,240,1171_0022_A_253,u2下车，d1和b1同时伸出了手。,b1,100000,1171,22,253,1171_0022
241,241,1171_0022_A_254,u2下车，d1和b1同时伸出了手。,d1,100000,1171,22,254,1171_0022
330,331,1171_0029_A_344,b1：我想听你吹起床号。,b1,100000,1171,29,344,1171_0029
338,338,1171_0030_A_352,b1递给了u2一块儿糖果，u2搓了搓手然后握住b1的手让她取暖。,u2,100000,1171,30,352,1171_0030
397,400,1171_0037_A_416,b1拿着糖果递给了正在练舞的c1几个。,b1,100000,1171,37,416,1171_0037
520,519,1171_0048_A_543,o2：我是不想离开这儿。,o2,100000,1171,48,543,1171_0048
526,531,1171_0048_A_549,o2：就等我们俩能像现在这样。,o2,100000,1171,48,549,1171_0048
532,533,1171_0048_A_555,f1离开，但o2手里拿着她的毛线，毛线拉了很长，她又折返回来，o2上前几步，趁还她毛线握住了...,o2,100000,1171,48,555,1171_0048


In [21]:
dt_sen_ay2 = train_raw.pivot_table(index=["script_scene"], values=["content"], aggfunc =[lambda x: len(x.unique()), "count"])
dt_sen_ay2.reset_index(inplace=True)
dt_sen_ay2

Unnamed: 0_level_0,script_scene,<lambda>,count
Unnamed: 0_level_1,Unnamed: 1_level_1,content,content
0,1171_0001,11,16
1,1171_0002,3,4
2,1171_0003,37,48
3,1171_0004,5,7
4,1171_0005,2,4
5,1171_0006,15,19
6,1171_0007,10,12
7,1171_0008,1,1
8,1171_0009,31,37
9,1171_0010,2,2


In [29]:
dt_sen_ay2[dt_sen_ay2["<lambda>"]["content"]<3]["count"].sum()

content    1547
dtype: int64

In [51]:
dt_sen_ay

Unnamed: 0,script,scene,content
0,1171,0001,11
1,1171,0002,3
2,1171,0003,37
3,1171,0004,5
4,1171,0005,2
5,1171,0006,15
6,1171,0007,10
7,1171,0008,1
8,1171,0009,31
9,1171,0010,2


In [17]:
dt_sen_ay = train_raw.pivot_table(index=["script_scene"], values=["content"], aggfunc = lambda x: len(x.unique()))
dt_sen_ay.reset_index(inplace=True)
print("dt_sen_ay:", dt_sen_ay.shape)
sum(dt_sen_ay["content"] >= 3) / dt_sen_ay.shape[0]

dt_sen_ay: (3375, 2)


0.7389629629629629

In [21]:
dt_sen_ay.shape

(3375, 2)

In [18]:
dt_sen_ay[dt_sen_ay["content"] < 3]

Unnamed: 0,script_scene,content
4,1171_0005,2
7,1171_0008,1
9,1171_0010,2
12,1171_0013,2
13,1171_0014,1
16,1171_0017,1
18,1171_0019,2
22,1171_0023,2
24,1171_0025,1
32,1171_0033,2


In [20]:
len(script_scene_list)

881

In [19]:
script_scene_list = list(set(dt_sen_ay[dt_sen_ay["content"] < 3]["script_scene"].tolist()))
script_scene_list

['34911_0123',
 '2369_0081',
 '34913_0071',
 '32845_0091',
 '32505_0010',
 '2930_0103',
 '34135_0021',
 '34135_0045',
 '34173_0126',
 '34314_0135',
 '34842_0037',
 '34899_0072',
 '34842_0030',
 '34313_0013',
 '34949_0174',
 '32845_0001',
 '34946_0087',
 '34949_0129',
 '34940_0039',
 '32505_0024',
 '2388_0035',
 '34135_0114',
 '2996_0061',
 '34911_0032',
 '2388_0043',
 '34314_0023',
 '2996_0096',
 '34940_0009',
 '2996_0121',
 '32899_0052',
 '34842_0004',
 '1171_0074',
 '32798_0016',
 '34313_0066',
 '34135_0106',
 '34313_0009',
 '34940_0016',
 '34899_0081',
 '2721_0020',
 '32505_0105',
 '32504_0007',
 '32505_0127',
 '34311_0037',
 '34527_0065',
 '34173_0045',
 '2369_0056',
 '34135_0145',
 '34313_0060',
 '34940_0094',
 '34842_0028',
 '34913_0127',
 '32845_0062',
 '32504_0106',
 '2721_0011',
 '34913_0118',
 '32845_0125',
 '32504_0118',
 '2721_0088',
 '32899_0060',
 '34314_0113',
 '34121_0046',
 '34949_0175',
 '34527_0111',
 '2388_0054',
 '32504_0027',
 '34311_0057',
 '32504_0081',
 '2996_0

In [30]:
less_sent_train = train_raw[train_raw["script_scene"].isin(script_scene_list)]

In [40]:
less_sent_train.shape

(1547, 7)

In [5]:
train_raw.loc[6390:6426]

Unnamed: 0,id,content,character,emotions,script,scene,sent_id,script_scene
6390,2930_0077_A_941,天桥上，五个人招摇走过，引得路人纷纷拿手机抢拍起来。,,,2930,77,941,2930_0077
6391,2930_0080_A_964,什么澳洲和牛塔、加拿大整龙虾，配菜看着放，久闻松露比较贵，多放！甜点我就不挑了，布丁奶酪冰淇...,,,2930,80,964,2930_0080
6392,2930_0081_A_971,桌上已经摆满了餐盘。,,,2930,81,971,2930_0081
6393,2930_0078_A_944,/外五人路过婚纱店，m2r2不约而同回头走了进去，q2o2p2跟进。,o2,0.0,2930,78,944,2930_0078
6394,2930_0078_A_945,/外五人路过婚纱店，m2r2不约而同回头走了进去，q2o2p2跟进。,r2,0.0,2930,78,945,2930_0078
6395,2930_0078_A_946,/外五人路过婚纱店，m2r2不约而同回头走了进去，q2o2p2跟进。,q2,0.0,2930,78,946,2930_0078
6396,2930_0078_A_947,m2、q2和r2试了假发挑婚纱各种状态。,m2,10000.0,2930,78,947,2930_0078
6397,2930_0078_A_948,m2、q2和r2试了假发挑婚纱各种状态。,r2,10000.0,2930,78,948,2930_0078
6398,2930_0078_A_949,m2、q2和r2试了假发挑婚纱各种状态。,q2,10000.0,2930,78,949,2930_0078
6399,2930_0079_A_950,p2拍了四个女人的各种婚纱照，严肃的，恶搞的，高大上的等等玩得不亦乐乎。,p2,0.0,2930,79,950,2930_0079


In [34]:
less_sent_train[less_sent_train["script_scene"]=="2930_0078"]

Unnamed: 0,id,content,character,emotions,script,scene,script_scene
6393,2930_0078_A_944,/外五人路过婚纱店，m2r2不约而同回头走了进去，q2o2p2跟进。,o2,0,2930,78,2930_0078
6394,2930_0078_A_945,/外五人路过婚纱店，m2r2不约而同回头走了进去，q2o2p2跟进。,r2,0,2930,78,2930_0078
6395,2930_0078_A_946,/外五人路过婚纱店，m2r2不约而同回头走了进去，q2o2p2跟进。,q2,0,2930,78,2930_0078
6396,2930_0078_A_947,m2、q2和r2试了假发挑婚纱各种状态。,m2,10000,2930,78,2930_0078
6397,2930_0078_A_948,m2、q2和r2试了假发挑婚纱各种状态。,r2,10000,2930,78,2930_0078
6398,2930_0078_A_949,m2、q2和r2试了假发挑婚纱各种状态。,q2,10000,2930,78,2930_0078
6406,2930_0078_A_942,/外五人路过婚纱店，m2r2不约而同回头走了进去，q2o2p2跟进。,m2,0,2930,78,2930_0078
6407,2930_0078_A_943,/外五人路过婚纱店，m2r2不约而同回头走了进去，q2o2p2跟进。,p2,0,2930,78,2930_0078


In [32]:
less_sent_train.describe()

Unnamed: 0,id,content,character,emotions,script,scene,script_scene
count,1547,1547,1223,1204,1547,1547,1547
unique,1547,1227,58,63,31,167,881
top,34940_0098_A_1077,/外五人路过婚纱店，m2r2不约而同回头走了进去，q2o2p2跟进。,a1,0,32505,75,2930_0078
freq,1,5,122,770,113,21,8


In [7]:
train_raw_na = train_raw[train_raw["emotions"].isna()]
train_raw_notna = train_raw[~train_raw["emotions"].isna()]

In [21]:
dt_sen_ay_na = train_raw_na.pivot_table(index=["script", "scene"], values=["content"], aggfunc = lambda x: len(x.unique()))
dt_sen_ay_na.reset_index(inplace=True)
print("dt_sen_ay_na:", dt_sen_ay_na.shape)
sum(dt_sen_ay_na["content"] > 2) / dt_sen_ay_na.shape[0]

dt_sen_ay_na: (1927, 3)


0.3549558899844318

In [22]:
dt_sen_ay_notna = train_raw_notna.pivot_table(index=["script", "scene"], values=["content"], aggfunc = lambda x: len(x.unique()))
dt_sen_ay_notna.reset_index(inplace=True)
print("dt_sen_ay_notna:", dt_sen_ay_na.shape)
sum(dt_sen_ay_notna["content"] > 2) / dt_sen_ay_notna.shape[0]

dt_sen_ay_notna: (1927, 3)


0.7165079365079365