In [None]:
import pandas as pd
import ast
import torch
from tqdm import tqdm

In [None]:
def _read_tsv(path, columns):
    """Read one headerless, tab-separated corpus table and drop incomplete rows.

    Malformed lines are skipped silently. ``on_bad_lines='skip'`` replaces the
    ``warn_bad_lines=False, error_bad_lines=False`` pair, which was deprecated
    in pandas 1.3 and removed in pandas 2.0 (so this needs pandas >= 1.3).

    Parameters
    ----------
    path : str
        Location of the ``.tsv`` file.
    columns : list of str
        Column names to assign; the file itself has no header row.

    Returns
    -------
    pandas.DataFrame
        The table with ``columns`` as its column labels and every row that
        contains a missing value removed.
    """
    table = pd.read_csv(path,
                        sep='\t',
                        on_bad_lines='skip',
                        header=None)
    table.columns = columns
    return table.dropna()


characters = _read_tsv('../data/IMDB/movie_characters_metadata.tsv',
                       ['characterID', 'character', 'movieID', 'movie_title',
                        'gender',
                        'position'])
conversations = _read_tsv('../data/IMDB/movie_conversations.tsv',
                          ['characterID_1', 'characterID_2', 'movieID', 'chrono'])
lines = _read_tsv('../data/IMDB/movie_lines.tsv',
                  ['lineID', 'characterID', 'movieID', 'character', 'text'])
titles = _read_tsv('../data/IMDB/movie_titles_metadata.tsv',
                   ['movieID', 'movie_title', 'movie_year', 'rating', 'imdb_votes', 'genres'])

In [None]:
def chrn(s, delim=' '):
    """Split the first element of a stringified list on ``delim``.

    ``s`` is parsed with ``ast.literal_eval``. The first item of the parsed
    value is split on ``delim``, the piece in front of the first delimiter is
    dropped, and ``delim`` is glued back onto the front of every remaining
    piece.

    Returns a list of strings (empty when ``delim`` does not occur).
    """
    first_item = ast.literal_eval(s)[0]
    _, *pieces = first_item.split(delim)
    return [delim + piece for piece in pieces]

In [None]:
def gnr(s):
    """Turn a bracketed, space-separated list string into a list of items.

    Leading/trailing ``[`` and ``]`` characters are stripped, the remainder is
    split on single spaces, and the first and last character of every token
    (the surrounding quotes in the expected input) are removed.
    """
    genres = []
    for token in s.strip('][').split(' '):
        genres.append(token[1:-1])
    return genres

In [None]:
# Task 1. Data preparing
#
# NOTE: this cell overwrites `titles['genres']` and `conversations['chrono']`
# in place, so it must be run exactly once after the loading cell.

# Lookup tables keyed by ID.
lns_chrs = dict(zip(lines.lineID, lines.characterID))  # line ID -> character ID
lns = dict(zip(lines.lineID, lines.text))  # line ID -> utterance text
chars = dict(zip(characters.characterID, characters.character))  # character ID -> character name
floor = dict(zip(characters.characterID, characters.gender))  # character ID -> gender

titles['genres'] = titles['genres'].apply(gnr)
titles_new = titles[['movieID', 'movie_title', 'genres']]

# Ordered list of line IDs that make up each conversation.
conversations['chrono'] = conversations['chrono'].apply(lambda x: chrn(x, delim='L'))

# For every line of a conversation look up speaker ID, gender, text and name.
# IDs missing from a lookup table fall back to an explicit placeholder.
conversations['characters'] = conversations['chrono'].apply(
    lambda line_ids: [lns_chrs.get(i, 'unknown_character') for i in line_ids])

conversations['genders'] = conversations['characters'].apply(
    lambda char_ids: [floor.get(i, '?') for i in char_ids])

conversations['replics'] = conversations['chrono'].apply(
    lambda line_ids: [lns.get(i, 'unknown_text') for i in line_ids])

# The placeholder used to be 'unknown_character ' (stray trailing space), which
# did not match the placeholder used for `characters` above.
conversations['names'] = conversations['characters'].apply(
    lambda char_ids: [chars.get(i, 'unknown_character') for i in char_ids])

conversations_new = conversations[['movieID', 'chrono', 'characters',
                                   'genders', 'replics', 'names']]

# Attach title and genres to every conversation; a left join keeps
# conversations whose movie is absent from `titles_new`.
df = conversations_new.merge(titles_new, on='movieID', how='left')
#df.to_csv('films_prepared_v3.csv', index=False)

In [None]:
#df = pd.read_csv('films_prepared_v3.csv')

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Tokenizer and classifier are loaded from the same Hugging Face checkpoint,
# so the identifier is spelled out once.
SENTIMENT_CHECKPOINT = "finiteautomata/bertweet-base-sentiment-analysis"

tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_CHECKPOINT)

In [None]:
def _parse_list_cell(cell):
    """Return ``cell`` as a Python object, parsing it only when it is a string.

    When ``df`` comes straight from the preparation cell the values are already
    lists, and calling ``eval`` on a list raises ``TypeError``. When ``df`` is
    re-read from the cached CSV the lists arrive as their string repr; those
    are parsed with ``ast.literal_eval``, which accepts literals only and
    cannot execute arbitrary code the way ``eval`` can.
    """
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


df['replics'] = df['replics'].apply(_parse_list_cell)
df['names'] = df['names'].apply(_parse_list_cell)

In [None]:
# Positional row of the conversation used as the worked example below.
SAMPLE_ROW = 5527

sample = df.iloc[SAMPLE_ROW]
sample

In [None]:
def convert_to_dataset_torch(data: list, max_length: int = 130):
    """Tokenize texts into fixed-length ID and attention-mask tensors.

    Each element of ``data`` is encoded separately with the module-level
    ``tokenizer``, padded/truncated to ``max_length`` tokens, and the
    per-text tensors are concatenated along dimension 0.

    Parameters
    ----------
    data : list
        Texts to encode, one string per element.
    max_length : int, default 130
        Length every sequence is padded or truncated to.

    Returns
    -------
    tuple of (torch.Tensor, torch.Tensor)
        ``(input_ids, attention_masks)``, both of dtype ``torch.long``.
        With the tokenizer returning one ``(1, max_length)`` row per text the
        result has one row per element of ``data``; for empty ``data`` two
        ``(0, max_length)`` tensors are returned.
    """
    input_ids = []
    attention_masks = []

    for row in data:
        # `tokenizer(...)` with `padding='max_length'` is the supported
        # spelling of the deprecated `encode_plus(..., pad_to_max_length=True)`.
        encoded_dict = tokenizer(row,
                                 max_length=max_length,
                                 padding='max_length',
                                 truncation=True,
                                 return_attention_mask=True,
                                 return_tensors='pt')
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    if not input_ids:
        # torch.cat rejects an empty list; hand back well-formed empty tensors.
        return (torch.empty((0, max_length), dtype=torch.long),
                torch.empty((0, max_length), dtype=torch.long))

    # Tensor.to() returns a new tensor, so the result has to be kept.
    input_ids = torch.cat(input_ids, dim=0).to(dtype=torch.long)
    attention_masks = torch.cat(attention_masks, dim=0).to(dtype=torch.long)

    return input_ids, attention_masks

In [None]:
# Tokenize every line of the sampled conversation; `converted` is the
# (input_ids, attention_masks) pair returned by convert_to_dataset_torch.
converted = convert_to_dataset_torch(sample.replics)

In [None]:
# Inference only: run the forward pass without autograd so no computation
# graph is built and kept alive behind `preds`.
with torch.no_grad():
    preds = model(input_ids=converted[0],
                  attention_mask=converted[1]).logits
                                    

In [None]:
# Show the raw logits returned by the classifier for the sampled conversation.
preds

In [None]:
# Human-readable class names, listed in the order of the classifier's output
# indices. The finiteautomata/bertweet-base-sentiment-analysis checkpoint
# declares id2label = {0: 'NEG', 1: 'NEU', 2: 'POS'}; the earlier order
# (Positive, Negative, Neutral) attached the wrong name to every prediction.
# Cross-check against `model.config.id2label` if the checkpoint is changed.
labels = [
    'Negative',
    'Neutral',
    'Positive'
]

In [None]:
# Class index -> class name, following the order of `labels`.
mapping = dict(enumerate(labels))

In [None]:
# Show the class-index -> class-name lookup built from `labels`.
mapping

In [None]:
# Most probable class index per row. The softmax dimension is given
# explicitly: `torch.nn.Softmax()` without `dim` relies on a deprecated
# implicit choice and emits a warning.
predictions = torch.argmax(torch.softmax(preds, dim=1), dim=1)

In [None]:
# One row per utterance of the sampled conversation: speaker name, text, and
# the class name looked up for the predicted index (None if the index is
# missing from `mapping`).
df_local = pd.DataFrame({
    'Speaker': sample.names,
    'Text': sample.replics,
    'Emotion': [mapping.get(class_id, None) for class_id in predictions.numpy()],
})

In [None]:
# Show the per-utterance table (Speaker, Text, Emotion) for the sample.
df_local

In [None]:
# Relative frequency of each predicted emotion within every speaker's lines.
emotion = (
    df_local
    .groupby('Speaker')['Emotion']
    .value_counts(normalize=True)
)

In [None]:
import matplotlib.pyplot as plt

# One pie chart per speaker, laid out side by side in a single row.
fig = plt.figure()

total_axes = df_local.Speaker.nunique()
# Subplot positions are 1-based, hence start=1.
for subplt, x in enumerate(emotion.index.get_level_values('Speaker').unique(), start=1):
    ax = fig.add_subplot(1, total_axes, subplt)  # a dedicated axes for this speaker
    # Draw on the axes just created instead of pyplot's implicit "current axes".
    ax.pie(emotion[x], labels=emotion[x].index.tolist(), autopct='%1.1f%%')
    ax.set_xlabel(x)

plt.tight_layout()
plt.show()