In [3]:
import pandas as pd

In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='emotion'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

text = "Celebrating my promotion 😎"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)

# text = "Celebrating my promotion 😎"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)

ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = labels[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")



2022-12-28 19:04:52.126510: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-28 19:04:52.424488: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-28 19:04:52.424530: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-28 19:04:53.569080: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-

Downloading:   0%|          | 0.00/768 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

1) joy 0.9382
2) optimism 0.0362
3) anger 0.0145
4) sadness 0.0112


In [2]:
def generate_sentiment(msg):
#     text = "Celebrating my promotion 😎"
    text = preprocess(msg)
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    # # TF
    # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
    # model.save_pretrained(MODEL)

    # text = "Celebrating my promotion 😎"
    # encoded_input = tokenizer(text, return_tensors='tf')
    # output = model(encoded_input)
    # scores = output[0][0].numpy()
    # scores = softmax(scores)

    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    for i in range(scores.shape[0]):
        l = labels[ranking[i]]
        s = scores[ranking[i]]
        # print(f"{i+1}) {l} {np.round(float(s), 4)}")
        break

    return l, np.round(float(s), 4)

In [6]:
df_int =pd.read_csv('/home/ifte-home/Documents/mental_health/suicide/CSSRS/reddit_dataset_with_CSSR_intensity.csv')

In [7]:
df_int

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,processed,class,category,suicide_intensity,intensity
0,0,0,ex wife threatening suiciderecently left wife ...,suicide,1,2,Ideation
1,1,1,weird get affected compliment coming someone k...,non-suicide,0,2,Ideation
2,2,2,finally almost never hear ha bad year ever swe...,non-suicide,0,2,Ideation
3,3,3,need helpjust help im cry hard,suicide,1,2,Ideation
4,4,4,losthello name adam struggling year afraid pas...,suicide,1,2,Ideation
...,...,...,...,...,...,...,...
231991,232069,232069,like rock going get anything go http musictast...,non-suicide,0,2,Ideation
231992,232070,232070,tell many friend lonely everything deprived pr...,non-suicide,0,2,Ideation
231993,232071,232071,pee probably taste like salty tea someone dran...,non-suicide,0,2,Ideation
231994,232072,232072,usual stuff find herei posting sympathy pity k...,suicide,1,2,Ideation


In [8]:
import re
def remove_characters(text):
    text = text.split()
    text = [x for x in text if len(x)>3]
    return ' '.join(text)
df_int['processed'] = df_int.processed.apply(lambda x: remove_characters(x))

In [9]:
df_int = df_int.astype('str')
df_int =df_int.dropna()

In [10]:
def filter_doc(text):
    text = text.split()
    if len(text) < 500:
        return ' '.join(text)
    else: 
        text = text[0:480]
        return ' '.join(text)

In [11]:
sent, score = generate_sentiment(df.text.values[0])

In [14]:
generate_sentiment(filter_doc(df_int['processed'].iloc[134]))

('sadness', 0.6941)

In [16]:
df_int['emotion'] = np.nan
df_int['score'] = np.nan

In [18]:
i=0
for row, col in df_int.iterrows():
    try:
        sentiment, score = generate_sentiment(filter_doc(col['processed']))
        df_int['emotion'].iloc[i] = sentiment
        df_int['score'].iloc[i] = score
#     print(i,"-",sentiment,"-",score)
    except (RuntimeError, IndexError) as error: 
        print(',', i)
    i+=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


, 520
, 994
, 1118
, 1235
, 1898
, 2070
, 2795
, 4139
, 5227
, 5383
, 5568
, 6068
, 6228
, 6853
, 7189
, 7220
, 7323
, 7333
, 8009
, 8435
, 8437
, 8661
, 8783
, 9241
, 9379
, 9825
, 9919
, 10473
, 10969
, 11550
, 12152
, 13363
, 13460
, 13682
, 13811
, 14549
, 15582
, 15737
, 15880
, 16127
, 16700
, 16980
, 17373
, 17885
, 18053
, 18668
, 18699
, 18770
, 19384
, 19489
, 19496
, 19628
, 20257
, 20770
, 21400
, 22484
, 22879
, 22920
, 23620
, 23635
, 25070
, 25534
, 26392
, 26811
, 27433
, 27456
, 28110
, 28269
, 28343
, 28520
, 28826
, 29363
, 29585
, 30468
, 30591
, 30772
, 30853
, 31027
, 31534
, 31672
, 31769
, 31962
, 32274
, 32351
, 32362
, 33636
, 33882
, 34360
, 34393
, 34420
, 34612
, 35258
, 35651
, 35672
, 37218
, 37458
, 37812
, 37900
, 38570
, 38578
, 38750
, 39470
, 40266
, 40294
, 40353
, 40354
, 40449
, 40516
, 40629
, 41527
, 41839
, 41987
, 42247
, 42384
, 43228
, 43919
, 44370
, 44445
, 44581
, 44628
, 44953
, 45157
, 45890
, 46400
, 46734
, 47036
, 47086
, 47176
, 476

In [19]:
df_int

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,processed,class,category,suicide_intensity,intensity,emotion,score
0,0,0,wife threatening suiciderecently left wife goo...,suicide,1,2,Ideation,sadness,0.9002
1,1,1,weird affected compliment coming someone know ...,non-suicide,0,2,Ideation,sadness,0.4589
2,2,2,finally almost never hear year ever swear fuck...,non-suicide,0,2,Ideation,anger,0.9841
3,3,3,need helpjust help hard,suicide,1,2,Ideation,sadness,0.9469
4,4,4,losthello name adam struggling year afraid pas...,suicide,1,2,Ideation,sadness,0.9653
...,...,...,...,...,...,...,...,...,...
231991,232069,232069,like rock going anything http musictaste space...,non-suicide,0,2,Ideation,joy,0.8891
231992,232070,232070,tell many friend lonely everything deprived bo...,non-suicide,0,2,Ideation,sadness,0.9739
231993,232071,232071,probably taste like salty someone drank confirm,non-suicide,0,2,Ideation,joy,0.5339
231994,232072,232072,usual stuff find herei posting sympathy pity k...,suicide,1,2,Ideation,sadness,0.5559


In [20]:
df_int.to_csv('/home/ifte-home/Documents/mental_health/suicide/CSSRS/Twitter_dataset_with_CSSR_reddit_intensity_feelings.csv')

In [26]:
df_int

Unnamed: 0,processed,class,category,suicide_intensity,intensity,emotion,score
0,wife threatening suiciderecently left wife goo...,suicide,1,2,Ideation,sadness,0.9002
1,weird affected compliment coming someone know ...,non-suicide,0,2,Ideation,sadness,0.4589
2,finally almost never hear year ever swear fuck...,non-suicide,0,2,Ideation,anger,0.9841
3,need helpjust help hard,suicide,1,2,Ideation,sadness,0.9469
4,losthello name adam struggling year afraid pas...,suicide,1,2,Ideation,sadness,0.9653
...,...,...,...,...,...,...,...
231991,like rock going anything http musictaste space...,non-suicide,0,2,Ideation,joy,0.8891
231992,tell many friend lonely everything deprived bo...,non-suicide,0,2,Ideation,sadness,0.9739
231993,probably taste like salty someone drank confirm,non-suicide,0,2,Ideation,joy,0.5339
231994,usual stuff find herei posting sympathy pity k...,suicide,1,2,Ideation,sadness,0.5559


In [25]:
df_int = df_int[['processed', 'class', 'category',
       'suicide_intensity', 'intensity', 'emotion', 'score']]

In [43]:
def f(x):
    return x['score']/x['emotion']

def process_data(dataset):
    res = dataset.groupby(['class', 'intensity', 'emotion']) \
    ['emotion', 'score'].agg({'emotion': 'count', 'score': 'sum'})
    res['confidence'] = res[['emotion','score']].apply(lambda x: f(x), axis=1)
    return res

In [29]:
df_int.columns

Index(['processed', 'class', 'category', 'suicide_intensity', 'intensity',
       'emotion', 'score'],
      dtype='object')

In [44]:
d = process_data(df_int)


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



In [34]:
d

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,emotion,score,confidence
class,intensity,emotion,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
non-suicide,Attempt,anger,1,0.8813,0.8813
non-suicide,Attempt,joy,1,0.4237,0.4237
non-suicide,Behavior,anger,30,21.6359,0.721197
non-suicide,Behavior,joy,22,16.7319,0.760541
non-suicide,Behavior,optimism,3,1.6939,0.564633
non-suicide,Behavior,sadness,19,14.1007,0.742142
non-suicide,Ideation,anger,31647,24442.1169,0.772336
non-suicide,Ideation,joy,39844,29836.1809,0.748825
non-suicide,Ideation,optimism,2767,1501.8298,0.542765
non-suicide,Ideation,sadness,38509,28761.3948,0.746875


In [35]:
d.to_csv('/home/ifte-home/Downloads/out_emotion.csv')

In [6]:
import plotly.express as px  

In [46]:
import pandas as pd
df = pd.read_csv('/home/ifte-home/Downloads/out_emotion.csv')

df['confidence'] = df.confidence.apply(lambda x: x*100)


In [49]:
df.head(3)

Unnamed: 0,class,intensity,emotion,count,score,confidence,logscale,#NAME?,Unnamed: 8,logscale.1
0,non-suicide,Attempt,anger,2,0.8813,88.13,1.0,0.30103,-6.578813,6.578813
1,non-suicide,Attempt,sadness,1,0.4237,42.37,0.0,0.0,0.0,0.0
2,non-suicide,Behavior,anger,20,21.6359,72.119667,4.321928,1.30103,-28.433159,28.433159


In [47]:
df['class'].unique().tolist()

['non-suicide', 'suicide']

In [51]:
def create_sunburst(result):
    fig = px.sunburst(result, path=['class', 'intensity', 'emotion'], 
                      values='logscale.1', color='intensity')
    fig.update_traces(textinfo="label+percent parent")
    fig.update_layout(title_text="Two-level Sunburst Diagram", font_size=8)
    fig.show()

create_sunburst(df)