In [1]:
!pip install jupyter-dash
!pip install --upgrade transformers
!pip install simpletransformers
import pandas as pd
import re
import numpy as np
import sklearn
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jupyter-dash
  Downloading jupyter_dash-0.4.2-py3-none-any.whl (23 kB)
Collecting dash (from jupyter-dash)
  Downloading dash-2.10.2-py3-none-any.whl (10.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
Collecting retrying (from jupyter-dash)
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Collecting ansi2html (from jupyter-dash)
  Downloading ansi2html-1.8.0-py3-none-any.whl (16 kB)
Collecting Werkzeug<2.3.0 (from dash->jupyter-dash)
  Downloading Werkzeug-2.2.3-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Collecting dash-html-components==2.0.0 (from dash->jupyter-dash)
  Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Collecting dash-core-components==2.0.0 (from dash->jupyter-

The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html


In [2]:
# Create the Dash application object up front; the dashboard cell further down
# assigns its layout and serves it inline with run_server(mode='inline').
summary_app = JupyterDash(__name__)

In [3]:
# Load the two comma-separated inputs: the labelled humor corpus used for
# training/validation, and the tweets the trained model will be applied to.
train_data_detection = pd.read_csv(
    'dataset - humor short.csv',
    sep=',',
)
test = pd.read_csv(
    'tweets - short.csv',
    sep=',',
)

In [4]:
# Show a sample of the raw training frame before any renaming.
print('The data format used for humor detection:')
print(train_data_detection.head())

# Rename the label column by name rather than overwriting every column
# positionally: this does not break when the CSV carries extra columns, and
# the cell can be re-run safely (renaming an already-renamed frame is a no-op).
# 'labels' is the column name the rest of the notebook passes to
# train_test_split / simpletransformers.
train_data_detection = train_data_detection.rename(columns={'humor': 'labels'})

# Class balance of the training data. Printed explicitly because a bare
# expression in the middle of a cell is evaluated but never displayed.
print(train_data_detection['labels'].value_counts())

The data format used for humor detection:
                                                text  humor
0  Joe biden rules out 2020 bid: 'guys, i'm not r...  False
1  Watch: darvish gave hitter whiplash with slow ...  False
2  What do you call a turtle without its shell? d...   True
3      5 reasons the 2016 election feels so personal  False
4  Pasco police shot mexican migrant from behind,...  False


In [5]:
def preprocess_tweet_text(text_data):
    """Normalize raw tweet/headline text into ASCII words separated by single spaces.

    Cleaning steps, applied in order:
      1. remove URLs,
      2. remove @mentions and #hashtags (the tag word is dropped too),
      3. drop every non-ASCII character (emojis, accented letters, ...),
      4. drop punctuation (anything that is not a word character or whitespace;
         digits and underscores are kept),
      5. collapse runs of whitespace and trim both ends.

    Args:
        text_data: The raw text. ``None`` and float NaN (what pandas produces
            for empty CSV cells) are treated as empty text; any other
            non-string value is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text; may be the empty string.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text_data is None or (isinstance(text_data, float) and text_data != text_data):
        return ""
    if not isinstance(text_data, str):
        text_data = str(text_data)

    # Remove URLs. The "www" alternative is anchored to a word boundary and
    # requires the dot, so ordinary words such as "awwww" are left intact.
    text_data = re.sub(r"http\S+|\bwww\.\S+", "", text_data)

    # Remove mentions and hashtags
    text_data = re.sub(r"@\w+|#\w+", "", text_data)

    # Remove emojis and other non-ASCII characters, then punctuation
    text_data = text_data.encode("ascii", "ignore").decode("utf-8")
    text_data = re.sub(r"[^\w\s]", "", text_data)

    # Remove extra whitespaces
    text_data = re.sub(r"\s+", " ", text_data).strip()

    return text_data

# Clean the text column of both frames with the same routine so the model is
# trained on, and later applied to, identically normalized text.
for frame, text_column in ((train_data_detection, 'text'), (test, 'content')):
    frame[text_column] = frame[text_column].apply(preprocess_tweet_text)

In [6]:
# --- Train/validation split -------------------------------------------------
# Stratify on the label column so both splits keep the same class ratio.
test_size_det_value = 0.20
rnd_state_det_value = 10
train_detection_df, valid_detection_df = train_test_split(
    train_data_detection,
    test_size=test_size_det_value,
    random_state=rnd_state_det_value,
    stratify=train_data_detection['labels'],
)

# --- Fine-tuning configuration ------------------------------------------------
n_tr_epochs_val = 5
lrn_rate_val = 1e-5
man_speed_val = 42  # passed to ClassificationArgs as manual_seed
model_det_args = ClassificationArgs(
    num_train_epochs=n_tr_epochs_val,
    learning_rate=lrn_rate_val,
    manual_seed=man_speed_val,
    overwrite_output_dir=True,
    silent=True,
)

# --- Model: roberta-base with a 2-class head, run on CPU (use_cuda=False) ----
model_det = ClassificationModel(
    model_type='roberta',
    model_name='roberta-base',
    num_labels=2,
    args=model_det_args,
    use_cuda=False,
)
model_det.train_model(train_detection_df)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should pr

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

(175, 0.2948809115828148)

In [7]:
# Evaluate the fine-tuned model on the held-out validation split.
result_detection, model_det_outputs, wrong_det_predictions = model_det.eval_model(valid_detection_df)
print('Metric results for humor detection:')
print(result_detection)

# Prediction, the F1 score: the predicted class is the index of the largest
# raw model output for each validation example.
detection_predictions = [np.argmax(output_row) for output_row in model_det_outputs]

print('Humor detection F1 score = ')
print(round(f1_score(valid_detection_df['labels'], detection_predictions), 5))

# Calculate the result precision/sensitivity/accuracy.
# Read the confusion-matrix counts by key instead of by position in the result
# dict, so the numbers stay correct if the set or order of reported metrics
# changes.
TP_res = result_detection['tp']
TN_res = result_detection['tn']
FP_res = result_detection['fp']
FN_res = result_detection['fn']

# Guard each ratio against an empty denominator (e.g. the model never predicts
# the positive class) instead of raising ZeroDivisionError.
predicted_positive_total = TP_res + FP_res
actual_positive_total = TP_res + FN_res
evaluated_total = TP_res + TN_res + FP_res + FN_res
precision_res = TP_res / predicted_positive_total if predicted_positive_total else 0.0
recall_res = TP_res / actual_positive_total if actual_positive_total else 0.0
accuracy_res = (TP_res + TN_res) / evaluated_total if evaluated_total else 0.0
print(f'Precision for humor detection: {round(precision_res, 5)}')
print(f'Recall for humor detection: {round(recall_res, 5)}')
print(f'Accuracy for humor detection:: {round(accuracy_res, 5)}')

Metric results for humor detection:
{'mcc': 0.9146591207600471, 'tp': 34, 'tn': 33, 'fp': 2, 'fn': 1, 'auroc': 0.9910204081632652, 'auprc': 0.9911693254262379, 'eval_loss': 0.1846814176792072}
Humor detection F1 score = 
0.95775
Precision for humor detection: 0.94444
Recall for humor detection: 0.97143
Accuracy for humor detection:: 0.95714


In [8]:
# Classify every cleaned tweet; predict() returns the predicted labels and the
# raw model outputs.
tweet_texts = test['content']
test_det_predictions, raw_det_outputs = model_det.predict(tweet_texts.tolist())

# Pair each tweet with its predicted label and persist the table as CSV
# (without the index column).
predict_results = pd.DataFrame(
    {
        'text': tweet_texts,
        'is_humor': test_det_predictions,
    }
)
predict_results.to_csv('final results.csv', index=False)

In [13]:
def _label_histogram_figure(frame, label_column, title):
    """Build the figure dict for a two-trace histogram of a binary label column.

    Args:
        frame: DataFrame holding the column to plot.
        label_column: Name of the column whose values are compared with 0 and 1.
        title: Title shown above the chart.

    Returns:
        dict: A figure specification with one 'Not Humor' trace (rows where the
        column equals 0) and one 'Humor' trace (rows where it equals 1).
    """
    traces = [
        {
            'x': frame[frame[label_column] == label_value][label_column],
            'type': 'histogram',
            'name': trace_name
        }
        for label_value, trace_name in ((0, 'Not Humor'), (1, 'Humor'))
    ]
    return {
        'data': traces,
        'layout': {
            'title': title
        }
    }


# Visual summary structure: one chart for the label balance of the training
# data and one for the labels predicted on the tweets.
summary_app.layout = html.Div(children=[
    html.H1(children='Humor Data Analysis'),
    html.Div(children=[
        html.H2(children='Training Data Overview'),
        html.Div(children=[
            html.H3(children='Distribution of Humor Data'),
            dcc.Graph(
                id='humor-distribution',
                figure=_label_histogram_figure(
                    train_data_detection,
                    'labels',
                    'Distribution of Training Humor Data'
                )
            )
        ]),
        html.Div(children=[
            dcc.Graph(
                id='test-humor-distribution',
                figure=_label_histogram_figure(
                    predict_results,
                    'is_humor',
                    'Results: Distribution of Testing Humor Data'
                )
            )
        ])
    ]),
])

# Serve the dashboard inside the notebook output area.
summary_app.run_server(mode='inline')

Dash is running on http://127.0.0.1:8050/



INFO:dash.dash:Dash is running on http://127.0.0.1:8050/



<IPython.core.display.Javascript object>