In [6]:
!pip install jupyter-dash
!pip install --upgrade transformers
!pip install simpletransformers
import pandas as pd
import re
import numpy as np
import sklearn
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
# Create the Dash application object for the visual summary; its layout is
# assigned and the server is started in the final cell of the notebook.
summary_app = JupyterDash(__name__)

In [8]:
# Read the training and test CSV files (comma-separated) from the working
# directory into two DataFrames.
train_data_scale_recogn, test = (
    pd.read_csv(csv_name, sep=',')
    for csv_name in ('train - short.csv', 'test - short.csv')
)

In [9]:
# Keep only the columns used for humor-rating regression. The result is
# re-assigned instead of mutating the frame in place.
train_data_scale_recogn = train_data_scale_recogn.drop(
    columns=['is_humor', 'offense_rating', 'humor_controversy']
)
print('The data format used for humor scale recognition:')
print(train_data_scale_recogn.head())

# Missing ratings are replaced by 0. The filled column is assigned back
# explicitly: calling fillna(..., inplace=True) on a column selected with
# df[col] is chained assignment, which is deprecated and leaves the frame
# unchanged once pandas Copy-on-Write is active.
train_data_scale_recogn['humor_rating'] = train_data_scale_recogn['humor_rating'].fillna(0)
assert train_data_scale_recogn['humor_rating'].notna().all(), 'humor_rating still contains missing values'

# Rename the remaining columns positionally; the target column must be called
# 'labels' from here on (later cells read train_data_scale_recogn['labels']).
generalized_data_columns = ['id', 'text', 'labels']
train_data_scale_recogn.columns = generalized_data_columns

The data format used for humor scale recognition:
   id                                               text  humor_rating
0   1  TENNESSEE: We're the best state. Nobody even c...          2.42
1   2  A man inserted an advertisement in the classif...          2.50
2   3  How many men does it take to open a can of bee...          1.95
3   4  Told my mom I hit 1200 Twitter followers. She ...          2.11
4   5  Roses are dead. Love is fake. Weddings are bas...          2.78


In [10]:
def preprocess_tweet_text(text_data):
    """Normalize a tweet-like string into plain ASCII words.

    Steps, in order: strip URLs, strip @mentions and #hashtags, drop every
    non-ASCII character (emojis included), drop punctuation, and collapse
    runs of whitespace into single spaces.

    Args:
        text_data: The raw text. ``None`` and float NaN (missing cells) are
            treated as empty text; any other non-string value is converted
            with ``str()`` first.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    if not isinstance(text_data, str):
        # Missing cells arrive as None / NaN; return empty text instead of
        # letting re.sub raise a TypeError.
        if text_data is None or (isinstance(text_data, float) and text_data != text_data):
            return ""
        text_data = str(text_data)

    # Remove URLs. The word boundary keeps the pattern from firing inside
    # ordinary words (e.g. "awwww" contains "www"); "http\S+" already covers
    # "https" links.
    text_data = re.sub(r"\b(?:http\S+|www\S+)", "", text_data)

    # Remove mentions and hashtags
    text_data = re.sub(r"@\w+|#\w+", "", text_data)

    # Remove emojis and any other non-ASCII characters, then punctuation
    text_data = text_data.encode("ascii", "ignore").decode("utf-8")
    text_data = re.sub(r"[^\w\s]", "", text_data)

    # Collapse whitespace runs and trim both ends
    text_data = re.sub(r"\s+", " ", text_data).strip()

    return text_data

# Clean the 'text' column of both the training frame and the test frame with
# the same preprocessing function.
for frame in (train_data_scale_recogn, test):
    frame['text'] = frame['text'].apply(preprocess_tweet_text)

In [11]:
# Settings for the train/validation split and for fine-tuning.
test_size_sc_rec_value = 0.01
rnd_state_sc_rec_value = 10
lrn_rate_val = 1e-5
n_tr_epochs_val = 1
man_speed_val = 17  # handed to ClassificationArgs as manual_seed

# Hold out a validation slice of the prepared training frame.
train_sc_recogn_df, valid_sc_recogn_df = train_test_split(
    train_data_scale_recogn,
    test_size=test_size_sc_rec_value,
    random_state=rnd_state_sc_rec_value,
)

# Regression set-up: a single output (num_labels=1) with regression=True.
model_sc_rec_args = ClassificationArgs(
    regression=True,
    num_train_epochs=n_tr_epochs_val,
    learning_rate=lrn_rate_val,
    manual_seed=man_speed_val,
    overwrite_output_dir=True,
    silent=True,
)

model_sc_rec = ClassificationModel(
    model_type='roberta',
    model_name='roberta-large',
    num_labels=1,
    use_cuda=False,
    args=model_sc_rec_args,
)
model_sc_rec.train_model(train_sc_recogn_df)

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should 

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

(14, 3.1283397738422667)

In [12]:
# Evaluate the trained model on the held-out validation frame and report the
# metrics dictionary returned by eval_model.
(
    result_sc_rec,
    model_sc_rec_outputs,
    wrong_sc_rec_predictions,
) = model_sc_rec.eval_model(valid_sc_recogn_df)
print('Metric results for humor scale recognition:', result_sc_rec, sep='\n')

Metric results for humor scale recognition:
{'eval_loss': 2.4195499420166016}


In [13]:
# Run the model on the cleaned test texts.
test_texts = test['text'].tolist()
test_sc_rec_predictions, raw_sc_rec_outputs = model_sc_rec.predict(test_texts)

# Pair every test id with its predicted value and write the table to disk
# without the DataFrame index.
id_res_values, humor_rate_res_values = test['id'], test_sc_rec_predictions
predict_results = pd.DataFrame(
    {
        'id': id_res_values,
        'humor_rate': humor_rate_res_values,
    }
)
predict_results.to_csv('final results.csv', index=False)

In [15]:
# Visual summary structure
def _histogram_section(section_heading, chart_heading, graph_id, values):
    """Build one overview section: an H2 heading above a titled histogram.

    Args:
        section_heading: Text of the section's H2 heading.
        chart_heading: Text of the H3 heading shown directly above the graph.
        graph_id: Component id given to the dcc.Graph.
        values: Sequence of values plotted on the histogram's x axis.

    Returns:
        An html.Div holding the H2 heading and a nested html.Div with the H3
        heading and the graph.
    """
    histogram = dcc.Graph(
        id=graph_id,
        figure={
            'data': [
                {'x': values, 'type': 'histogram', 'name': 'Humor Ratings'},
            ],
            'layout': {
                'title': 'Summary of Humor Ratings',
                'xaxis': {'title': 'Humor Rating'},
                'yaxis': {'title': 'Count'},
            },
        },
    )
    chart_panel = html.Div(children=[html.H3(children=chart_heading), histogram])
    return html.Div(children=[html.H2(children=section_heading), chart_panel])


summary_app.layout = html.Div(children=[
    html.H1(children='Humor Data Analysis'),
    _histogram_section(
        'Training Data Overview',
        'Summary of Humor Ratings',
        'humor-ratings-summary',
        train_data_scale_recogn['labels'],
    ),
    _histogram_section(
        'Testing Data Overview',
        'Summary of Result Humor Ratings',
        'test-humor-ratings-summary',
        test_sc_rec_predictions,
    ),
])

summary_app.run_server(mode='inline')

Dash is running on http://127.0.0.1:8050/



INFO:dash.dash:Dash is running on http://127.0.0.1:8050/



<IPython.core.display.Javascript object>