In [91]:
import csv
import urllib.request

import numpy as np
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification


In [92]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces. A token that starts with ``@`` and
    has more than one character becomes ``@user``; a token that starts with
    ``http`` becomes ``http``. All other tokens are kept unchanged, and the
    tokens are joined back with single spaces.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    def _mask(token):
        # A bare "@" is not treated as a mention.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [110]:
# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='sentiment'
emotion_task='emotion'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
MODEL_EMOTION = f"cardiffnlp/twitter-roberta-base-{emotion_task}"

# Only the sentiment checkpoint's tokenizer is loaded.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the label mapping for `task`: one tab-separated "<index>\t<label>" row per line.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
csvreader = csv.reader(html, delimiter='\t')
# Rows with fewer than two fields (e.g. a trailing empty line) are skipped.
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)
# Save the tokenizer files next to the weights so the local MODEL directory
# created above is complete if a later from_pretrained(MODEL) resolves to it.
tokenizer.save_pretrained(MODEL)

model_emotion = AutoModelForSequenceClassification.from_pretrained(MODEL_EMOTION)
# Each model is saved under its own identifier; saving the emotion model to
# MODEL would overwrite the sentiment weights written above.
model_emotion.save_pretrained(MODEL_EMOTION)


Downloading:   0%|          | 0.00/779 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [111]:
def negative_score(text):
    """Return the sentiment and emotion class probabilities for one text.

    The text is tokenized once with the module-level ``tokenizer`` and the
    same encoding is fed to both ``model`` and ``model_emotion``. The logits
    of the single input are converted to probabilities with softmax.

    ``preprocess`` is not applied here; the text is scored as given.

    Args:
        text: The string to score.

    Returns:
        A 1-D NumPy array holding the softmaxed scores of ``model`` followed
        by the softmaxed scores of ``model_emotion``.
    """
    encoded_input = tokenizer(text, return_tensors='pt')
    # Inference only: no gradients are needed, so skip autograd tracking.
    with torch.no_grad():
        output = model(**encoded_input)
        output_emotions = model_emotion(**encoded_input)
    # output[0] holds the logits; [0] selects the only sequence in the batch.
    scores = softmax(output[0][0].detach().numpy())
    scores_emotions = softmax(output_emotions[0][0].detach().numpy())
    return np.concatenate((scores, scores_emotions))

In [112]:
import pandas as pd

In [113]:
# Read the training and test CSV files from the working directory.
train_dataset, test_dataset = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

In [114]:
# Preview the first rows of the training frame.
train_dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [115]:
# Count missing values in the 'text' column before it is passed to negative_score.
train_dataset['text'].isnull().sum()

0

In [116]:
# Score every training text; each element of the resulting Series is the
# array returned by negative_score (both models run once per row).
negativity_score = train_dataset['text'].apply(negative_score)

In [118]:
# Display the per-row score arrays.
negativity_score

0       [0.04231138, 0.5765478, 0.3811407, 0.11996135,...
1       [0.3379307, 0.635805, 0.026264267, 0.28351602,...
2       [0.13055776, 0.82776093, 0.04168123, 0.5969902...
3       [0.27099624, 0.6886033, 0.040400457, 0.2607433...
4       [0.55991334, 0.41471392, 0.025372643, 0.153450...
                              ...                        
7608    [0.3346446, 0.6204224, 0.04493301, 0.12285043,...
7609    [0.8883374, 0.10528578, 0.0063767442, 0.393966...
7610    [0.090089135, 0.8770295, 0.032881506, 0.294406...
7611    [0.596613, 0.36611944, 0.037267584, 0.13641186...
7612    [0.40408504, 0.57197434, 0.023940641, 0.184390...
Name: text, Length: 7613, dtype: object

In [119]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score
import numpy as np

In [120]:
# Feature matrix: one row per text, stacked from the per-row score arrays.
X = np.vstack(negativity_score.tolist())
# Labels as a single-column 2-D array.
Y = train_dataset['target'].to_numpy().reshape(-1, 1)
print((X.shape, Y.shape))

# Hold out 10% of the rows, stratified on the label, with a fixed seed.
train_X, test_X, train_Y, test_Y = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=0,
)

((7613, 7), (7613, 1))


In [121]:
%%time

# Gradient-boosted binary classifier trained on the transformer score features.
xgb_model = xgb.XGBClassifier(max_depth=4,
                        subsample=0.9,
                        objective='binary:logistic',
                        n_estimators=200,
                        learning_rate = 0.01)
# Labels are flattened to 1-D here as well, so the evaluation sets receive
# the same label shape that fit() is given below.
# NOTE(review): the held-out split is also the early-stopping set, so the
# accuracy measured on it afterwards may be optimistic.
eval_set = [(train_X, train_Y.ravel()), (test_X, test_Y.ravel())]
xgb_model.fit(train_X, train_Y.ravel(), early_stopping_rounds=10, eval_metric=["error", "logloss"], eval_set=eval_set, verbose=True)



[0]	validation_0-error:0.28594	validation_0-logloss:0.69090	validation_1-error:0.27165	validation_1-logloss:0.69081
[1]	validation_0-error:0.28142	validation_0-logloss:0.68873	validation_1-error:0.27559	validation_1-logloss:0.68853
[2]	validation_0-error:0.28113	validation_0-logloss:0.68658	validation_1-error:0.27559	validation_1-logloss:0.68624
[3]	validation_0-error:0.27923	validation_0-logloss:0.68450	validation_1-error:0.26772	validation_1-logloss:0.68405
[4]	validation_0-error:0.27952	validation_0-logloss:0.68244	validation_1-error:0.26509	validation_1-logloss:0.68190
[5]	validation_0-error:0.27894	validation_0-logloss:0.68036	validation_1-error:0.26378	validation_1-logloss:0.67975
[6]	validation_0-error:0.28186	validation_0-logloss:0.67834	validation_1-error:0.26116	validation_1-logloss:0.67762
[7]	validation_0-error:0.28054	validation_0-logloss:0.67637	validation_1-error:0.26116	validation_1-logloss:0.67560
[8]	validation_0-error:0.28054	validation_0-logloss:0.67445	validation_1

  return f(*args, **kwargs)


[13]	validation_0-error:0.27894	validation_0-logloss:0.66517	validation_1-error:0.26247	validation_1-logloss:0.66385
[14]	validation_0-error:0.27981	validation_0-logloss:0.66341	validation_1-error:0.26247	validation_1-logloss:0.66199
[15]	validation_0-error:0.28040	validation_0-logloss:0.66166	validation_1-error:0.26116	validation_1-logloss:0.66019
[16]	validation_0-error:0.27894	validation_0-logloss:0.66002	validation_1-error:0.25984	validation_1-logloss:0.65850
[17]	validation_0-error:0.27864	validation_0-logloss:0.65831	validation_1-error:0.26116	validation_1-logloss:0.65675
[18]	validation_0-error:0.27981	validation_0-logloss:0.65673	validation_1-error:0.25984	validation_1-logloss:0.65510
[19]	validation_0-error:0.27981	validation_0-logloss:0.65517	validation_1-error:0.26116	validation_1-logloss:0.65346
[20]	validation_0-error:0.27923	validation_0-logloss:0.65359	validation_1-error:0.26247	validation_1-logloss:0.65178
[21]	validation_0-error:0.27981	validation_0-logloss:0.65204	val

[84]	validation_0-error:0.26828	validation_0-logloss:0.58808	validation_1-error:0.25722	validation_1-logloss:0.58239
[85]	validation_0-error:0.26799	validation_0-logloss:0.58735	validation_1-error:0.25591	validation_1-logloss:0.58161
[86]	validation_0-error:0.26784	validation_0-logloss:0.58668	validation_1-error:0.25459	validation_1-logloss:0.58098
[87]	validation_0-error:0.26799	validation_0-logloss:0.58609	validation_1-error:0.25591	validation_1-logloss:0.58037
[88]	validation_0-error:0.26814	validation_0-logloss:0.58544	validation_1-error:0.25722	validation_1-logloss:0.57962
[89]	validation_0-error:0.26784	validation_0-logloss:0.58474	validation_1-error:0.25722	validation_1-logloss:0.57890
[90]	validation_0-error:0.26697	validation_0-logloss:0.58411	validation_1-error:0.25722	validation_1-logloss:0.57832
[91]	validation_0-error:0.26697	validation_0-logloss:0.58355	validation_1-error:0.25722	validation_1-logloss:0.57771
[92]	validation_0-error:0.26726	validation_0-logloss:0.58289	val

[154]	validation_0-error:0.26201	validation_0-logloss:0.55545	validation_1-error:0.25459	validation_1-logloss:0.54959
[155]	validation_0-error:0.26215	validation_0-logloss:0.55514	validation_1-error:0.25459	validation_1-logloss:0.54931
[156]	validation_0-error:0.26171	validation_0-logloss:0.55478	validation_1-error:0.25459	validation_1-logloss:0.54907
[157]	validation_0-error:0.26142	validation_0-logloss:0.55448	validation_1-error:0.25591	validation_1-logloss:0.54878
[158]	validation_0-error:0.26025	validation_0-logloss:0.55413	validation_1-error:0.25591	validation_1-logloss:0.54858
[159]	validation_0-error:0.26055	validation_0-logloss:0.55381	validation_1-error:0.25459	validation_1-logloss:0.54829
[160]	validation_0-error:0.26084	validation_0-logloss:0.55346	validation_1-error:0.25328	validation_1-logloss:0.54792
[161]	validation_0-error:0.26069	validation_0-logloss:0.55322	validation_1-error:0.25591	validation_1-logloss:0.54773
[162]	validation_0-error:0.26128	validation_0-logloss:0.

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [122]:
# Predict on the held-out split and round each prediction to an integer label.
y_pred = xgb_model.predict(test_X)
predictions = list(map(round, y_pred))

# Report the share of held-out rows whose prediction matches the label.
accuracy = accuracy_score(test_Y, predictions)
accuracy_percent = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_percent)

Accuracy: 74.54%


In [123]:
# Score every test text with the same function used for the training features.
test_negativity_score = test_dataset['text'].apply(negative_score)

In [124]:
# Stack the per-row score arrays into a 2-D matrix and predict a label per row.
y_test_pred = xgb_model.predict(np.vstack(test_negativity_score))

In [125]:
# Display the predicted labels for the test set.
y_test_pred

array([0, 1, 1, ..., 1, 1, 1])

In [126]:
# Keep the test-set ids so each prediction can be paired with its row id.
y_test_id = test_dataset['id']

In [127]:
# Column names of the submission frame, in output order.
headers = ["id", "target"]
# The ids and the predictions (wrapped in a Series) become the two columns.
data = [y_test_id, pd.Series(y_test_pred)]
submission = pd.concat(objs=data, axis="columns", keys=headers)

In [128]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

#### 