In [54]:
import csv
import urllib.request

import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification


In [55]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links with fixed placeholder tokens.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; otherwise a token that starts
    with 'http' becomes 'http'. All other tokens are kept unchanged, and the
    tokens are joined back with single spaces.
    """
    def _mask(token):
        # A lone '@' is not treated as a mention.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [56]:
# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the label mapping for the selected task. The file is parsed as
# tab-separated rows and the second field of each row is kept as the label.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
# Rows with fewer than two fields (such as blank lines) are skipped.
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained writes into a local directory whose path equals MODEL.
# The tokenizer is saved alongside the weights so that a later run which
# resolves MODEL to that local directory still finds the tokenizer files.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)


In [73]:
def negative_score(text):
    """Return the softmax class probabilities for a single text.

    The text is normalised with `preprocess`, tokenized with the module-level
    `tokenizer`, and scored by the module-level `model`. The logits of the
    single input are converted to probabilities with softmax.

    Note: despite the name, the whole probability vector is returned (one
    entry per model output class), not only a negative-class score.

    Args:
        text: The raw text to score.

    Returns:
        A 1-D numpy array of class probabilities.
    """
    text = preprocess(text)
    # Truncate over-long inputs instead of letting the model raise on them.
    # NOTE(review): 512 is the usual limit for RoBERTa-base checkpoints;
    # confirm if MODEL is changed.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    return softmax(scores)

In [74]:
import pandas as pd

In [75]:
# Read the train and test CSV files from the working directory.
train_dataset, test_dataset = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

In [76]:
# Preview the first rows of the training frame.
train_dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [38]:
# Count missing values in the 'text' column before scoring it.
train_dataset['text'].isnull().sum()

0

In [79]:
# Score every training text; each element of the resulting Series is the
# array returned by negative_score for that row.
negativity_score = train_dataset['text'].apply(negative_score)

In [80]:
# Display the per-row score arrays.
negativity_score

0          [0.04231138, 0.5765478, 0.3811407]
1          [0.3379307, 0.635805, 0.026264267]
2        [0.13055776, 0.82776093, 0.04168123]
3        [0.27099624, 0.6886033, 0.040400457]
4       [0.55991334, 0.41471392, 0.025372643]
                        ...                  
7608     [0.38264447, 0.57848406, 0.03887146]
7609    [0.91048276, 0.08377866, 0.005738619]
7610     [0.07550905, 0.8929527, 0.031538185]
7611      [0.596613, 0.36611944, 0.037267584]
7612     [0.43497035, 0.5436898, 0.021339823]
Name: text, Length: 7613, dtype: object

In [81]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score
import numpy as np

In [82]:
# Features: stack the per-row score arrays into one 2-D matrix.
X = np.vstack(negativity_score)
# Labels: the target column as an (n, 1) column vector.
Y = train_dataset['target'].to_numpy().reshape(-1, 1)

# Hold out 10% of the rows, stratified on the label, with a fixed seed.
train_X, test_X, train_Y, test_Y = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=0,
)
print((X.shape, Y.shape))

((7613, 3), (7613, 1))


In [83]:
%%time

xgb_model = xgb.XGBClassifier(max_depth=4,
                        subsample=0.9,
                        objective='binary:logistic',
                        n_estimators=200,
                        learning_rate = 0.01)
eval_set = [(train_X, train_Y), (test_X, test_Y)]
xgb_model.fit(train_X, train_Y.ravel(), early_stopping_rounds=10, eval_metric=["error", "logloss"], eval_set=eval_set, verbose=True)



[0]	validation_0-error:0.33090	validation_0-logloss:0.69150	validation_1-error:0.33333	validation_1-logloss:0.69159
[1]	validation_0-error:0.31134	validation_0-logloss:0.68979	validation_1-error:0.31365	validation_1-logloss:0.68995
[2]	validation_0-error:0.31047	validation_0-logloss:0.68820	validation_1-error:0.31365	validation_1-logloss:0.68842
[3]	validation_0-error:0.30258	validation_0-logloss:0.68659	validation_1-error:0.31759	validation_1-logloss:0.68699
[4]	validation_0-error:0.32156	validation_0-logloss:0.68505	validation_1-error:0.32021	validation_1-logloss:0.68552
[5]	validation_0-error:0.31951	validation_0-logloss:0.68346	validation_1-error:0.33202	validation_1-logloss:0.68414
[6]	validation_0-error:0.31981	validation_0-logloss:0.68192	validation_1-error:0.33202	validation_1-logloss:0.68282
[7]	validation_0-error:0.30171	validation_0-logloss:0.68039	validation_1-error:0.31627	validation_1-logloss:0.68146
[8]	validation_0-error:0.31557	validation_0-logloss:0.67895	validation_1

  return f(*args, **kwargs)


[17]	validation_0-error:0.30185	validation_0-logloss:0.66647	validation_1-error:0.31234	validation_1-logloss:0.66831
[18]	validation_0-error:0.30579	validation_0-logloss:0.66510	validation_1-error:0.31102	validation_1-logloss:0.66707
[19]	validation_0-error:0.30185	validation_0-logloss:0.66396	validation_1-error:0.31234	validation_1-logloss:0.66597
[20]	validation_0-error:0.30273	validation_0-logloss:0.66280	validation_1-error:0.30446	validation_1-logloss:0.66479
[21]	validation_0-error:0.30258	validation_0-logloss:0.66153	validation_1-error:0.30052	validation_1-logloss:0.66354
[22]	validation_0-error:0.30054	validation_0-logloss:0.66034	validation_1-error:0.29921	validation_1-logloss:0.66240
[23]	validation_0-error:0.30244	validation_0-logloss:0.65905	validation_1-error:0.29790	validation_1-logloss:0.66120
[24]	validation_0-error:0.30039	validation_0-logloss:0.65788	validation_1-error:0.29396	validation_1-logloss:0.66017
[25]	validation_0-error:0.29850	validation_0-logloss:0.65673	val

[88]	validation_0-error:0.29310	validation_0-logloss:0.60854	validation_1-error:0.29528	validation_1-logloss:0.61410
[89]	validation_0-error:0.29368	validation_0-logloss:0.60808	validation_1-error:0.29528	validation_1-logloss:0.61367
[90]	validation_0-error:0.29310	validation_0-logloss:0.60760	validation_1-error:0.29134	validation_1-logloss:0.61321
[91]	validation_0-error:0.29295	validation_0-logloss:0.60710	validation_1-error:0.29396	validation_1-logloss:0.61274
[92]	validation_0-error:0.29295	validation_0-logloss:0.60670	validation_1-error:0.29134	validation_1-logloss:0.61239
[93]	validation_0-error:0.29295	validation_0-logloss:0.60629	validation_1-error:0.29003	validation_1-logloss:0.61201
[94]	validation_0-error:0.29295	validation_0-logloss:0.60577	validation_1-error:0.29134	validation_1-logloss:0.61156
[95]	validation_0-error:0.29295	validation_0-logloss:0.60536	validation_1-error:0.29134	validation_1-logloss:0.61122
[96]	validation_0-error:0.29280	validation_0-logloss:0.60497	val

[158]	validation_0-error:0.28989	validation_0-logloss:0.58414	validation_1-error:0.29396	validation_1-logloss:0.59176
[159]	validation_0-error:0.28989	validation_0-logloss:0.58393	validation_1-error:0.29396	validation_1-logloss:0.59159
[160]	validation_0-error:0.28974	validation_0-logloss:0.58370	validation_1-error:0.29396	validation_1-logloss:0.59135
[161]	validation_0-error:0.28945	validation_0-logloss:0.58345	validation_1-error:0.29396	validation_1-logloss:0.59115
[162]	validation_0-error:0.28959	validation_0-logloss:0.58325	validation_1-error:0.29396	validation_1-logloss:0.59094
[163]	validation_0-error:0.28930	validation_0-logloss:0.58300	validation_1-error:0.29396	validation_1-logloss:0.59071
[164]	validation_0-error:0.28915	validation_0-logloss:0.58274	validation_1-error:0.29396	validation_1-logloss:0.59046
[165]	validation_0-error:0.28901	validation_0-logloss:0.58250	validation_1-error:0.29396	validation_1-logloss:0.59034
[166]	validation_0-error:0.28915	validation_0-logloss:0.

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [84]:
# Predict on the held-out split produced by train_test_split.
# NOTE(review): this same split was passed to fit() in eval_set for early
# stopping, so the accuracy below is not from fully unseen data.
y_pred = xgb_model.predict(test_X)
predictions = list(map(round, y_pred))

# Share of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(test_Y, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 70.47%


In [85]:
# Score every text in the test frame with the same function used for training.
test_negativity_score = test_dataset['text'].apply(negative_score)

In [86]:
# Stack the per-row score arrays into a 2-D matrix and predict with the
# fitted classifier.
y_test_pred = xgb_model.predict(np.vstack(test_negativity_score))

In [87]:
# Display the predicted labels for the test frame.
y_test_pred

array([0, 0, 0, ..., 1, 1, 0])

In [88]:
# Keep the test 'id' column to pair with the predictions.
y_test_id = test_dataset['id']

In [89]:
# Assemble the submission frame: the ids next to the predicted targets,
# as columns named "id" and "target".
headers = ["id", "target"]
data = [y_test_id, pd.Series(y_test_pred)]
submission = pd.concat(data, axis="columns", keys=headers)

In [90]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv('submission.csv', index=False)