In [54]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
from scipy.special import softmax
import csv
import urllib.request


In [55]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []


    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [56]:
# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)


In [57]:
def negative_score(text):
    text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    return scores[2]

In [35]:
import pandas as pd

In [36]:
train_dataset = pd.read_csv('./train.csv')
test_dataset = pd.read_csv('./test.csv')

In [37]:
train_dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [38]:
train_dataset['text'].isnull().sum()

0

In [39]:
negativity_score = train_dataset['text'].apply(negative_score)

In [44]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score
import numpy as np

In [48]:
X = np.vstack(negativity_score)
Y = np.vstack(train_dataset['target'])
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.1, stratify=Y, random_state=0)
print((X.shape, Y.shape))

((7613, 1), (7613, 1))


In [60]:
%%time

xgb_model = xgb.XGBClassifier(max_depth=4,
                        subsample=0.9,
                        objective='binary:logistic',
                        n_estimators=200,
                        learning_rate = 0.01)
eval_set = [(train_X, train_Y), (test_X, test_Y)]
xgb_model.fit(train_X, train_Y.ravel(), early_stopping_rounds=10, eval_metric=["error", "logloss"], eval_set=eval_set, verbose=True)



[0]	validation_0-error:0.35630	validation_0-logloss:0.69190	validation_1-error:0.35958	validation_1-logloss:0.69197
[1]	validation_0-error:0.35630	validation_0-logloss:0.69071	validation_1-error:0.35958	validation_1-logloss:0.69084
[2]	validation_0-error:0.35644	validation_0-logloss:0.68951	validation_1-error:0.36220	validation_1-logloss:0.68975
[3]	validation_0-error:0.35644	validation_0-logloss:0.68833	validation_1-error:0.36220	validation_1-logloss:0.68862
[4]	validation_0-error:0.35644	validation_0-logloss:0.68719	validation_1-error:0.36220	validation_1-logloss:0.68755
[5]	validation_0-error:0.35644	validation_0-logloss:0.68606	validation_1-error:0.36220	validation_1-logloss:0.68647
[6]	validation_0-error:0.35644	validation_0-logloss:0.68496	validation_1-error:0.36089	validation_1-logloss:0.68542
[7]	validation_0-error:0.35644	validation_0-logloss:0.68386	validation_1-error:0.35827	validation_1-logloss:0.68434
[8]	validation_0-error:0.35615	validation_0-logloss:0.68280	validation_1

  return f(*args, **kwargs)


[15]	validation_0-error:0.35615	validation_0-logloss:0.67593	validation_1-error:0.35827	validation_1-logloss:0.67680
[16]	validation_0-error:0.35352	validation_0-logloss:0.67503	validation_1-error:0.35958	validation_1-logloss:0.67584
[17]	validation_0-error:0.35615	validation_0-logloss:0.67412	validation_1-error:0.35827	validation_1-logloss:0.67495
[18]	validation_0-error:0.35615	validation_0-logloss:0.67325	validation_1-error:0.35827	validation_1-logloss:0.67411
[19]	validation_0-error:0.35615	validation_0-logloss:0.67239	validation_1-error:0.35827	validation_1-logloss:0.67329
[20]	validation_0-error:0.35615	validation_0-logloss:0.67153	validation_1-error:0.35827	validation_1-logloss:0.67243
[21]	validation_0-error:0.35615	validation_0-logloss:0.67071	validation_1-error:0.35827	validation_1-logloss:0.67168
[22]	validation_0-error:0.35615	validation_0-logloss:0.66990	validation_1-error:0.35827	validation_1-logloss:0.67088
[23]	validation_0-error:0.35615	validation_0-logloss:0.66907	val

[86]	validation_0-error:0.34798	validation_0-logloss:0.63831	validation_1-error:0.35039	validation_1-logloss:0.63996
[87]	validation_0-error:0.34798	validation_0-logloss:0.63804	validation_1-error:0.35039	validation_1-logloss:0.63968
[88]	validation_0-error:0.34798	validation_0-logloss:0.63776	validation_1-error:0.35039	validation_1-logloss:0.63941
[89]	validation_0-error:0.34798	validation_0-logloss:0.63749	validation_1-error:0.35039	validation_1-logloss:0.63913
[90]	validation_0-error:0.34798	validation_0-logloss:0.63722	validation_1-error:0.35039	validation_1-logloss:0.63888
[91]	validation_0-error:0.34798	validation_0-logloss:0.63696	validation_1-error:0.35039	validation_1-logloss:0.63867
[92]	validation_0-error:0.34696	validation_0-logloss:0.63670	validation_1-error:0.35039	validation_1-logloss:0.63837
[93]	validation_0-error:0.34696	validation_0-logloss:0.63641	validation_1-error:0.35039	validation_1-logloss:0.63815
[94]	validation_0-error:0.34696	validation_0-logloss:0.63614	val

[156]	validation_0-error:0.34827	validation_0-logloss:0.62559	validation_1-error:0.34908	validation_1-logloss:0.62745
[157]	validation_0-error:0.34827	validation_0-logloss:0.62548	validation_1-error:0.34908	validation_1-logloss:0.62736
[158]	validation_0-error:0.34827	validation_0-logloss:0.62536	validation_1-error:0.34908	validation_1-logloss:0.62727
[159]	validation_0-error:0.34827	validation_0-logloss:0.62526	validation_1-error:0.34908	validation_1-logloss:0.62714
[160]	validation_0-error:0.34827	validation_0-logloss:0.62515	validation_1-error:0.34908	validation_1-logloss:0.62699
[161]	validation_0-error:0.34827	validation_0-logloss:0.62506	validation_1-error:0.34908	validation_1-logloss:0.62690
[162]	validation_0-error:0.34827	validation_0-logloss:0.62496	validation_1-error:0.34908	validation_1-logloss:0.62683
[163]	validation_0-error:0.34827	validation_0-logloss:0.62486	validation_1-error:0.34908	validation_1-logloss:0.62675
[164]	validation_0-error:0.34827	validation_0-logloss:0.

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [61]:
# make predictions for test data
y_pred = xgb_model.predict(test_X)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(test_Y, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 65.09%


In [58]:
test_negativity_score = test_dataset['text'].apply(negative_score)

In [63]:
y_test_pred = xgb_model.predict(np.vstack(test_negativity_score))

In [64]:
y_test_pred

array([1, 0, 1, ..., 1, 1, 0])

In [65]:
y_test_id = test_dataset['id']

In [70]:
data = [y_test_id, pd.Series(y_test_pred)]
headers = ["id", "target"]
submission = pd.concat(data, axis=1, keys=headers)

In [72]:
submission.to_csv('submission.csv', index=False)