In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [157]:
# Install pycodestyle, then nbpep8 from the TestPyPI index.
# NOTE(review): both installs are unpinned and the second one pulls from
# test.pypi.org rather than the main index — confirm this is intended.
!pip install pycodestyle
!pip install --index-url https://test.pypi.org/simple/ nbpep8

In [158]:
from nbpep8.nbpep8 import pep8

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import re
import nltk
import os
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import callbacks, models, layers
import matplotlib.pyplot as plt

# tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import confusion_matrix,f1_score,classification_report

In [5]:
# Load the competition CSV files from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [161]:
# Column summary of the training table.
train.info()

In [162]:
# Column summary of the test table.
test.info()

In [163]:
# First rows of the training table.
train.head()

In [164]:
# First rows of the test table.
test.head()

In [165]:
# Number of training rows for each value of the `target` label.
sentiment_counts = train.groupby(['target']).size()
print(sentiment_counts)

In [166]:
# Class balance of the training labels.
# target == 1 is a disaster tweet and target == 0 a non-disaster tweet (this
# is also how the tweet-length histograms in this notebook label them), so
# tick 0 must read "Non-Disaster" and tick 1 "Disaster".
sns.countplot(x='target', data=train, palette='Set3')
plt.xticks(ticks=[0, 1], labels=['Non-Disaster', 'Disaster'])
plt.ylabel("Count")
plt.xlabel("Target")
plt.title("Distribution of target label")
plt.show()

In [167]:
from wordcloud import WordCloud
from wordcloud import STOPWORDS
 
# Words to ignore in both clouds: URL fragments / retweet marker plus the
# wordcloud package's default stop words.
stop_words = ["https", "co", "RT"] + list(STOPWORDS)

# Word cloud of the disaster tweets (target == 1).
positive_tweets = train['text'][train["target"] == 1]
# Join the actual tweet texts: str(Series) would only yield the truncated
# repr of the Series (index values, "...", dtype footer), not the corpus.
positive_wordcloud = WordCloud(
    max_font_size=50, max_words=50,
    background_color="white", stopwords=stop_words,
).generate(" ".join(positive_tweets.astype(str)))
plt.figure()
plt.title("Disaster Tweets - Wordcloud")
plt.imshow(positive_wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# Word cloud of the non-disaster tweets (target == 0).
negative_tweets = train['text'][train["target"] == 0]
# NOTE: the variable name `Disaster_wordcloud` is kept from the original cell
# for compatibility, although it holds the non-disaster cloud.
Disaster_wordcloud = WordCloud(
    max_font_size=50, max_words=50,
    background_color="white", stopwords=stop_words,
).generate(" ".join(negative_tweets.astype(str)))
plt.figure()
plt.title("Non_Disaster Tweets - Wordcloud")
plt.imshow(Disaster_wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [168]:
def missing_values(df):
    """Summarise the columns of ``df`` that contain missing values.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with the columns
        'Nb of missing values' and '% of missing values' (rounded to two
        decimals), restricted to columns whose missing percentage is not 0
        and sorted by that percentage in descending order.
    """
    null_counts = df.isnull().sum()
    null_share = 100 * df.isnull().sum() / len(df)
    summary = pd.concat(
        [null_counts, null_share],
        axis=1,
        keys=['Nb of missing values', '% of missing values'],
    )

    # Keep only the columns that actually have gaps, worst first.
    has_missing = summary.iloc[:, 1] != 0
    ranked = summary[has_missing].sort_values(
        '% of missing values', ascending=False)

    return ranked.round(2)

# Missing-value summary for the training table.
missing_values(train)

In [169]:
# Missing-value summary for the test table.
missing_values(test)

In [170]:
import plotly.express as px
def feature_viz(df,feature):
    '''Visualise the distribution of one column of ``df``.

    Input-  df      = pandas dataframe
            feature = name of the column to be charted
    Output- for feature == 'target': a seaborn count plot plus the printed
            percentage of rows with target 0 and target 1;
            for any other column: the printed top value counts and a plotly
            scatter chart of value count (x, marker size) per distinct
            value (y).
    '''
    if feature == 'target':
        # The column must be passed as the keyword `x`: passing it
        # positionally together with `data=` collides with seaborn's `data`
        # parameter in current releases.
        sns.countplot(x=feature, data=df)
        counts = df[feature].value_counts()
        total = len(df[feature])
        print('Target of 0 is {} % of total'.format(round(counts[0]/total*100)))
        print('Target of 1 is {} % of total'.format(round(counts[1]/total*100)))
    else:
        # Occurrences of each distinct value, most frequent first.
        feat = df[feature].value_counts()
        print(feat.head())
        fig = px.scatter(feat, x=feat.values, y=feat.index, size=feat.values)
        fig.show()

In [171]:
# Chart how often each keyword occurs in the training table.
feature_viz(train, 'keyword')

In [22]:
# Number of space-separated tokens in each tweet.
train['text_length'] = train['text'].apply(lambda x: len(x.split(' ')))

# Tweet-length distribution per class, side by side on a shared x axis.
# The legend labels previously read "Disater"; they are spelled out correctly here.
f, (ax1, ax2) = plt.subplots(1, 2, sharex=True, figsize=(10, 6))
sns.histplot(train[train['target'] == 1]['text_length'], ax=ax1, kde=False,
             color='red', label='Disaster Tweets')
sns.histplot(train[train['target'] == 0]['text_length'], ax=ax2, kde=False,
             color='green', label='Non-Disaster Tweets')
f.suptitle('Tweet length distribution')
f.legend(loc='upper right')
ax1.grid()
ax2.grid()
plt.show()

In [21]:
# Same space-separated token count as the `text_length` column, stored under
# `length`; the cell then displays the largest count in the training table.
train['length'] = train['text'].apply(lambda x : len(x.split(' ')))
train['length'].max()

In [173]:
from nltk.corpus import stopwords
# NLTK's English stop-word list. This rebinds `stop_words`, which the
# word-cloud cell earlier bound to a different list.
stop_words = stopwords.words("english")
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer

In [174]:
stemmer = SnowballStemmer('english')


def clean_text(each_text):
    """Normalise one piece of text for bag-of-words style processing.

    Steps: strip HTML markup, remove digits, tokenize, drop every
    non-alphanumeric character, lowercase, remove stop words (using the
    notebook-level ``stop_words`` list) and stem each remaining word with the
    notebook-level ``stemmer``.

    Args:
        each_text: raw text (str).

    Returns:
        The cleaned words joined by single spaces (str).
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parser happens to be installed.
    plain_text = BeautifulSoup(each_text, "html.parser").get_text()

    # remove numbers from text
    text_no_num = re.sub(r'\d+', '', plain_text)

    # tokenize each text
    word_tokens = word_tokenize(text_no_num)

    # Strip special characters and lowercase BEFORE the stop-word test: the
    # stop-word list is lowercase, so testing the original casing let
    # capitalised stop words ("The", "And", ...) slip through.
    normalized_words = [
        "".join(ch for ch in word if ch.isalnum()).lower()
        for word in word_tokens
    ]

    # remove stop words (set membership instead of a list scan per word)
    stop_set = set(stop_words)
    kept_words = [w for w in normalized_words if w not in stop_set]

    # do stemming
    stemmed_text = [stemmer.stem(w) for w in kept_words]

    # split()/join collapses the gaps left by tokens that became empty.
    return " ".join(" ".join(stemmed_text).split())


In [175]:
# Add a `cleaned_text` column holding clean_text() applied to every tweet,
# then display the table.
train["cleaned_text"] = train["text"].apply(clean_text)
train

In [176]:
# Replace missing keywords with the placeholder "none", then run the same
# cleaning function over the keywords.
train["keyword_1"] = train["keyword"].fillna("none")
train["cleaned_keyword"] = train["keyword_1"].apply(clean_text)
train["cleaned_keyword"]

In [177]:
# Combine the 'cleaned_keyword' and 'cleaned_text' columns into one string
# column, separated by a single space.
train['keyword_text'] = train['cleaned_keyword'] + " " + train["cleaned_text"]
train['keyword_text']

In [6]:
!pip install transformers
!pip install pytorch-transformers
# Library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers import *
import time
# tf.compat.v1.disable_eager_execution()
# NOTE(review): `path` is only referenced from a commented-out line in this
# notebook — confirm it is still needed.
path = "/content/"
# Show which TensorFlow version the kernel is running.
print(tf.__version__)

In [8]:
# Raw (uncleaned) tweet texts and their labels as a NumPy array.
tweets_raw = train['text']
y = train['target'].values

In [9]:
# Stratified 80/20 split of the raw tweets and labels with a fixed seed.
tweet_raw_train, tweet_raw_test, y_train, y_test = train_test_split(
    tweets_raw.values, y,
    test_size=0.2,
    stratify=y,
    random_state=7
)

In [10]:
# Aliases for the split labels; these names are the ones passed to fit().
train_label = y_train
test_label  = y_test

In [11]:
# Preparing the sentences
def data_prep_fct(bert_tokenizer, sentences, max_length) :
    """Encode sentences into fixed-length arrays for a transformer model.

    Every sentence is passed to ``bert_tokenizer.encode_plus`` with special
    tokens added, padding to ``max_length`` and truncation enabled.

    Args:
        bert_tokenizer: object exposing ``encode_plus`` and returning a
            mapping with 'input_ids', 'attention_mask' and 'token_type_ids'.
        sentences: iterable of texts to encode.
        max_length: target length of every encoded sequence.

    Returns:
        Tuple ``(input_ids, attention_masks, token_type_ids, segment_ids)``
        of NumPy arrays with one row per sentence; ``segment_ids`` is all
        zeros with ``max_length`` entries per row.
    """
    encodings = [
        bert_tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        for text in sentences
    ]

    input_ids = np.asarray([enc['input_ids'] for enc in encodings])
    attention_masks = np.array([enc['attention_mask'] for enc in encodings])
    token_type_ids = np.array([enc['token_type_ids'] for enc in encodings])
    # One all-zero segment vector per encoded sentence.
    segment_ids = np.array([[0] * max_length for _ in encodings])

    return input_ids, attention_masks, token_type_ids, segment_ids

In [12]:
def train_test_prep_fct(bert_tokenizer) :
    """Encode the notebook's train and test tweets with ``bert_tokenizer``.

    Reads the notebook-level ``tweet_raw_train``, ``tweet_raw_test`` and
    ``max_length`` and prints how long each encoding pass took.

    Args:
        bert_tokenizer: tokenizer forwarded to ``data_prep_fct``.

    Returns:
        The four arrays returned by ``data_prep_fct`` for the train tweets
        followed by the four arrays for the test tweets (8-tuple).
    """
    print("Train tweets preparation ...")
    train_sentences = tweet_raw_train
    t0 = time.time()
    train_inp, train_mask, train_token, train_seg = data_prep_fct(
        bert_tokenizer, train_sentences, max_length=max_length)
    print("duration: ", time.time()-t0)
    print()

    print("Test tweets preparation ...")
    test_sentences = tweet_raw_test
    t0 = time.time()
    test_inp, test_mask, test_token, test_seg = data_prep_fct(
        bert_tokenizer, test_sentences, max_length=max_length)
    print("duration: ", time.time()-t0)

    train_arrays = (train_inp, train_mask, train_token, train_seg)
    test_arrays = (test_inp, test_mask, test_token, test_seg)
    return train_arrays + test_arrays

In [184]:
# Run configuration for the BERT experiment.
model_name = 'bert'
model_type = 'hkayesh/twitter-disaster-nlp'
# alternative checkpoint: 'finiteautomata/bertweet-base-sentiment-analysis'
max_length = 64

bert_tokenizer = AutoTokenizer.from_pretrained(model_type)

# Checkpoint file name: <model>_<initials of the first three '-'-separated
# parts of the checkpoint id>_ML<max_length>_T<number of tweets>.h5
id_parts = model_type.split('-')
mt = (id_parts[0][0] + id_parts[1][0] + id_parts[2][0]).upper()
ml = f'ML{max_length}'
tw = f'T{len(tweets_raw)}'
model_file_name = '_'.join([model_name, mt, ml, tw])
model_save_path = f'{model_file_name}.h5'
print(model_save_path)

In [185]:
# Encode the train and test tweets with the BERT tokenizer.
train_inp, train_mask, train_token, train_seg, test_inp, test_mask, test_token, test_seg = \
                                                                train_test_prep_fct(bert_tokenizer)

In [186]:
# Sanity check on the shape of the encoded training inputs.
train_inp.shape

In [187]:
# Two-class BERT classifier. The loss is configured with from_logits=True,
# i.e. the model output is treated as raw logits.
bert_model = TFBertForSequenceClassification.from_pretrained(model_type, num_labels=2)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

bert_model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
bert_model.summary()

In [188]:
# Early stopping on validation loss plus a checkpoint that keeps only the
# weights of the best epoch (lowest val_loss).
# NOTE(review): patience=5 is larger than epochs=3 below, so early stopping
# cannot trigger in this run — confirm the intended values.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
mc = tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,
                                        save_weights_only=True,
                                        monitor='val_loss', mode='min',
                                        save_best_only=True, verbose=1)
# Named `fit_callbacks` so that the `callbacks` module imported from
# tensorflow.keras at the top of the notebook is not shadowed by a list.
fit_callbacks = [es, mc]

# NOTE(review): the held-out split is used for validation/checkpoint
# selection here and for the reported metrics afterwards.
history = bert_model.fit(
    [train_inp, train_mask, train_seg], train_label,
    batch_size=4, epochs=3,
    validation_data=([test_inp, test_mask, test_seg], test_label),
    callbacks=fit_callbacks, verbose=1)

In [189]:
trained_model = bert_model
# Restore the best-epoch weights written by the ModelCheckpoint callback.
# Calling save_weights() on the same path first would overwrite that
# checkpoint with the last-epoch weights and make the reload a no-op.
trained_model.load_weights(model_save_path)

In [190]:
from sklearn import metrics
import matplotlib.pyplot as plt

In [191]:
# First element of the prediction output: per-class scores, treated as raw
# logits (the model was compiled with from_logits=True).
logits = trained_model.predict([test_inp, test_mask, test_seg], batch_size=4)[0]
# Softmax over the two logits gives real probabilities; column 1 = P(target == 1).
y_pred_proba = tf.nn.softmax(logits, axis=1).numpy()[:, 1]
# The predicted class is the larger logit. Thresholding the class-1 logit at
# 0 on its own ignored the class-0 logit and could disagree with the model.
y_pred = np.argmax(logits, axis=1)
print("accuracy : ", metrics.accuracy_score(y_test,y_pred))
print("auc      : ", metrics.roc_auc_score(y_test,y_pred_proba))

In [192]:
# ROC curve of the BERT classifier on the held-out split.
fpr, tpr, thresh = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr, label=f"BERT, AUC={round(auc, 4)}")
# Diagonal reference line of a random classifier.
plt.plot([0, 1], [0, 1], color='r', lw=2, linestyle='--', label='Random guess')
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid()
plt.show()

### RoBERTa fine-tuning on the training set and Kaggle submission

In [193]:
# Shuffle the training rows. A fixed random_state makes the order
# reproducible across runs, and drop=True stops reset_index() from adding the
# old row labels as an extra "index" column.
train = train.sample(frac=1, random_state=42).reset_index(drop=True)
train.shape

In [194]:
# Raw texts and labels of the shuffled training table, and the raw texts of
# the test table.
train_sentences = train["text"]
labels = train["target"]
test_sentences = test["text"]

In [195]:
def data_prep_fct_1(bert_tokenizer, sentences, max_length=128) :
    """Encode sentences into fixed-length id and attention-mask arrays.

    Every sentence is passed to ``bert_tokenizer.encode_plus`` with special
    tokens added, truncation enabled and padding to ``max_length``.

    Args:
        bert_tokenizer: object exposing ``encode_plus`` and returning a
            mapping with 'input_ids' and 'attention_mask'.
        sentences: iterable of texts to encode.
        max_length: target length of every encoded sequence (default 128).

    Returns:
        Tuple ``(input_ids, attention_masks)`` of NumPy arrays with one row
        per sentence.
    """
    input_ids = []
    attention_masks = []

    for sent in sentences:
        # padding='max_length' replaces the deprecated pad_to_max_length=True
        # and matches the call made in data_prep_fct.
        bert_inp = bert_tokenizer.encode_plus(sent,
                                              add_special_tokens=True,
                                              max_length=max_length,
                                              truncation=True,
                                              padding='max_length',
                                              return_attention_mask=True,
                                              return_token_type_ids=True)
        input_ids.append(bert_inp['input_ids'])
        attention_masks.append(bert_inp['attention_mask'])

    input_ids = np.asarray(input_ids)
    attention_masks = np.array(attention_masks)

    return input_ids, attention_masks

In [196]:
# Model
# NOTE(review): model_name is 'bert' although model_type points at a RoBERTa
# checkpoint — confirm whether the name should be updated.
model_name    = 'bert'
model_type = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
do_lower_case = True
max_length   = 64
# Train_test_split
test_size    = 0.2
random_state = 5

# Hyperparameters of Model
learning_rate = 1e-5
epsilon=1e-08
batch_size = 4

In [197]:
# Load the tokenizer for the configured checkpoint and encode every training
# sentence, printing how long the encoding took.
roberta_tokenizer = AutoTokenizer.from_pretrained(model_type, do_lower_case=do_lower_case)
start = time.time()
input_ids, attention_masks = data_prep_fct_1(roberta_tokenizer, train_sentences, max_length=max_length)
print("duration : ", time.time()-start)

In [198]:
# Stratified train/validation split of the encoded ids, labels and masks.
# NOTE: this rebinds train_inp, train_label and train_mask, which the BERT
# section above also assigned.
train_inp, val_inp, train_label, val_label, train_mask, val_mask = \
    train_test_split(input_ids, labels, attention_masks,
                     stratify=labels, test_size=test_size, random_state=random_state)

In [199]:
import tensorflow_addons as tfa

model_save_path = 'RoBerta_cardiff.h5'
# Two-class RoBERTa classifier initialised from the configured checkpoint.
roberta_model = TFRobertaForSequenceClassification.from_pretrained(model_type,num_labels=2)
# Keep only the weights of the epoch with the lowest validation loss.
# NOTE(review): this list reuses the name `callbacks`, which the notebook
# also imports as a module from tensorflow.keras.
callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,
                                                save_weights_only=True,
                                                monitor='val_loss',mode='min',
                                                save_best_only=True, verbose=1)]
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)

roberta_model.compile(loss=loss,optimizer=optimizer,metrics=[metric])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
roberta_model.summary()

In [200]:
# Fine-tune for one epoch, validating on the validation part of the split.
history = roberta_model.fit(
    [train_inp,train_mask], train_label,
    batch_size=batch_size, epochs=1,
    validation_data=([val_inp,val_mask],val_label),
    callbacks=callbacks, verbose=1)

In [201]:
# Encode the competition test sentences with the same tokenizer and length.
start = time.time()
test_input_ids, test_attention_masks = data_prep_fct_1(roberta_tokenizer, test_sentences, max_length=max_length)
print("Test Sentences preparation duration : ", time.time()-start)

In [202]:
# Run the fine-tuned model over the encoded competition test set.
predictions = roberta_model.predict([test_input_ids,test_attention_masks])

In [203]:
# First element of the prediction output: per-class scores, treated as raw
# logits (the model was compiled with from_logits=True).
logits = predictions[0]
# Softmax over the two logits gives real probabilities; column 1 = P(target == 1).
y_pred_proba = tf.nn.softmax(logits, axis=1).numpy()[:, 1]
# The predicted class is the larger logit. Thresholding the class-1 logit at
# 0 on its own ignored the class-0 logit and could disagree with the model.
y_pred = np.argmax(logits, axis=1)

In [204]:
# Build the submission table (test id + predicted target) and write it as CSV.
# quoting=3 is the numeric value of csv.QUOTE_NONE (fields are never quoted).
output = pd.DataFrame(data={"id":test.id, "target":y_pred})
output.to_csv("submission.csv", index=False, quoting=3)

### RoBERTa evaluation on the held-out split

In [13]:
# Run configuration for the RoBERTa hold-out experiment.
model_name = 'roberta_V2'
model_type = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
max_length = 64
roberta_V2_tokenizer = AutoTokenizer.from_pretrained(model_type)

# Checkpoint file name: <model>_<initials of the first three '-'-separated
# parts of the checkpoint id>_ML<max_length>_T<number of tweets>.h5
id_parts = model_type.split('-')
mt = (id_parts[0][0] + id_parts[1][0] + id_parts[2][0]).upper()
ml = f'ML{max_length}'
tw = f'T{len(tweets_raw)}'
model_file_name = '_'.join([model_name, mt, ml, tw])
model_save_path = f'{model_file_name}.h5'
print(model_save_path)

In [14]:
# Encode the train and test tweets of the hold-out split with the RoBERTa
# tokenizer.
train_inp, train_mask, train_token, train_seg, test_inp, test_mask, test_token, test_seg = \
                                                                train_test_prep_fct(roberta_V2_tokenizer)

In [15]:
# Two-class RoBERTa classifier. The loss is configured with from_logits=True,
# i.e. the model output is treated as raw logits.
roberta_V2_model = TFRobertaForSequenceClassification.from_pretrained(model_type,num_labels=2)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
roberta_V2_model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
roberta_V2_model.summary()

In [17]:
# Early stopping on validation loss plus a checkpoint that keeps only the
# weights of the best epoch (lowest val_loss).
# NOTE(review): patience=5 is larger than epochs=1 below, so early stopping
# cannot trigger in this run — confirm the intended values.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
mc = tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,
                                        save_weights_only=True,
                                        monitor='val_loss', mode='min',
                                        save_best_only=True, verbose=1)
# Named `fit_callbacks` so that the `callbacks` module imported from
# tensorflow.keras at the top of the notebook is not shadowed by a list.
fit_callbacks = [es, mc]

# Use y_train / y_test directly: they come from the same split as
# tweet_raw_train / tweet_raw_test, which produced train_inp / test_inp.
# `train_label` is rebound by the RoBERTa train/validation split earlier in
# the notebook, so when the cells run top to bottom it no longer lines up
# with train_inp row for row.
history = roberta_V2_model.fit(
    [train_inp, train_mask, train_seg], y_train,
    batch_size=4, epochs=1,
    validation_data=([test_inp, test_mask, test_seg], y_test),
    callbacks=fit_callbacks, verbose=1
)

In [18]:
trained_model = roberta_V2_model
# Restore the best-epoch weights written by the ModelCheckpoint callback.
# Calling save_weights() on the same path first would overwrite that
# checkpoint with the last-epoch weights and make the reload a no-op.
trained_model.load_weights(model_save_path)

In [19]:
# First element of the prediction output: per-class scores, treated as raw
# logits (the model was compiled with from_logits=True).
logits = trained_model.predict([test_inp, test_mask, test_seg], batch_size=4)[0]
# Softmax over the two logits gives real probabilities; column 1 = P(target == 1).
y_pred_proba = tf.nn.softmax(logits, axis=1).numpy()[:, 1]
# The predicted class is the larger logit. Thresholding the class-1 logit at
# 0 on its own ignored the class-0 logit and could disagree with the model.
y_pred = np.argmax(logits, axis=1)
print("accuracy : ", metrics.accuracy_score(y_test,y_pred))
print("auc      : ", metrics.roc_auc_score(y_test,y_pred_proba))

In [20]:
# ROC curve of the RoBERTa classifier on the held-out split.
fpr, tpr, thresh = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr, label=f"RoBERTa, AUC={round(auc, 4)}")
# Diagonal reference line of a random classifier.
plt.plot([0, 1], [0, 1], color='r', lw=2, linestyle='--', label='Random guess')
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid()
plt.show()

In [23]:
from sklearn.metrics import classification_report
# Always report both classes. Restricting `labels` to the values present in
# y_pred silently dropped a class whenever the model never predicted it;
# zero_division=0 keeps the report well-defined in that case instead.
print(classification_report(y_test, y_pred, labels=[0, 1],
                            target_names=['Non-Disaster', 'Disaster'],
                            zero_division=0))