# **Importing the necessary libraries**

In [None]:
# Install Hugging Face Transformers, which provides the RobertaTokenizer and TFRobertaModel imported below.
!pip install transformers

In [None]:
# NOTE(review): nothing in the visible notebook imports googleapiclient — confirm this install is still needed.
!pip install google-api-python-client

In [None]:
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from PIL import Image
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from transformers import RobertaTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from transformers import TFRobertaModel
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.metrics import confusion_matrix,f1_score,classification_report
from sklearn.metrics import precision_recall_curve
from imblearn.over_sampling import SMOTE

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Fetch the WordNet corpus; WordNetLemmatizer is used in the cleaning step.
import nltk  # duplicate of the import in the imports cell
nltk.download('wordnet')
# Unpack the WordNet archive inside the system-wide NLTK data directory.
# NOTE(review): the /usr/share/nltk_data path looks environment-specific — confirm before running elsewhere.
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

# Loading the datasets

In [None]:
# Training data; later cells read its 'id', 'label' and 'tweet' columns.
df_train = pd.read_csv("/kaggle/input/twitter-sentiment-analysis-hatred-speech/train.csv")

In [None]:
# Separate test file; later cells read only its 'tweet' column.
df_test = pd.read_csv("/kaggle/input/twitter-sentiment-analysis-hatred-speech/test.csv")

In [None]:
# Preview the raw training frame (before any cleaning).
df_train

In [None]:
# Preview the raw test frame.
df_test

# **Checking for any NULL values in the dataset**

In [None]:
# info() lists the non-null count per column; compare it with the row count to spot missing values.
df_train.info()

In [None]:
# Same missing-value check for the test frame.
df_test.info()

# **Percentage of positive and negative tweets in the training dataset**

In [None]:
# Share of training rows carrying label 0 (pos) and label 1 (neg), as percentages.
pos, neg = (
    100 * len(df_train[df_train['label'] == label_value]) / len(df_train)
    for label_value in (0, 1)
)

In [None]:
# One percentage per line: label 0 first, then label 1.
print(pos, neg, sep="\n")

# **Data Visualisation**

In [None]:
# Character length of every tweet, computed the same way for both frames.
train_len, test_len = (
    frame['tweet'].str.len() for frame in (df_train, df_test)
)

In [None]:
# Overlay the tweet-length distributions of both frames. Partial transparency
# keeps the first histogram visible where the second one is drawn over it.
plt.hist(train_len, bins=20, alpha=0.6, label='train_tweets')
plt.hist(test_len, bins=20, alpha=0.6, label='test_tweets')
plt.xlabel('Tweet length (characters)')
plt.ylabel('Number of tweets')
plt.legend()
plt.show()

In [None]:
# Number of training tweets per label. The x-axis shows the label value and
# the bar height is the tweet count, so the axes are labeled accordingly.
sns.countplot(data=df_train, x='label', hue='label')
plt.title('Types of comments : 0 - > Non Racist/Sexist , 1 - > Racist/Sexist')
plt.xlabel('Label')
plt.ylabel('Number of tweets')
plt.show()

In [None]:
# Tweet-length histograms via the pandas plotting accessor; both draw on the same axes.
length_train = df_train['tweet'].str.len().plot(kind='hist', color='blue', figsize=(6, 4))
length_test = df_test['tweet'].str.len().plot(kind='hist', color='pink', figsize=(6, 4))

In [None]:
# Label counts again, this time with the PRGn palette; the title is set on the returned axes.
sns.countplot(data=df_train, x='label', palette="PRGn").set_title('Label Counts')
plt.show()

In [None]:
# NLTK's English stop-word list; used by the word clouds, stop_words_remove and tokenize_and_clean.
stop_words = stopwords.words('english')

In [None]:
# Load the image as an array; it is passed as `mask` to both WordCloud calls below.
wordcloud_mask=np.array(Image.open("/kaggle/input/wodcloud-twiter-pic/twitter.png"))

# Racist-Sexist-Data Wordcloud

In [None]:
# Word cloud built from all training tweets with label 1.
racist_sexist_data = df_train[df_train.label == 1]
racist_tweet_text = " ".join(racist_sexist_data["tweet"].tolist())
wordcloud = WordCloud(stopwords=stop_words,width=1600,height=800,max_words=100,mask=wordcloud_mask,colormap='Paired').generate(racist_tweet_text)

# Render the cloud once, with no axes and no padding around the image.
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

# Positive Data Wordcloud

In [None]:
# Word cloud built from all training tweets with label 0.
positive_data = df_train[df_train.label == 0]
positive_tweet_text = " ".join(positive_data["tweet"].tolist())
wordcloud = WordCloud(stopwords=stop_words,width=1600,height=800,max_words=100,mask=wordcloud_mask,colormap='Dark2').generate(positive_tweet_text)

# Render the cloud once, with no axes and no padding around the image.
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

# **Data Pre-processing and Cleaning**

In [None]:
# Shared WordNet lemmatizer instance; lemmatize() below calls Word.lemmatize on each token.
Word = WordNetLemmatizer()

In [None]:
def clean(raw):
    """Remove hyperlinks and a fixed set of HTML markup/entities from ``raw``.

    Anchor elements become the text ``Link.``; ``&#x27;`` becomes an
    apostrophe; ``&#x2F;``, ``<p>`` and ``<i>`` become a space; ``&gt;``,
    ``&#62;``, ``</i>`` and newlines are deleted. ``&quot;`` is left as is.

    Args:
        raw: The input string.

    Returns:
        The string after all substitutions.
    """
    # (pattern, replacement) pairs, applied one after another in this order.
    substitutions = (
        ("<[a][^>]*>(.+?)</[a]>", 'Link.'),  # whole anchor element
        ('&gt;', ""),                        # greater-than entity
        ('&#x27;', "'"),                     # apostrophe entity
        ('&#x2F;', ' '),                     # slash entity
        ('<p>', ' '),                        # paragraph tag
        ('<i>', ' '),                        # opening italics tag
        ('</i>', ''),                        # closing italics tag
        ('&#62;', ''),                       # numeric greater-than entity
        ("\n", ''),                          # newline
    )
    result = raw
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result

In [None]:
def deEmojify(x):
    """Delete every character of ``x`` that falls in one of four emoji ranges.

    Args:
        x: The input string.

    Returns:
        ``x`` with all matching characters removed; other text is untouched.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    )
    # One character class covering all ranges; "+" removes a run in one match.
    emoji_run = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)
    return emoji_run.sub(r'', x)

In [None]:
def remove_punct(text):
    """Strip punctuation and digits, then collapse other non-letter runs to a space.

    Steps, in order:
      1. delete every character listed in ``string.punctuation``;
      2. delete every run of ASCII digits;
      3. replace each run of characters outside ``a-zA-Z0-9?!.,`` with a
         single space (``?!.,`` were already deleted in step 1).

    Args:
        text: The input string.

    Returns:
        The filtered string.
    """
    without_punct = text.translate(str.maketrans('', '', string.punctuation))
    without_digits = re.sub('[0-9]+', '', without_punct)
    return re.sub(r"[^a-zA-Z0-9?!.,]+", ' ', without_digits)

In [None]:
def lower_case(df_train):
    """Lower-case every tweet of the given frame in place.

    Each tweet is split on whitespace, every token is lower-cased, and the
    tokens are re-joined with single spaces, so whitespace runs also collapse.

    Args:
        df_train: DataFrame with a string column ``'tweet'``; it is modified
            in place.
    """
    def _lower_tokens(tweet):
        return " ".join(token.lower() for token in tweet.split())

    df_train['tweet'] = df_train['tweet'].apply(_lower_tokens)

In [None]:
def stop_words_remove(df):
    """Drop every token found in ``stop_words`` from each tweet of ``df``, in place.

    The function works on the frame it is given instead of a module-level
    frame, so it can be applied to any DataFrame with a ``'tweet'`` column.

    Args:
        df: DataFrame with a string column ``'tweet'``; it is modified in place.
    """
    # Build the set once: membership tests are then constant-time per token.
    stop_set = set(stop_words)
    df['tweet'] = df['tweet'].apply(
        lambda tweet: " ".join(token for token in tweet.split() if token not in stop_set)
    )

In [None]:
def remove_word_user(df):
    """Remove the standalone token ``user`` from each tweet of ``df``, in place.

    Only whole whitespace-separated tokens equal to ``'user'`` are dropped;
    longer words that merely contain it are kept. The function works on the
    frame it is given instead of a module-level frame.

    Args:
        df: DataFrame with a string column ``'tweet'``; it is modified in place.
    """
    df['tweet'] = df['tweet'].apply(
        lambda tweet: " ".join(token for token in tweet.split() if token != 'user')
    )

In [None]:
def lemmatize(df):
    """Lemmatize every token of each tweet of ``df`` with ``Word``, in place.

    The function works on the frame it is given instead of a module-level
    frame, so it can be applied to any DataFrame with a ``'tweet'`` column.

    Args:
        df: DataFrame with a string column ``'tweet'``; it is modified in place.
    """
    df['tweet'] = df['tweet'].apply(
        lambda tweet: " ".join(Word.lemmatize(token) for token in tweet.split())
    )

In [None]:
def cleaning(df_train):
    """Run the full text-cleaning sequence on the ``'tweet'`` column.

    Order: markup removal (``clean``), emoji removal (``deEmojify``),
    punctuation/digit removal (``remove_punct``), then the frame-level helpers
    ``lower_case``, ``stop_words_remove``, ``remove_word_user`` and
    ``lemmatize``, each called with the same frame.

    Args:
        df_train: DataFrame with a string column ``'tweet'``; the column is
            overwritten in place.
    """
    # Per-tweet string transforms.
    for transform in (clean, deEmojify, remove_punct):
        df_train['tweet'] = df_train['tweet'].apply(transform)

    # Frame-level helpers. Rare-word removal and spell correction are disabled here.
    lower_case(df_train)
    stop_words_remove(df_train)
    remove_word_user(df_train)
    lemmatize(df_train)

In [None]:
# Clean the training tweets; the 'tweet' column of df_train is overwritten.
# NOTE(review): no matching call for df_test is visible, yet df_test['tweet'] is vectorized later — confirm that is intended.
cleaning(df_train)

In [None]:
# Preview the training frame after cleaning.
df_train

In [None]:
# drop() returns a new frame by default and the result is not assigned here, so this
# cell only displays the data without 'id'; df_train itself keeps the column.
df_train.drop("id",axis=1)

In [None]:
max_len=128  # sequence length handed to the tokenizer and used as the model's input shape
text_data=df_train["tweet"]  # cleaned tweet text
label_data=df_train["label"]  # labels, row-aligned with text_data

In [None]:
# Sanity check: both counts should match, since both columns come from df_train.
print("Length of Text Data :",len(text_data))
print("Length of Label Data :",len(label_data))

# RoBERTa tokenizer

In [None]:
# Pretrained "roberta-base" tokenizer; roberta_encode() reads this module-level name.
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# RoBERTa model

In [None]:
# NOTE(review): Create_Roberta_Model() loads its own "roberta-base" instance into a local of the
# same name, and this module-level instance is not referenced elsewhere in the visible notebook —
# confirm it is needed.
roberta_model = TFRobertaModel.from_pretrained("roberta-base")

In [None]:
final_text_data=df_train["tweet"]  # same column as text_data above
final_label_data=np.array(label_data)  # labels as a NumPy array for train_test_split / Keras

# Encoding the input text into IDs and attention masks

In [None]:
def roberta_encode(final_text_data,max_len):
    """Tokenize each text into fixed-length token ids and attention masks.

    Every text is encoded with the module-level ``roberta_tokenizer``, with
    special tokens added, and is padded or truncated to exactly ``max_len``
    tokens so that all rows stack into rectangular arrays.

    Args:
        final_text_data: Iterable of strings (e.g. a pandas Series or a list).
        max_len: Number of tokens per encoded row.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of NumPy arrays, one row per
        input text.
    """
    input_ids=[]
    attention_masks=[]

    # Iterate over the values rather than indexing by position: on a Series
    # whose index is not 0..n-1 (e.g. after filtering rows), series[i] is a
    # label lookup and would raise KeyError.
    for text in final_text_data:
        encode_data=roberta_tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding="max_length",  # replaces the deprecated pad_to_max_length=True
            truncation=True,       # explicit, so longer texts are cut to max_len
            return_attention_mask=True,
        )
        input_ids.append(encode_data['input_ids'])
        attention_masks.append(encode_data["attention_mask"])
    return np.array(input_ids),np.array(attention_masks)

In [None]:
# Encode all cleaned training tweets into token-id and attention-mask arrays.
text_input_ids,text_attention_masks = roberta_encode(final_text_data,max_len)

In [None]:
# Sanity check: the three arrays should have the same number of rows.
print('Text Input Ids Shape {} \nText Input Attention Mask Shape {} \nLabel Data shape {}'.format(text_input_ids.shape,text_attention_masks.shape,final_label_data.shape))

In [None]:
# 80/20 shuffled hold-out split; ids, labels and masks are passed together so their rows stay aligned.
# train_test_split returns a (train, test) pair per input array, in input order: ids, labels, masks.
# NOTE(review): the split is not stratified by label — confirm this is acceptable for the class balance.
X_train_input,X_test_input,Y_train_label,Y_test_label,train_mask,test_mask=train_test_split(text_input_ids,final_label_data,text_attention_masks,test_size=0.2,random_state=42,shuffle=True)

In [None]:
# Sanity check of the six arrays produced by the split.
print('Train input shape {}\nTest input shape {}\nTrain label shape {}\nTest label shape {}\nTrain attention mask shape {}\nTest attention mask shape {}'.format(X_train_input.shape,X_test_input.shape,Y_train_label.shape,Y_test_label.shape,train_mask.shape,test_mask.shape))

# Creating the RoBERTa model

In [None]:
def Create_Roberta_Model():
    """Build the (uncompiled) RoBERTa-based two-class classifier.

    Two int32 inputs of length ``max_len`` (token ids and attention mask) feed
    a freshly loaded ``roberta-base`` encoder. The vector at sequence position
    0 of the encoder's first output goes through
    Dense(128, relu) -> Dropout(0.5) -> Dense(2, softmax).

    Returns:
        A ``tf.keras`` Model taking ``[input_ids, attention_masks]`` and
        producing two softmax outputs per row.
    """
    sequence_shape = (max_len,)
    token_ids = tf.keras.Input(shape=sequence_shape, dtype="int32")
    token_mask = tf.keras.Input(shape=sequence_shape, dtype="int32")

    encoder = TFRobertaModel.from_pretrained("roberta-base")
    encoder_outputs = encoder(input_ids=token_ids, attention_mask=token_mask)
    # [0] selects the encoder's first output; [:, 0, :] keeps only position 0 of each sequence.
    first_position_vector = encoder_outputs[0][:, 0, :]

    hidden = tf.keras.layers.Dense(128, activation="relu")(first_position_vector)
    hidden = tf.keras.layers.Dropout(0.5)(hidden)
    class_probabilities = tf.keras.layers.Dense(2, activation='softmax')(hidden)

    return tf.keras.models.Model(inputs=[token_ids, token_mask], outputs=class_probabilities)

In [None]:
model = Create_Roberta_Model()

# The final layer already applies softmax, so the loss receives probabilities
# (from_logits=False); labels are integer class ids, hence the sparse variant.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras optimizers.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

model.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])

In [None]:
# Train on the training split; the held-out split is passed as validation data,
# so `history` records both training and validation metrics for every epoch.
history=model.fit([X_train_input,train_mask],Y_train_label,batch_size=42,epochs=10,validation_data=([X_test_input,test_mask],Y_test_label))

# Evaluation metrics

In [None]:
def model_loss_and_accuracy(history):
    """Plot per-epoch accuracy and loss for training and validation.

    The left panel shows ``accuracy`` / ``val_accuracy`` and the right panel
    shows ``loss`` / ``val_loss``, all read from ``history.history``.

    Args:
        history: Object whose ``history`` attribute is a dict holding the
            keys ``"accuracy"``, ``"val_accuracy"``, ``"loss"`` and
            ``"val_loss"``, each a per-epoch sequence (as returned by
            ``model.fit`` when validation data is supplied).
    """
    plt.figure(figsize=(15,15))

    # Left panel: accuracy.
    plt.subplot(221)
    plt.plot(history.history["accuracy"],marker="o",linestyle=":",markersize=10,color="m",label="accuracy")
    plt.plot(history.history["val_accuracy"],marker="D",linestyle=":",markersize=10,color="b",label="val_accuracy")
    plt.title("Model Accuracy\n",fontsize=20,color="darkorange")
    plt.xlabel("Number of Epochs",color="midnightblue",fontsize=15)
    plt.ylabel("Accuracy",color="midnightblue",fontsize=15)
    plt.grid(color = 'green', linestyle = '--', linewidth = 1)
    plt.legend(loc="best")
    plt.tight_layout()

    # Right panel: loss. This panel must read the loss series, not accuracy.
    plt.subplot(222)
    plt.plot(history.history["loss"],marker="v",linestyle="-.",markersize=10,color="g",label="loss")
    plt.plot(history.history["val_loss"],marker="s",linestyle="--",markersize=10,color="r",label="val_loss")
    plt.title("Model Loss\n",fontsize=20,color="limegreen")
    plt.xlabel("Number of Epochs",color="midnightblue",fontsize=15)
    plt.ylabel("Loss",color="midnightblue",fontsize=15)
    plt.grid(color = 'green', linestyle = '--', linewidth = 1)
    plt.legend(loc="best")
    plt.tight_layout()
    plt.show()

In [None]:
# Plot the training curves recorded by model.fit.
model_loss_and_accuracy(history)

# Label Names

In [None]:
label_name=["Positive","Racist and Sexist"]  # NOTE(review): not referenced again in the visible notebook
pred=model.predict([X_test_input, test_mask])  # one row of two softmax outputs per held-out tweet
prediction=np.argmax(pred, axis=1)  # index of the larger output is the predicted label

# Accuracy Score

In [None]:
# Accuracy of the arg-max predictions on the held-out split.
print("Accuracy Score is",accuracy_score(Y_test_label,prediction))

# ROC-AUC score

In [None]:
# ROC AUC is computed from the score for label 1 (column 1 of pred), not from the hard predictions.
print("ROC AUC Score is {}".format(roc_auc_score(Y_test_label, pred[:,1])))

# ROC curve

In [None]:
# ROC curve on the held-out split, using the model's score for label 1.
pred_positive = pred[:,1]
fpr, tpr, thresholds = roc_curve(Y_test_label, pred_positive)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10,10))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc,color="g",linestyle="--",marker="o",markersize=3,markerfacecolor="k")

# Diagonal reference line: the curve of random predictions.
plt.plot([0, 1], [0, 1],linestyle="--",linewidth=3,color="m")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate or (1 - Specificity)')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# **Other Classifier models**

# Splitting the original data

In [None]:
X = df_train.drop(columns=['label'])  # all columns except the target; only 'tweet' is vectorized below
y = df_train['label']
test = df_test  # alias of df_test, not a copy
print(X.shape, test.shape, y.shape)
# 80/20 hold-out split for the classical models.
# NOTE(review): this uses a different seed (8) than the RoBERTa split (42) and is not stratified — confirm both are intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Tokenization and Vectorization

In [None]:
def tokenize_and_clean(text):
    """Tokenize ``text`` and return the Porter stems of the tokens worth keeping.

    A token is kept when it starts with a word character and is not in
    ``stop_words``; each kept token is stemmed.

    Args:
        text: The input string.

    Returns:
        List of stemmed tokens, in their original order.
    """
    stemmer = PorterStemmer()
    stems = []
    for token in word_tokenize(text):
        # re.match anchors at the start, so this drops tokens that begin with a non-word character.
        if re.match(r'\w{1,}', token) is None:
            continue
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return stems

In [None]:
# TF-IDF Vectorization
# Hold-out experiment: the TF-IDF vocabulary is fitted on the training split only,
# and the validation split is transformed with that same vocabulary.
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_clean)
X_train_tweets_tfidf = tfidf_vectorizer.fit_transform(X_train['tweet'])
X_test_tweets_tfidf = tfidf_vectorizer.transform(X_test['tweet'])
print(X_train_tweets_tfidf.shape, X_test_tweets_tfidf.shape)

# TF-IDF fitted on the full training data and applied to the separate test file.
# This rebinds `tfidf_vectorizer`: from here on the name refers to the full-data vectorizer.
# NOTE(review): test['tweet'] is df_test's text, which never went through cleaning() — confirm.
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_clean)
X_tweets_tfidf = tfidf_vectorizer.fit_transform(X['tweet'])
test_tweets_tfidf = tfidf_vectorizer.transform(test['tweet'])
print(X_tweets_tfidf.shape, test_tweets_tfidf.shape)

# SMOTE

In [None]:
# Class balance of the training split before resampling.
# value_counts() orders by frequency; sort_index() puts the counts in label
# order (0, then 1) so they always line up with the hard-coded wedge labels.
plt.pie(y_train.value_counts().sort_index(),
        labels=['Label 0 (Positive Tweets)', 'Label 1 (Negative Tweets)'],
        autopct='%0.1f%%')
plt.axis('equal')
plt.show()

In [None]:
# Oversample the minority class of the training split with SMOTE.
# A fixed random_state makes the synthetic samples identical on every run.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_tweets_tfidf, y_train.values)
print(X_train_smote.shape, y_train_smote.shape)

# SMOTE on full training data
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_tweets_tfidf, y.values)
print(X_smote.shape, y_smote.shape)

# Class balance after resampling. Series.value_counts() replaces the deprecated
# top-level pd.value_counts(); sort_index() fixes the order to label 0, label 1,
# which matters here because tied counts have no meaningful frequency order.
plt.pie(pd.Series(y_train_smote).value_counts().sort_index(),
        labels=['Label 0 (Positive Tweets)', 'Label 1 (Negative Tweets)'],
        autopct='%0.1f%%')
plt.axis('equal')
plt.show()

# Training various Classifiers

In [None]:
def _print_scores(split_name, y_act, y_pred):
    """Print accuracy and F1 (both rounded to 3 decimals) prefixed with ``split_name``."""
    acc = round(accuracy_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'{split_name} Scores: Accuracy={acc}, F1-Score={f1}')


def training_scores(y_act, y_pred):
    """Print accuracy and F1 for predictions on the training data.

    Args:
        y_act: True labels.
        y_pred: Predicted labels, aligned with ``y_act``.
    """
    _print_scores('Training', y_act, y_pred)


def validation_scores(y_act, y_pred):
    """Print accuracy and F1 for predictions on the validation data.

    Args:
        y_act: True labels.
        y_pred: Predicted labels, aligned with ``y_act``.
    """
    _print_scores('Validation', y_act, y_pred)

**Logistic Regression**

In [None]:
# Logistic Regression
# Logistic Regression: fit on the SMOTE-balanced training matrix, then score
# the balanced training data and the untouched validation split.
lr = LogisticRegression().fit(X_train_smote, y_train_smote)
y_train_pred, y_test_pred = (
    lr.predict(features) for features in (X_train_smote, X_test_tweets_tfidf)
)
training_scores(y_train_smote, y_train_pred)
validation_scores(y_test, y_test_pred)

**Naive Bayes Classifier**

In [None]:
# Naive Bayes Classifier
# Multinomial Naive Bayes on the SMOTE-balanced TF-IDF features.
mnb = MultinomialNB().fit(X_train_smote, y_train_smote)
y_train_pred, y_test_pred = map(mnb.predict, (X_train_smote, X_test_tweets_tfidf))
training_scores(y_train_smote, y_train_pred)
validation_scores(y_test, y_test_pred)

**Random Forest Classifier**

In [None]:
# Random Forest Classifier
# Random Forest Classifier
# A fixed random_state makes tree construction, and therefore the reported
# scores, the same on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_smote, y_train_smote)
y_train_pred = rf.predict(X_train_smote)
y_test_pred = rf.predict(X_test_tweets_tfidf)
training_scores(y_train_smote, y_train_pred)
validation_scores(y_test, y_test_pred)

**SVM**

In [None]:
# Linear-kernel support vector classifier on the SMOTE-balanced TF-IDF features.
svm = SVC(kernel='linear').fit(X_train_smote, y_train_smote)
y_train_pred, y_test_pred = (
    svm.predict(features) for features in (X_train_smote, X_test_tweets_tfidf)
)
training_scores(y_train_smote, y_train_pred)
validation_scores(y_test, y_test_pred)