<a href="https://colab.research.google.com/github/Jenisa-Merlin/HateHurter-ExposingToxicTexts/blob/main/BIOTAG_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
from sklearn.metrics import classification_report
import nltk
!pip install sklearn_crfsuite
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.model_selection import train_test_split
!pip uninstall sklearn-crfsuite
!pip install sklearn-crfsuite
!pip install -U scikit-learn
from sklearn_crfsuite import CRF
import sklearn_crfsuite.metrics as metrics
import joblib
!pip install Flask
from flask import Flask, request, jsonify, render_template

In [None]:
# Ask the user to upload the dataset into the Colab working directory.
uploaded = files.upload()

# Read the uploaded CSV, then remove duplicate rows and rows with missing
# values in place.
data = pd.read_csv("data.csv", encoding="latin-1")
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)

# Normalise every sentence to lower case.
data['sentence'] = data['sentence'].map(lambda text: text.lower())

In [None]:
def prepare_data(data):
    """Split each row's sentence and BIO string into whitespace tokens.

    Args:
        data: DataFrame with a "sentence" column and a "bio" column, both
            holding whitespace-separated strings.

    Returns:
        A ``(sentences, labels)`` tuple of lists with one entry per row of
        ``data``: ``sentences[k]`` is the token list of row ``k`` and
        ``labels[k]`` is the tag list of the same row.
    """
    # Iterating over the DataFrame itself yields column names, not rows, so
    # split the two columns directly and convert them to per-row lists.
    sentences = data["sentence"].str.split().tolist()
    labels = data["bio"].str.split().tolist()
    return sentences, labels
# Build the token and tag collections from the loaded DataFrame.
# NOTE(review): `sentences` and `labels` are assigned again from `data` in a
# later cell before anything reads them.
sentences, labels = prepare_data(data)

In [None]:
def word2features(sent, i):
    """Build the feature dictionary describing the token at position ``i``.

    Args:
        sent: Sequence of string tokens forming one sentence.
        i: Index of the token to describe.

    Returns:
        dict of features for the token itself (bias, lower-cased form,
        last three characters, upper/title/digit flags) plus the
        lower-cased form and title/upper flags of each existing neighbour.
        ``'BOS'`` is set when there is no previous token and ``'EOS'`` when
        there is no next token.
    """
    token = sent[i]
    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()

    # Left neighbour, or a beginning-of-sentence marker when there is none.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word.lower()'] = left.lower()
        feats['-1:word.istitle()'] = left.istitle()
        feats['-1:word.isupper()'] = left.isupper()

    # Right neighbour, or an end-of-sentence marker when there is none.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word.lower()'] = right.lower()
        feats['+1:word.istitle()'] = right.istitle()
        feats['+1:word.isupper()'] = right.isupper()

    return feats
def sent2features(sent):
    """Return one feature dictionary per token of ``sent``, in token order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

In [None]:
# Whitespace-tokenise the sentence column and the BIO tag column.
sentences, labels = (data[column].str.split() for column in ('sentence', 'bio'))

# X holds one list of feature dictionaries per sentence; y holds the tag lists.
X = list(map(sent2features, sentences))
y = labels

In [None]:
# Report every row whose token count differs from its tag count
# (rows are numbered from 1).
for i, pair in enumerate(zip(sentences, labels), start=1):
    sentence, label = pair
    if len(sentence) == len(label):
        continue
    print(
        f"Mismatch found in sentence {i}:",
        f"Sentence tokens: {sentence}",
        f"BIO labels: {label}",
        sep="\n",
    )

In [None]:
# Hold out 20% of the sentences for validation; the fixed random_state makes
# the split repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# CRF sequence tagger trained with L-BFGS for at most 100 iterations.
# c1 and c2 are the L1 and L2 regularisation coefficients.
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
try:
    crf.fit(X_train, y_train)
except AttributeError as fit_error:
    # NOTE(review): presumably this guards against the sklearn-crfsuite /
    # newer scikit-learn incompatibility in which fit() raises AttributeError
    # even though training has completed -- confirm. The error is reported
    # rather than hidden so a genuinely failed fit is not mistaken for success.
    print(f"Warning: crf.fit raised AttributeError; continuing: {fit_error}")

In [None]:
# Tag every validation sentence with the fitted CRF and show the raw output.
predictions = crf.predict(X_val)
print(predictions)

In [None]:
# Read a sentence from the user and print the BIO tag predicted for each token.
new_sentence = input("Enter the sentence to predict BIO tags : ")
# The training sentences were lower-cased when the data was loaded, so the
# input is normalised the same way; otherwise case-dependent features
# (suffix, isupper, istitle) would not match what the model was trained on.
new_sentence_tokens = new_sentence.lower().split()
new_sentence_features = sent2features(new_sentence_tokens)
predicted_tags = crf.predict([new_sentence_features])[0]
print("Predicted BIO Tags:", predicted_tags)

In [None]:
# Token-level scores on the validation split; F1, precision and recall are
# averaged across tags with 'weighted' averaging.
averaging = {'average': 'weighted'}
accuracy = metrics.flat_accuracy_score(y_val, predictions)
recall = metrics.flat_recall_score(y_val, predictions, **averaging)
precision = metrics.flat_precision_score(y_val, predictions, **averaging)
f1_score = metrics.flat_f1_score(y_val, predictions, **averaging)

# Print the four scores, one per line, rounded to two decimals.
print(
    f"F1 score: {f1_score:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"Accuracy: {accuracy:.2f}",
    sep="\n",
)

In [None]:
# y_val holds one list of tags per sentence. Lists are unhashable and
# per-class support is a per-tag count, so flatten the tag lists before
# counting how often each tag occurs.
class_counts = pd.Series([tag for tags in y_val for tag in tags]).value_counts()
print("Support for each class:")
print(class_counts)

In [None]:
# Persist the CRF under the mounted Drive path; the Flask cell loads the
# model back from this same file.
joblib.dump(crf, '/content/drive/MyDrive/MLPROJECT/crf_model.pkl')

In [None]:
# Flask resolves template names relative to its template folder, so point the
# app at the Drive directory holding the HTML files instead of passing an
# absolute path to render_template (which fails the template lookup).
app = Flask(
    __name__,
    template_folder='/content/drive/MyDrive/MLPROJECT/templates',
)

# NOTE: joblib.load unpickles the file; only load model files you trust.
crf_model = joblib.load("/content/drive/MyDrive/MLPROJECT/crf_model.pkl")


@app.route('/')
def front():
    """Serve the front-end page rendered from ``front.html``."""
    return render_template('front.html')
@app.route('/api/detect-hate-span', methods=['POST'])
def detect_hate_span():
    """Return the predicted tags for the sentence in the POSTed JSON body.

    Expects a JSON object with an optional string field ``sentence``
    (default ``''``).

    Returns:
        JSON ``{"sentence": ..., "hateSpeechWords": ...}`` where
        ``hateSpeechWords`` is the value returned by ``predict_hate_span``.
        Responds with HTTP 400 and an ``error`` message when the body is
        not a JSON object or ``sentence`` is not a string.
    """
    payload = request.get_json()
    # Valid JSON that is not an object (null, list, string, ...) has no
    # .get(); reject it explicitly instead of failing with a server error.
    if not isinstance(payload, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    sentence = payload.get('sentence', '')
    if not isinstance(sentence, str):
        return jsonify({"error": "'sentence' must be a string."}), 400

    predicted_tags = predict_hate_span(sentence)
    result = {
        "sentence": sentence,
        "hateSpeechWords": predicted_tags
    }
    return jsonify(result)
def word2features(sent, i):
    """Build the feature dictionary describing the token at position ``i``.

    Args:
        sent: Sequence of string tokens forming one sentence.
        i: Index of the token to describe.

    Returns:
        dict of features for the token itself (bias, lower-cased form,
        last three characters, upper/title/digit flags) plus the
        lower-cased form and title/upper flags of each existing neighbour.
        ``'BOS'`` is set when there is no previous token and ``'EOS'`` when
        there is no next token.
    """
    token = sent[i]
    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()

    # Left neighbour, or a beginning-of-sentence marker when there is none.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word.lower()'] = left.lower()
        feats['-1:word.istitle()'] = left.istitle()
        feats['-1:word.isupper()'] = left.isupper()

    # Right neighbour, or an end-of-sentence marker when there is none.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word.lower()'] = right.lower()
        feats['+1:word.istitle()'] = right.istitle()
        feats['+1:word.isupper()'] = right.isupper()

    return feats
def sent2features(sent):
    """Return one feature dictionary per token of ``sent``, in token order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows
def predict_hate_span(sentence):
    """Predict one tag per whitespace-separated token of ``sentence``.

    Args:
        sentence: Raw input text.

    Returns:
        list of predicted tags, one per token; an empty list when the
        sentence contains no tokens.
    """
    # The training sentences were lower-cased when the data was loaded, so
    # apply the same normalisation here to keep the features comparable.
    sentence_tokens = sentence.lower().split()
    if not sentence_tokens:
        # Nothing to tag; skip the model call for empty or blank input.
        return []
    sentence_features = [sent2features(sentence_tokens)]
    predicted_tags = crf_model.predict(sentence_features)[0]
    # Return a plain list so the result can always be serialised to JSON.
    return list(predicted_tags)
# Start Flask's built-in server on port 5500 when this code runs as the main
# module.
if __name__ == '__main__':
    app.run(port=5500)