In [None]:
import pandas as pd
import numpy as np
import re
import string
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import RegexpTokenizer
from nltk import PorterStemmer, WordNetLemmatizer
import pickle

In [None]:
# Importing dataset: read 'Cyberbullying.csv' (path relative to the notebook's
# working directory) into the DataFrame `data`, which the following cells
# read and add columns to.
data = pd.read_csv('Cyberbullying.csv')

In [None]:
# Fit a LabelEncoder on the 'text' column and store the resulting integer codes
# in 'cyberbullying_type_encoded', which is used as the training target below.
# NOTE(review): the encoder is fitted on data['text'], whereas the tweets that
# are preprocessed and vectorised later come from data['tweet_text'] — confirm
# that 'text' really is the label column of Cyberbullying.csv.
labelencoder = LabelEncoder()
data['cyberbullying_type_encoded'] = labelencoder.fit_transform(data['text'])
# Count how often each ('text', encoded value) pair occurs; shown as the cell output.
data[['text', 'cyberbullying_type_encoded']].value_counts()

In [None]:
# preprocessing functions

def text_lower(text):
    """Return the Series ``text`` with every string converted to lower case.

    Uses the pandas ``.str`` accessor, so ``text`` has to be a pandas object
    holding strings rather than a plain ``str``.
    """
    lowered = text.str.lower()
    return lowered

# Stop words that are stripped from the tweet text. The set is built once at
# definition time instead of on every call, because clean_stopwords() is
# applied to every row of the dataset. Contractions are stored without their
# apostrophes ("youre", "shouldve", ...).
_STOPWORDS = frozenset([
    'a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
    'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before',
    'being', 'below', 'between', 'both', 'by', 'can', 'd', 'did', 'do',
    'does', 'doing', 'down', 'during', 'each', 'few', 'for', 'from',
    'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
    'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
    'into', 'is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
    'me', 'more', 'most', 'my', 'myself', 'now', 'o', 'of', 'on', 'once',
    'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'own', 're',
    's', 'same', 'she', "shes", 'should', "shouldve", 'so', 'some', 'such',
    't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
    'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
    'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was',
    'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom',
    'why', 'will', 'with', 'won', 'y', 'you', "youd", "youll", "youre",
    "youve", 'your', 'yours', 'yourself', 'yourselves',
])


def clean_stopwords(text):
    """Remove stop words from a single piece of text.

    The argument is coerced with ``str()`` and split on whitespace; every
    word that is not in the stop-word set is kept.  Matching is exact and
    case-sensitive, so the text should already be lower-cased.

    Parameters
    ----------
    text : object
        Usually a ``str``; any other value is converted with ``str()``.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    return " ".join(word for word in str(text).split() if word not in _STOPWORDS)

def clean_puctuations(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    The characters are removed outright (not replaced by spaces), so words
    separated only by punctuation end up joined together.
    """
    return text.translate(str.maketrans('', '', string.punctuation))
def clean_repeating_characters(text):
    """Collapse each run of an immediately repeated character to one occurrence.

    The pattern captures a single character and then requires one or more
    back-references to it, so "soooo" becomes "so".  The previous pattern had
    lost its backslashes and instead matched "any character followed by the
    digit 1", which corrupted tokens such as "some1".

    Note that legitimate double letters are collapsed as well
    ("cool" -> "col"), since every run is reduced to a single character.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
    """
    return re.sub(r'(.)\1+', r'\1', text)

def clean_URLs(text):
    """Remove URLs from ``text``.

    Two forms are removed: tokens starting with a literal "www." and tokens
    starting with "http"; in both cases the match runs up to the next
    whitespace character.  The earlier pattern used an unescaped dot and a
    "not the letter s" character class, so a "www" match swallowed the
    following words up to the next letter "s".

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` with each URL replaced by the empty string (surrounding
        whitespace is left as is).
    """
    return re.sub(r"((www\.\S+)|(http\S+))", "", text)

def clean_numeric(text):
    """Strip every run of the digits 0-9 out of ``text`` and return the rest."""
    digit_runs = re.compile('[0-9]+')
    return digit_runs.sub('', text)

def tokenize_tweet(text):
    """Tokenise every string of the Series ``text`` into a list of tokens.

    Each element is split with an NLTK ``RegexpTokenizer`` whose pattern
    matches runs of word characters.  The pattern is written as a raw string
    so the backslash is not treated as a (now invalid) string escape.

    Parameters
    ----------
    text : pandas.Series of str

    Returns
    -------
    pandas.Series
        One list of tokens per input element.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    return text.apply(tokenizer.tokenize)

def text_stemming(text):
    """Return a new list with each token of ``text`` run through a PorterStemmer."""
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, text))

def text_lemmatization(text):
    """Return a new list with each token of ``text`` lemmatised.

    A fresh ``WordNetLemmatizer`` is used and ``lemmatize`` is called with
    its default arguments for every token.
    """
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, text))

In [None]:
# defining preprocess function

def preprocess(text):
    """Clean a Series of raw tweets and return them as normalised strings.

    Steps, applied to every element in this order:

    1. lower-case
    2. remove stop words
    3. remove URLs
    4. strip punctuation
    5. collapse repeated characters
    6. remove digits
    7. tokenise, stem and lemmatise
    8. join the tokens back together with single spaces

    Parameters
    ----------
    text : pandas.Series of str
        Raw tweet texts.

    Returns
    -------
    pandas.Series of str
        One cleaned string per input element.
    """
    text = text_lower(text)
    # Stop-word removal stays first: it coerces every element with str(), so
    # the regex-based steps below never receive a non-string value (e.g. NaN).
    text = text.apply(clean_stopwords)
    # URLs are removed while their punctuation is still intact; previously
    # this ran after punctuation stripping, when "www." had already lost the
    # dot that identifies it as a link.
    text = text.apply(clean_URLs)
    text = text.apply(clean_puctuations)
    text = text.apply(clean_repeating_characters)
    text = text.apply(clean_numeric)
    text = tokenize_tweet(text)
    text = text.apply(text_stemming)
    text = text.apply(text_lemmatization)
    # Re-join the token lists so the result can be fed to a text vectoriser.
    text = text.apply(" ".join)
    return text

# Replace the raw tweets with their preprocessed form.
# NOTE(review): this overwrites data['tweet_text'] in place, so re-running the
# cell preprocesses already-preprocessed text; re-run the CSV-loading cell first.
data['tweet_text'] = preprocess(data['tweet_text'])
# Display the updated DataFrame as the cell output.
data

In [None]:
# Splitting the data into train and test:
# features are the preprocessed tweets, the target is the encoded label column.
X, y = data['tweet_text'], data['cyberbullying_type_encoded']
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 41)

In [None]:
# Transforming the data using TF-IDF Vectorizer:
# unigrams and bigrams (ngram_range=(1,2)), at most 500000 features.
# The vectoriser is fitted on the training split only.
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features= 500000)
vectoriser.fit(X_train)
# Optional check of the vocabulary size (left disabled).
# NOTE(review): get_feature_names() may not exist in the installed
# scikit-learn version — confirm before re-enabling.
# print("No. of feature words: ",len(vectoriser.get_feature_names()))

In [None]:
# Dumping the fitted vectoriser to 'tdf_vectorizer' so that it can be reloaded
# at prediction time. The context manager guarantees the file is flushed and
# closed (the bare open() call previously left the handle open).
with open('tdf_vectorizer', 'wb') as vectoriser_file:
    pickle.dump(vectoriser, vectoriser_file)

In [None]:
# Convert both splits to their TF-IDF representation using the vectoriser
# fitted on the training data.
# NOTE(review): X_train / X_test are overwritten here, so this cell should not
# be re-run without first re-running the train/test split cell.
X_train = vectoriser.transform(X_train)
X_test = vectoriser.transform(X_test)

In [None]:
# Model: support vector classifier with a linear kernel and C = 1,
# fitted on the TF-IDF features of the training split.
svm_model_linear = SVC(kernel= 'linear', C = 1).fit(X_train, y_train)
# Predictions for the test split (not used again in the cells shown here).
svm_predictions  = svm_model_linear.predict(X_test)
# Score the fitted model on the held-out test split and print the result.
accuracy = svm_model_linear.score(X_test, y_test)
print(accuracy)

In [None]:
# Dumping the trained model to 'model.bin' so that it can be reloaded at
# prediction time. The context manager guarantees the file is flushed and
# closed (the bare open() call previously left the handle open).
with open('model.bin', 'wb') as model_file:
    pickle.dump(svm_model_linear, model_file)

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # SECURITY: pickle.load can execute arbitrary code contained in the file.
    # Only load artefacts written by this notebook or another trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def custom_input_prediction(texts):
    """Classify one or more texts and report the most frequently predicted class.

    The texts are preprocessed with ``preprocess``, vectorised with the
    pickled vectoriser in ``tdf_vectorizer`` and classified with the pickled
    model in ``model.bin``.  Both artefacts are loaded once per call and all
    texts are predicted in a single batch.

    Parameters
    ----------
    texts : str or iterable of str
        A single text or a collection of texts; anything accepted by
        ``pd.Series``.

    Returns
    -------
    tuple
        ``(most_frequent_type_name, type_counts)`` where ``type_counts`` is a
        dict mapping each predicted label to the number of texts that
        received it, and ``most_frequent_type_name`` is the readable name of
        the label with the highest count (``"Unknown"`` if the label has no
        entry in the name table).  On a tie, the label predicted first wins.

    Raises
    ------
    ValueError
        If ``texts`` contains no elements.
    """
    from collections import Counter

    import nltk
    # Requests the 'omw-1.4' NLTK resource on every call.
    # NOTE(review): presumably required by the WordNet lemmatiser used in
    # preprocess() — confirm whether 'wordnet' must be downloaded as well.
    nltk.download('omw-1.4')

    text_series = pd.Series(texts)
    if text_series.empty:
        raise ValueError("custom_input_prediction() requires at least one text")

    # Loading is loop-invariant, so the artefacts are read from disk only once.
    vectorizer = _load_pickle("tdf_vectorizer")
    model = _load_pickle("model.bin")

    preprocessed_texts = preprocess(text_series)
    text_vectors = vectorizer.transform(preprocessed_texts)
    # tolist() turns the NumPy scalars into plain Python values, which keeps
    # the returned dict easy to print and serialise.
    predictions = model.predict(text_vectors).tolist()

    # Count the occurrences of each predicted type (first-seen order is kept).
    type_counts = dict(Counter(predictions))

    # Determine the most frequent type.
    most_frequent_type = max(type_counts, key=type_counts.get)

    # NOTE(review): this table is hard-coded; verify that it matches the
    # class order of the LabelEncoder used when the model was trained.
    interpretations = {
        0 : "Bullying",
        1 : "Non-Bullying",
        2 : "Age",
        3 : "Ethnicity",
        4 : "Gender",
        5 : "Not Cyberbullying",
        6 : "Other Cyberbullying",
        7 : "Religion"
    }

    most_frequent_type_name = interpretations.get(most_frequent_type, "Unknown")

    return most_frequent_type_name, type_counts  # Return both the most frequent type and the counts

In [None]:
# Sample inputs for a manual check of custom_input_prediction.
# Only `something_2` is passed to the function below; `something` and
# `new_something` are defined but not used in this cell.
something = "My Grandsons are angry about this gender free crap too! 2 in primary 2 @at high school T.he is 16 yr old ASD & got bullied as did a girl in his SEN base. He had to step in as teachers to busy on phones playing games, wee lass would have had nowhere to run if loos unisex!"
something_2 = "But for u its Hinduphobia isnt it? When kashmiri pandits get killed, when a hindu girl gets raped by islamists, when radical islamic terrorism kill people in the world,u still keep quiet as if nothing is happening;but jump on when some1 says anything against islam!! #Hinduphobic"
# NOTE(review): the doubled quotes around acceptable are parsed as adjacent
# string literals, which Python concatenates, so the resulting string contains
# no quote characters at that position — confirm this is acceptable.
new_something = "There was certainly a more ""acceptable"" time for them to be made though in the eyes of our world at large (which also includes other jokes like rape, gaybashing, etc.) Shit, try watching Friends or Seinfeld and watch how many times they throw gay people under the bus for a laugh."
# Prints the (most frequent type name, counts per predicted label) tuple.
print(custom_input_prediction(something_2))