In [1]:
import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
import sklearn
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import imblearn
from imblearn.combine import SMOTEENN 
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import f1_score
from numpy import mean
from numpy import std
import re

## Data Collection

In [2]:
#Import hate-speech annotated database
def load_data(path):
    """
    Function load_data, reads the csv file at the given path into a dataframe.
    The file is parsed with header=None: no row is interpreted as column names,
    so the first line of the file is kept as data and the columns are labelled
    with the integers 0, 1, 2, ...
    :param path: location of the csv file, normally just the file name in the current directory.
    :type path: string
    :return type: pandas dataframe
    :return: dataframe holding every row of the file
    """
    return pd.read_csv(path, header=None)


## Data Cleaning

In [3]:
# Compiled once at definition time instead of once per row.
_TWEET_NOISE_RE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")
_LEADING_RT_RE = re.compile(r"^rt")


def _scrub_tweet(text):
    """Remove mentions, URLs, non-alphanumeric characters and a leading "rt" from one lower-cased string."""
    text = _TWEET_NOISE_RE.sub("", text).strip()
    # Second pass: once leading punctuation/mentions are gone and the string is
    # stripped, an "rt" marker may now sit at position 0 where "^rt" can match.
    text = _LEADING_RT_RE.sub("", text).strip()
    # Collapse every run of whitespace to a single space.
    return ' '.join(text.split())


def clean_text(df, text_field):
    """
    Function clean_text, recodes the class labels and cleans the text column of the dataset (normally tweets).

    Labels in column 0 are collapsed to two classes: 0 and 1 both become 1, and 2 stays 2
    (the rest of this notebook treats 1 as offensive and 2 as non-offensive).
    The text column is lower-cased, stripped of @mentions, URLs, every character that is
    not a letter, digit, space or tab, and a leading "rt", and has its whitespace collapsed.

    The dataframe is modified IN PLACE and also returned, so callers may either use the
    return value or keep using the object they passed in.

    :param df: df, type dataframe to be cleaned; column 0 holds the class labels
    :param text_field: text_field, label of the column holding the strings (an integer for header-less csv data)
    :return: df, the same dataframe object, now containing the cleaned data
    :raises ValueError: if column 0 contains a label other than 0, 1 or 2 (df is left untouched)
    :raises TypeError: if the text column contains non-string values (e.g. NaN)
    """
    labels = df[0]
    is_non_offensive = labels == 2
    is_offensive = (labels == 0) | (labels == 1)
    unknown = ~(is_offensive | is_non_offensive)
    if unknown.any():
        # Fail loudly with the offending values; the previous loop silently skipped
        # them and then failed with an unrelated "length mismatch" error.
        examples = labels[unknown].unique()[:5].tolist()
        raise ValueError(
            "clean_text expects labels 0, 1 or 2 in column 0; found e.g. %r" % (examples,)
        )
    # False -> 1 (labels 0 and 1), True -> 2 (label 2).
    df[0] = is_non_offensive.astype("int64") + 1

    df[text_field] = df[text_field].str.lower().apply(_scrub_tweet)
    return df


In [4]:
def random_undersampling(df, x, class_var):
    """
    Function random_undersampling performs random undersampling of dataframe df by removing
    up to x randomly chosen rows whose label in column 0 equals class_var.

    Rows are selected by position, so the function works for any index (not only the
    default 0..n-1 one). The remaining rows keep their original order and index labels.
    The input dataframe itself is never modified.

    Randomness comes from the standard ``random`` module; call ``random.seed(...)``
    beforehand for a reproducible selection.

    :param df: df, type dataframe to be undersampled; column 0 holds the class labels
    :param x: x, type integer for the number of samples to be removed from dataset.
              If fewer than x rows of the class exist, all of them are removed;
              if x is zero or negative, nothing is removed.
    :param class_var: class_var, type integer representing the class to be removed (1 for offensive 2 for non-offensive)
    :return: df, dataframe containing undersampled data (the original object when nothing is removed)
    """
    candidates = [pos for pos, label in enumerate(df[0]) if label == class_var]
    n_remove = min(x, len(candidates))
    if n_remove <= 0:
        # Guard: the old loop only compared the counter to x *after* incrementing,
        # so x == 0 could end up deleting every row of the class.
        return df

    doomed = set(random.sample(candidates, n_remove))
    survivors = [pos for pos in range(len(df)) if pos not in doomed]
    # One positional selection instead of one full-frame copy per dropped row.
    return df.iloc[survivors]
        

## Data Splitting, Testing and Validation

In [5]:
#cross validation
def test_model(df):
    """
    Function test_model, performs a 10-fold cross validation of the tf-idf + SVC pipeline
    and prints the mean and standard deviation of the per-fold accuracy.
    :param df: df to be tested; column 1 holds the text and column 0 the class labels
    :prints: 'Accuracy: <mean> (<std>)' over the 10 folds
    """
    pipeline = Pipeline(steps=[('tfidf', TfidfVectorizer()),
                               ('model', SVC(C=10,
                                             gamma=0.1,
                                             kernel='sigmoid',
                                             random_state=42))])
    # NOTE(review): KFold is used without shuffle, so each fold is a contiguous
    # slice of df in its current row order. If df is ordered (e.g. rows appended
    # at the end), consider KFold(n_splits=10, shuffle=True, random_state=42).
    cv = KFold(n_splits=10)

    # n_jobs=-1: evaluate the folds in parallel on all available cores.
    scores = cross_val_score(pipeline, df[1], df[0], scoring='accuracy', cv=cv, n_jobs=-1)
    # report performance
    print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))


In [6]:
def run_model(df, random_state=42):
    """
    Function run_model, which trains the model using tf-idf vectorizing and an SVC for labeled classification.

    The data is split into a training and a held-out test portion (train_test_split's
    default proportions); the pipeline is fitted on the training portion only and the
    F1 score for class 1 on the test portion is printed.

    :param df: dataframe with the text in column 1 and the class labels in column 0
    :param random_state: seed used for both the train/test split and the SVC, so that
                         repeated runs on the same data print the same score and return
                         the same model. Pass None for a different random split each run.
    :return: Returns model, the fitted pipeline (vectorizer + classifier)
    """
    pipeline = Pipeline(steps=[('tfidf', TfidfVectorizer()),
                               ('model', SVC(C=10,
                                             gamma=0.1,
                                             kernel='sigmoid',
                                             random_state=random_state))])
    # Seed the split as well: previously only the classifier was seeded, so the
    # reported F1 and the returned model varied from run to run.
    X_train, X_test, y_train, y_test = train_test_split(df[1], df[0],
                                                        random_state=random_state)
    model = pipeline.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    # pos_label=1 is f1_score's default; spelled out because the labels are 1/2
    # and the score reported is the one for class 1.
    print(f1_score(y_test, y_predict, pos_label=1))
    return model


In [7]:
def predict(model, string):
    """
    Function predict, asks the trained model to classify a single piece of text.
    The text is handed to the model exactly as given; no cleaning is applied here.
    :param model: fitted object exposing a predict method that accepts a list of strings
    :param string: string, type string representing the text to be tested
    :return: Returns whatever model.predict yields for a one-element batch
             (an array with one integer: 1 for offensive, 2 for non offensive)
    """
    batch = [string]
    return model.predict(batch)

In [8]:
def main(path='labeled_data_original.csv', out_path='labeled_data.csv', n_remove=10000):
    """
    Function main builds the working dataset from the annotated tweet file and returns
    our initial model trained on it.

    Steps: load the csv, clean it, randomly remove n_remove rows of class 1,
    print the resulting class counts, save the dataset to out_path (no index, no
    header row) and train the model.

    :param path: csv file with the annotated tweets (labels in column 0, text in column 1)
    :param out_path: where the cleaned, undersampled dataset is written. write_newline
                     reads 'labeled_data.csv', so keep the default if it will be used.
    :param n_remove: number of class-1 rows to remove when undersampling
    :return: Returns model, an object encapsulates our trained model
    """
    df = load_data(path)
    # clean_text modifies df in place and returns the same object.
    df = clean_text(df, 1)
    df = random_undersampling(df, n_remove, 1)
    print(df[0].value_counts())
    df.to_csv(out_path, index=False, header=False)
    model = run_model(df)
    return model

def write_newline(line):
    """
    Function write_newline, takes a new offensive line, cleans it the same way the
    dataset was cleaned, appends it to 'labeled_data.csv' with class 1, and retrains the model.
    :param line: type string, the raw text to be added as an offensive (class 1) example
    :return: Returns model now trained on the new offensive data given.
    """
    # Same cleaning steps as applied to the dataset's text column.
    line = line.lower()
    line = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", line).strip()
    line = re.sub(r"^rt", "", line).strip()
    line = ' '.join(line.split())
    path = 'labeled_data.csv'
    df = load_data(path)
    # Texts that were cleaned down to an empty string are read back from the csv
    # as NaN, which the tf-idf vectorizer rejects; restore them to "".
    df[1] = df[1].fillna("")

    # Special case kept as-is: the cleaned line "chocolate" is added 20 times
    # instead of once. NOTE(review): presumably a demo hook to weight that word -
    # confirm it is still wanted.
    copies = 20 if line == "chocolate" else 1
    new = pd.DataFrame({0: [1] * copies, 1: [line] * copies})

    # Fix: the non-"chocolate" branch used to concatenate an undefined name
    # (df2) and raised NameError for every ordinary line.
    df = pd.concat([df, new], ignore_index=True)
    df.to_csv(path, index=False, header=False)
    model = run_model(df)
    return model
    
    
    