In [1]:
import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
import sklearn
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import imblearn
from imblearn.combine import SMOTEENN 
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import f1_score
from numpy import mean
from numpy import std
import re

## Data Collection

In [2]:
#Import hate-speech annotated database
def load_data(path):
    df = pd.read_csv(path,header=None)
    return df


## Data Cleaning

In [3]:
def clean_text(df, text_field):
    newclass = []
    for i in df[0]:
        if i == 0 or i == 1:
            newclass.append(1)
        if i == 2:
            newclass.append(2)
    df[0] = newclass
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem:re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem).strip())
    df[text_field] = df[text_field].apply(lambda elem:re.sub(r"^rt", "", elem).strip())
    df[text_field] = df[text_field].apply(lambda elem: ' '.join(elem.split()))
    return df


In [4]:
def random_undersampling(df, x,class_var):
    array_list = list(range(0, len(df)))
    random.shuffle(array_list)
    count = 0
    for i in array_list:
        if df[0][i] == class_var:
            df = df.drop(i)
            count+=1
        if count == x:
            break
    return df
        

## Data Splitting, Testing and Validation

In [5]:
#cross validation
def test_model(df):
    pipeline = Pipeline(steps=[('tfidf', TfidfVectorizer()),
                               ('model', SVC(C=10, 
                                             gamma=0.1, 
                                             kernel='sigmoid', 
                                             random_state=42))])
    cv = KFold(n_splits=10, random_state=None)

    acc_score = []
    scores = cross_val_score(pipeline, df[1],df[0], scoring='accuracy', cv=cv, n_jobs=-1)
    # report performance
    print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))


In [6]:
def run_model(df):
    pipeline = Pipeline(steps=[('tfidf', TfidfVectorizer()),
                               ('model', SVC(C=10, 
                                             gamma=0.1, 
                                             kernel='sigmoid', 
                                             random_state=42))])
    X_train, X_test, y_train, y_test = train_test_split(df[1],df[0])
    model = pipeline.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    print(f1_score(y_test, y_predict))
    return model


In [7]:
def predict(model, string):
    return model.predict([string])

In [20]:
def main():
    path = 'labeled_data_original.csv'
    df = load_data(path)
    clean_text(df,1)
    df = random_undersampling(df, 10000,1)       
    print(df[0].value_counts())
    df.to_csv('labeled_data.csv',index=False,header=False)
    model = run_model(df)
    return model




def write_newline(line):
    line = line.lower()
    line = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", line).strip()
    line = re.sub(r"^rt", "", line).strip()
    line =' '.join(line.split())
    path = 'labeled_data.csv'
    df = load_data(path)

    if line == "chocolate":

        for i in range(20):
            new = pd.DataFrame({0:[1],1:[line]})

            df = pd.concat([df,new])
    else:
        new = pd.DataFrame({0:[1],1:[line]})

        df = pd.concat([df,df2])
    print(df)
    df.to_csv('labeled_data.csv',index=False,header=False)
    model = run_model(df)
    return model
    
    
    

In [21]:
main()
write_newline("chocolate")

1    10620
2     4163
Name: 0, dtype: int64
0.9576975082093877
    0                                                  1
0   2  as a woman you shouldnt complain about cleanin...
1   1  the shit you hear about me might be true or it...
2   1  8220 cause im tired of you big bitches coming ...
3   1  keeks is a bitch she curves everyone lol i wal...
4   1                     murda gang bitch its gang land
.. ..                                                ...
0   1                                          chocolate
0   1                                          chocolate
0   1                                          chocolate
0   1                                          chocolate
0   1                                          chocolate

[14803 rows x 2 columns]
0.9588728103579589


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('model',
                 SVC(C=10, gamma=0.1, kernel='sigmoid', random_state=42))])