In [1]:
import pandas as pd

import sklearn

## Data Collection

In [2]:
#Import hate-speech annotated database
df = pd.read_csv('labeled_data.csv',header=None)
print(df)

       0                                                  1
0      2  !!! RT @mayasolovely: As a woman you shouldn't...
1      1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2      1  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3      1  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4      1  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...   ..                                                ...
24778  1  you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779  2  you've gone and broke the wrong heart baby, an...
24780  1  young buck wanna eat!!.. dat nigguh like I ain...
24781  1              youu got wild bitches tellin you lies
24782  2  ~~Ruffled | Ntac Eileen Dahlia - Beautiful col...

[24783 rows x 2 columns]


## Data Cleaning

In [3]:
#Clean Data and Classes

newclass = []
for i in df[0]:
    if i == 0 or i == 1:
        newclass.append(1)
    if i == 2:
        newclass.append(2)
df[0] = newclass
#There is now no "hate speech" class and text is deemed as either offensive or inoffensive




In [4]:
#we changed the tweets into lower case. Also,
#we removed all the URLs, usernames, white spaces, hashtags,
#punctuations and stop-words using pattern matching
#techniques from the collected tweets
#We will use RE to pattern match to just return the tweets 
import re
def clean_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem:re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem).strip())
    #df[text_field] = df[text_field].apply(lambda elem: [word for word in word_tokenize(elem) if not word in stopwords.words()])
    return df

df = clean_text(df,1)



## Feature Extraction

In [5]:
#Feature Extraction / Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

pipeline_svc = Pipeline([
    ('tfidfT',  TfidfTransformer()),
    ('svc', SVC())])

## Data Splitting, Testing and Validation

In [6]:
import imblearn
from imblearn.combine import SMOTEENN 


from sklearn.model_selection import train_test_split
smt = SMOTEENN(random_state=42)
X_train, X_test, y_train, y_test = train_test_split(df[1],df[0])
vectorizer = TfidfVectorizer(stop_words='english',  ngram_range=(2, 2))
vectorizer.fit(X_train.values.ravel())
X_train=vectorizer.transform(X_train.values.ravel())
X_test=vectorizer.transform(X_test.values.ravel())
X_train=X_train.toarray()
X_test=X_test.toarray()
X_train, y_train = smt.fit_resample(X_train, y_train)
y_train.value_counts()




2    12585
1     1547
Name: 0, dtype: int64

In [10]:
model = pipeline_svc.fit(list(X_train),list(y_train))
y_predict = model.predict(list(X_test))
from sklearn.metrics import f1_score
f1_score(y_test, y_predict)

0.18925110132158593

In [19]:
print(model.predict(vectorizer.transform([""])))

[2]
