In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('train.csv')
print("Training set has {} rows and {} columns".format(train.shape[0],train.shape[1]))

test = pd.read_csv('test.csv')
print("Test set has {} rows and {} columns".format(test.shape[0],test.shape[1]))

Training set has 31962 rows and 3 columns
Test set has 17197 rows and 2 columns


## Data Cleaning

In [3]:
import re

def clean_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))

    return df

test_clean = clean_text(test, "tweet")
train_clean = clean_text(train, "tweet")

### Handling Imbalanced data for Hate Speech Detection Model

In [5]:
from sklearn.utils import resample

train_majority = train_clean[train_clean.label==0]
train_minority = train_clean[train_clean.label==1]

train_minority_upsampled = resample(train_minority, 
                                 replace=True,
                                 n_samples=len(train_majority),
                                 random_state=123)

train_upsampled = pd.concat([train_minority_upsampled, train_majority])

train_upsampled['label'].value_counts()

label
1    29720
0    29720
Name: count, dtype: int64

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SGDClassifier()),])

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['tweet'],
                                                    train_upsampled['label'],random_state = 0)

In [8]:
model = pipeline_sgd.fit(X_train, y_train)
y_predict = model.predict(X_test)
from sklearn.metrics import f1_score
f1_score(y_test, y_predict)

0.9694020398640091