In [31]:
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold
import pickle

In [28]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    ConfusionMatrixDisplay
)

In [3]:
df=pd.read_csv('train.csv')
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,0,0,3,0,0,3,2,(this text should be deleted) !!! RT @mayasolo...
1,1,1,3,0,3,0,1,(this text should be deleted) !!!!! RT @mleew1...
2,2,2,3,0,3,0,1,(this text should be deleted) !!!!!!! RT @UrKi...
3,3,3,3,0,2,1,1,(this text should be deleted) !!!!!!!!! RT @C_...
4,4,4,6,0,6,0,1,(this text should be deleted) !!!!!!!!!!!!! RT...


In [4]:
df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'count', 'hate_speech_count',
       'offensive_language_count', 'neither_count'],inplace=True)

In [5]:
df.head()

Unnamed: 0,class,tweet
0,2,(this text should be deleted) !!! RT @mayasolo...
1,1,(this text should be deleted) !!!!! RT @mleew1...
2,1,(this text should be deleted) !!!!!!! RT @UrKi...
3,1,(this text should be deleted) !!!!!!!!! RT @C_...
4,1,(this text should be deleted) !!!!!!!!!!!!! RT...


In [6]:
df['class'].unique()

array([2, 1, 0])

In [7]:
def remove_urls(text, replacement_text=""):
    pattern = re.compile(r"https?://\S+|www\.\S+")
    return pattern.sub(replacement_text, text)


def remove_twitter_handles(text, replacement_text=""):
    pattern = re.compile(r"@[\w]+")
    return pattern.sub(replacement_text, text)


def remove_twitter_rt(text, replacement_text=""):
    pattern = re.compile(r"^RT|\s+RT\s+")
    return pattern.sub(replacement_text, text)


def remove_alphanumerics(text, replacement_text=" "):
    pattern = re.compile(r"[^A-Za-z0-9']+")
    return pattern.sub(replacement_text, text)


def remove_multiple_whitespaces(text, replacement_text=" "):
    pattern = re.compile(r"\s{2,}")
    return pattern.sub(replacement_text, text)


def decode_html_character_references(text):
    import html
    return html.unescape(text)

In [8]:
df['new_tweet']=df['tweet'].apply(remove_urls).apply(remove_twitter_handles).apply(remove_twitter_rt).apply(remove_alphanumerics).apply(remove_multiple_whitespaces).apply(decode_html_character_references)

In [9]:
df.head()

Unnamed: 0,class,tweet,new_tweet
0,2,(this text should be deleted) !!! RT @mayasolo...,this text should be deleted As a woman you sh...
1,1,(this text should be deleted) !!!!! RT @mleew1...,this text should be deleted boy dats cold tyg...
2,1,(this text should be deleted) !!!!!!! RT @UrKi...,this text should be deleted Dawg You ever fuc...
3,1,(this text should be deleted) !!!!!!!!! RT @C_...,this text should be deleted she look like a t...
4,1,(this text should be deleted) !!!!!!!!!!!!! RT...,this text should be deleted The shit you hear...


In [13]:
def tokenize(doc):
    return word_tokenize(doc)


def remove_stopwords(doc):
    stops = set(stopwords.words("english"))
    stops.add("rt")
    return [token for token in doc if token not in stops]


def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"ain\'t", "are not", phrase)
    phrase = re.sub(r"shan\'t", "shall not", phrase)
    phrase = re.sub(r"ma\'am", "maam", phrase)
    phrase = re.sub(r"y\'all", "you all", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

lemmatizer = WordNetLemmatizer()

In [23]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [24]:
df = (df
    .assign(
        tweet_preprocessed=lambda df_: (
            df_["new_tweet"]
            .str.lower()
            .apply(lambda doc: [decontracted(word) for word in doc.split(" ")])
            .apply(lambda doc: [lemmatizer.lemmatize(word) for word in doc])
            .apply(lambda doc: " ".join(doc))
            .apply(word_tokenize)
            .apply(remove_stopwords)
       )
    )
)

In [25]:
df.tail()

Unnamed: 0,class,tweet,new_tweet,tweet_preprocessed
24778,1,(this text should be deleted) you's a muthaf**...,this text should be deleted you's a muthaf in...,"[text, deleted, muthaf, lie, 8220, right, tl, ..."
24779,2,(this text should be deleted) you've gone and ...,this text should be deleted you've gone and b...,"[text, deleted, gone, broke, wrong, heart, bab..."
24780,1,(this text should be deleted) young buck wanna...,this text should be deleted young buck wanna ...,"[text, deleted, young, buck, wan, na, eat, dat..."
24781,1,(this text should be deleted) youu got wild bi...,this text should be deleted youu got wild bit...,"[text, deleted, youu, got, wild, bitch, tellin..."
24782,2,(this text should be deleted) ~~Ruffled | Ntac...,this text should be deleted Ruffled Ntac Eile...,"[text, deleted, ruffled, ntac, eileen, dahlia,..."


In [26]:
y = df["class"].values
x = df["tweet_preprocessed"].values
x = [(" ").join(doc) for  doc in x]

x = np.asarray(x)
y = np.asarray(y)

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.1, random_state=8)

print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

22304 22304
2479 2479


In [34]:
def return_score(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="weighted")
    precision = precision_score(y_true, y_pred, average="weighted", zero_division=0.0)
    recall = recall_score(y_true, y_pred, average="weighted")
    matrix = confusion_matrix(y_true, y_pred, normalize="true")
    return f"Scores(acc:{acc}, f1:{f1}, precision:{precision})"



In [32]:
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=8)
for i, (train_index, val_index) in enumerate(skf.split(x_train, y_train)):

    print(f"\tFold {i}")

    # create train and val subset
    x_train_count = x_train[train_index]
    y_train_count = y_train[train_index]
    x_val_count = x_train[val_index]
    y_val_count = y_train[val_index]

    # oversample minority class in train set to deal with class imbalance
    # not oversample the val set to keep it untouched
    ros = RandomOverSampler(random_state=42)
    x_train_count, y_train_count = ros.fit_resample(x_train_count.reshape(-1, 1), y_train_count)
    x_train_count = x_train_count.flatten()


    vectorizer = CountVectorizer()
    vectorizer.fit(x_train_count)
    x_train_count = vectorizer.transform(x_train_count)
    x_val_count = vectorizer.transform(x_val_count)

    # Instantiate Random Forest model
    model = RandomForestClassifier()  # Assuming you've imported RandomForestClassifier

    # fit and predict
    model.fit(x_train_count, y_train_count)
    y_pred = model.predict(x_val_count)
    scores_this_fold = return_score(y_val_count, y_pred)

	Fold 0
	Fold 1
	Fold 2
	Fold 3


In [36]:
scores_this_fold

'Scores(acc:0.8945480631276901, f1:0.8953081448756453, precision:0.8971172290794728)'