# Feature Engineering


## Importss

In [1]:
import pandas as pd
import nltk
import time

from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression



## Data Extract

In [None]:
# read from csv in data
data = pd.read_csv('data/all_data.csv')

In [None]:
# remove null comment_text
data = data.dropna(subset=['comment_text'])

In [None]:
data['toxic'] = (data['toxicity'] >= 0.5).astype(int)

## Function for calculations

In [2]:
# initialize dataframe that will include the results
results_df = pd.DataFrame()

def evaluate_model(model, X_train,y_train,X_test,y_test, model_name="", parameters='', comments=''):
    start_time = time.time()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    duration = time.time() - start_time
    duration_format = f"{int(duration // 60)} minutes and {round(duration % 60, 2)} seconds"
    predicted_probs = model.predict_proba(X_test)[:, 1]

    # Calculate metrics using probabilities
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predicted_probs)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Create a dictionary including the results
    results = {
        'Name': model_name if model_name else model.__class__.__name__,
        'Parameters': parameters,
        'F1-Score': f1,
        'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    return results

## Part Of Speech Tagging

- Part-of-Speech (POS) tagging assigns grammatical tags (such as noun, verb, adjective, etc.) to each word in a text to indicate its syntactic role within a sentence.
- POS tagging is useful for toxic comment classification as it provides insights into the grammatical structure of text, helping algorithms understand the linguistic context and identify potentially harmful language patterns more effectively.

In [None]:
# Function to perform POS tagging
def pos_tagging(text):
    tokens = word_tokenize(text)  # Tokenize the text
    pos_tags = nltk.pos_tag(tokens)  # Perform POS tagging
    return pos_tags

In [None]:
#create new column for pos

data['pos_tags'] = data['comment_text'].apply(pos_tagging)

## RFC

In [None]:
# Flatten the list of tuples into a string format
data['pos_tags_str'] = data['pos_tags'].apply(lambda tags: ' '.join([tag[1] for tag in tags]))


In [None]:
data.to_csv('data/pos_data.csv', index=False)
# save to csv so that can new columns

In [3]:
# take data from pos_data
#data = pd.read_csv('data/pos_data.csv',nrows=100000)
data = pd.read_csv('data/pos_data.csv')


In [4]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['pos_tags_str'])

In [5]:
data['toxic'] = (data['toxicity'] >= 0.5).astype(int)
y = data['toxic']


In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# Initialize the RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# Use the evaluate_model function to assess the model
results = evaluate_model(rfc, X_train, y_train, X_test, y_test, model_name="Random Forest Classifier",parameters='n_estimators=100',comments="POS + RFC")

new_row_df = pd.DataFrame([results])
results_df = pd.concat([results_df, pd.DataFrame(new_row_df)], ignore_index=True)



In [8]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,Random Forest Classifier,n_estimators=100,0.028689,0.600081,0.430368,0.014839,0.918893,[[366989 634]\n [ 31801 479]],188 minutes and 18.76 seconds,POS + RFC


In [9]:
# Logistic Regression

# Initialize the Logistic Regression model
log_reg = LogisticRegression(random_state=42)

# Fit the model on the training data
log_reg.fit(X_train, y_train)

# Evaluate the model on the test set
log_reg_results = evaluate_model(log_reg, X_train, y_train, X_test, y_test, model_name="Logistic Regression", parameters='binary', comments="POS + Logistic Regression")

# Create a DataFrame with the results of Logistic Regression
new_row_log_reg = pd.DataFrame([log_reg_results])

# Concatenate the results to the existing results DataFrame
results_df = pd.concat([results_df, new_row_log_reg], ignore_index=True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,Random Forest Classifier,n_estimators=100,0.028689,0.600081,0.430368,0.014839,0.918893,[[366989 634]\n [ 31801 479]],188 minutes and 18.76 seconds,POS + RFC
1,Logistic Regression,binary,6.2e-05,0.616665,0.142857,3.1e-05,0.919268,[[367617 6]\n [ 32279 1]],0 minutes and 7.35 seconds,POS + Logistic Regression


## TF-IDF

- TF-IDF (Term Frequency-Inverse Document Frequency) measures the importance of a word in a document relative to a collection of documents, helping the classifier by highlighting words that are frequent in a specific document but less common across the entire dataset, potentially indicating more discriminative or informative terms for classification of toxic comments.

### 1. Logistic regression

In [11]:
# Assuming 'comment_text' is the column containing text data and 'toxic' is the target variable
text_data = data['comment_text']
target = data['toxic']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(text_data, target, test_size=0.2, random_state=42)

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000, lowercase=True)  # Convert text to lowercase during vectorization

# Fit and transform on training data, transform test data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize and train Logistic Regression model
lr = LogisticRegression(max_iter=1000)  # You can set different parameters
lr.fit(X_train_tfidf, y_train)


In [12]:
# Call evaluate_model function
lr_results = evaluate_model(lr, X_train_tfidf, y_train, X_test_tfidf, y_test, model_name="Logistic Regression", parameters='max_iter=1000', comments='TF-IDF + LR')

# Convert the dictionary of results into a DataFrame
lr_results_df = pd.DataFrame([lr_results])

# Append the results to the main results DataFrame (results_df)
results_df = pd.concat([results_df, lr_results_df], ignore_index=True)

In [13]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,Random Forest Classifier,n_estimators=100,0.028689,0.600081,0.430368,0.014839,0.918893,[[366989 634]\n [ 31801 479]],188 minutes and 18.76 seconds,POS + RFC
1,Logistic Regression,binary,6.2e-05,0.616665,0.142857,3.1e-05,0.919268,[[367617 6]\n [ 32279 1]],0 minutes and 7.35 seconds,POS + Logistic Regression
2,Logistic Regression,max_iter=1000,0.537413,0.911949,0.78071,0.409727,0.943064,[[363908 3715]\n [ 19054 13226]],0 minutes and 28.31 seconds,TF-IDF + LR


### 2. RFC

In [14]:
# Initialize and train Random Forest Classifier model
rfc = RandomForestClassifier(n_estimators=10, random_state=42)  # You can set different parameters
rfc.fit(X_train_tfidf, y_train)

In [15]:

# Call evaluate_model function for RFC
rfc_results = evaluate_model(rfc, X_train_tfidf, y_train, X_test_tfidf, y_test, model_name="Random Forest Classifier", parameters='n_estimators=10', comments='TF-IDF + RFC')

# Convert the dictionary of results into a DataFrame
rfc_results_df = pd.DataFrame([rfc_results])

# Append the results to the main results DataFrame (results_df)
results_df = pd.concat([results_df, rfc_results_df], ignore_index=True)

In [16]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,Random Forest Classifier,n_estimators=100,0.028689,0.600081,0.430368,0.014839,0.918893,[[366989 634]\n [ 31801 479]],188 minutes and 18.76 seconds,POS + RFC
1,Logistic Regression,binary,6.2e-05,0.616665,0.142857,3.1e-05,0.919268,[[367617 6]\n [ 32279 1]],0 minutes and 7.35 seconds,POS + Logistic Regression
2,Logistic Regression,max_iter=1000,0.537413,0.911949,0.78071,0.409727,0.943064,[[363908 3715]\n [ 19054 13226]],0 minutes and 28.31 seconds,TF-IDF + LR
3,Random Forest Classifier,n_estimators=10,0.466549,0.842187,0.762223,0.336152,0.93795,[[364238 3385]\n [ 21429 10851]],17 minutes and 28.47 seconds,TF-IDF + RFC
