# Feature Engineering

## Importing

In [61]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.metrics import roc_curve, auc

from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

import matplotlib.pyplot as plt
import seaborn as sns

## Data load and %

In [51]:
data = pd.read_csv('data/train.csv', nrows=1000)

# Drop rows with null values in comment_text
df_cleaned = data.dropna(subset=['comment_text'])
df_train = df_cleaned[['comment_text','target']]

# Add new column toxic, toxicity >= 0.5 then toxic = 1 otherwise toxic = 0
df_train = df_train.copy()
df_train['toxic'] = np.where(df_train['target'] >= 0.50, 1, 0)

# Just remains toxic and comment_text
df_train_small = df_train.drop(['target'], axis=1)

### Choose % of the data load to use

In [52]:
# Using only 5% of dataset
percentage = 100
df_train_small = df_train_small.sample(frac=percentage / 100, random_state=42)

## Evaluation func

In [53]:
# initialize dataframe that will include the results
results_table = pd.DataFrame()

def evaluate_model(model, X_train,y_train,X_test,y_test,results_df,model_name="", parameters='', comments=''):
    start_time = time.time()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # predict_probab = model.predict_proba(X_test)[:,1]
    duration = time.time() - start_time
    duration_format = f"{int(duration // 60)} minutes and {round(duration % 60, 2)} seconds"

    # Calculating all metrics

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Create a dictionary including the results
    results = {
        'Name': model_name if model_name else model.__class__.__name__,
        'Parameters': parameters,
        'F1-Score': f1,
        'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    # Convert the dictionary to a DataFrame
    new_row_df = pd.DataFrame([results])
    # don't forget to append the result to the results dataframe
    results_df = pd.concat([results_df, new_row_df], ignore_index=True)

    return results_df

# SVM - Support Vector Machine model

### **1. Text Preprocessing techniques**

- **Tokenization**: Splitting text into sentences, words, or other units.
- **Normalization**: Converting text to a standard form (e.g., lowercasing).
- **Stemming and Lemmatization**: Reducing words to their base or root form.
- **Stop Word Removal**: Eliminating common words that add little value in analysis.
- **Handling Special Characters and Punctuation**.

### **2. Feature Extraction techniques**

- **Bag of Words (BoW)**: Represents text data as a bag of words (ignoring sequence/order).
- **TF-IDF (Term Frequency-Inverse Document Frequency)**: Reflects how important a word is to a document in a collection.
- **Word Embeddings**: Vector representations of words (e.g., Word2Vec, GloVe) that capture semantic meanings.
- **Contextual Embeddings**: Advanced embeddings from models like BERT that consider the context of words

### Split train-test

In [56]:
# Separate data into features (X) and labels (y)
X = df_train_small['comment_text']
y = df_train_small['toxic']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## TFDIF - SVM

In [57]:
# apply tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=8000)
X_train_vectorized = tfidf_vectorizer.fit_transform(X_train)
X_test_vectorized = tfidf_vectorizer.transform(X_test)

# Instantiate
svm_model = SVC(kernel='linear', probability=True)

# Fit, predict and evaluate
results_table = evaluate_model(svm_model, X_train_vectorized, y_train, X_test_vectorized, y_test,results_table, parameters="", comments="SVM_tfidf" )

  _warn_prf(average, modifier, msg_start, len(result))


In [58]:
results_table

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,SVC,,0.0,0.5,0.0,0.0,0.945,[[189 0]\n [ 11 0]],0 minutes and 0.45 seconds,SVM_tfidf


In [60]:
y_scores = svm_model.decision_function(X_test_vectorized)
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = roc_auc_score(y_test, y_scores)

print("ROC Curve and AUC:")
print(f"ROC-AUC: {roc_auc}")

ROC Curve and AUC:
ROC-AUC: 0.6714766714766714
