# Feature Engineering


## Imports

In [2]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import numpy as np
import pickle
import time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.tokenize import word_tokenize

from tqdm import tqdm



## Data Extract

In [2]:
# Load only the columns this notebook uses ('target' to derive the label,
# 'comment_text' as the model input). Every other column is discarded by the
# column-selection cell below, so skipping them at parse time saves memory
# and load time without changing the resulting frame.
data_train = pd.read_csv('data/train.csv', usecols=['comment_text', 'target'])


In [3]:
# Number of rows read from train.csv
data_train.shape[0]

1804874

In [4]:
# Binary label: 1 where the target score is at least 0.5, otherwise 0
data_train['toxic'] = data_train['target'].ge(0.5).astype(int)

In [5]:
# Drop rows whose comment text is missing. Rebind the result instead of
# using inplace=True: the outcome is the same, but the statement stays
# chainable and does not silently mutate the frame object.
data_train = data_train.dropna(subset=['comment_text'])


In [6]:
# Restrict the frame to the comment text and its binary label
columns_to_keep = ['comment_text', 'toxic']
data_train = data_train.loc[:, columns_to_keep]

In [7]:
# Rows remaining after discarding comments with missing text
data_train.shape[0]

1804871

## Preprocess

### Part of Speech

In [12]:
# Tokenize a piece of text and tag each token with its part of speech
def pos_tagging(text):
    """Return the NLTK part-of-speech tags for ``text``.

    The text is split into tokens with ``word_tokenize`` and the tokens are
    passed to ``nltk.pos_tag``; the result of that call is returned as-is.

    Parameters
    ----------
    text : str
        The raw text to tag.

    Returns
    -------
    The tagged tokens produced by ``nltk.pos_tag`` (per the NLTK
    documentation, a list of ``(token, tag)`` pairs).
    """
    return nltk.pos_tag(word_tokenize(text))

In [13]:
# Create a new column holding the POS tags of every comment.
# Tagging ~1.8M comments is the slowest step in this notebook, so register
# tqdm's pandas integration here and use progress_apply to get a progress
# bar instead of a silent, long-running apply.
tqdm.pandas()
data_train['pos_tags'] = data_train['comment_text'].progress_apply(pos_tagging)

In [15]:
# Register tqdm's pandas integration. This adds progress_apply /
# progress_map methods to pandas objects; plain .apply calls are not
# affected and will not display a progress bar.
tqdm.pandas()

In [16]:
# Collapse each row's tagged tokens into one space-separated string of tags
# (the tag is the second element of every pair)
data_train['pos_tags_str'] = data_train['pos_tags'].apply(
    lambda tagged: ' '.join(pair[1] for pair in tagged)
)


In [17]:
# Inspect the column index to confirm the two POS columns were added
data_train.columns

Index(['comment_text', 'toxic', 'pos_tags', 'pos_tags_str'], dtype='object')

In [18]:
# Persist the tagged frame to disk without writing the index.
# NOTE(review): 'pos_tags' holds Python sequences of tagged tokens, which CSV
# stores as their text representation. Reading this file back yields plain
# strings for that column; confirm this is intended.
data_train.to_csv(path_or_buf='data/pos_data.csv', index=False)