### Contents:
* Preparing the ground
    * Importing Libs and Datasets
    * Data check and preprocessing
    * TF-IDF
* Feature engineering

* Conclusion

### Importing libs and datasets

In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import plotly.express as px
# import seaborn as sns
import os
import string
import re
# import warnings
# warnings.filterwarnings("ignore")
from wordcloud import WordCloud, STOPWORDS
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 500)

In [10]:
df = pd.read_csv("D:/Programming/DB's/Toxic_database/tox_train.csv")
#10 sec loading

### Data check and preprocessing

# heatmap target vs features??

In [12]:
df.drop_duplicates(keep=False,subset=['comment_text'],inplace=True)#no dup

* Dataset duplicates are removed.

In [16]:
print(df.comment_text.tail(1))
df.reset_index(drop=True,inplace=True)
print(df.comment_text.tail(1))

1804873    Students defined as EBD are legally just as di...
Name: comment_text, dtype: object
1770642    Students defined as EBD are legally just as di...
Name: comment_text, dtype: object


* Dropping empty ID's by resetting indexation. Now the last ID is the same as the number of comments.

In [19]:
df['target_class']=(df['target']>=0.5).map(int)#if more than .5 - than toxic.

* Creating Rough toxic classification based on 0.5 target threshold to count clean and toxic comments (class imbalance).

### Training

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
X_train, X_test, y_train, y_test = train_test_split(df['comment_text'], df['target_class'],
                                                    stratify= df['target_class'], 
                                                    test_size=0.15)

* Split to train/test/validation (X,y, by='y')

In [28]:
y_train.value_counts()

0    1384825
1     120221
Name: target_class, dtype: int64

* There are 1384825 false samples & 120221 True ones.

1. X_train - fit_transform
2. X_test - transform

In [30]:
tfidf=TfidfVectorizer(ngram_range=(1,1),max_df=0.8,min_df=10)

In [31]:
tfidf_X_train=tfidf.fit_transform(X_train)

In [32]:
tfidf_X_train

<1505046x58647 sparse matrix of type '<class 'numpy.float64'>'
	with 58154419 stored elements in Compressed Sparse Row format>

* It's 1505046x58647 sparse matrix. It has quite a lot of features, should be about 1000.

In [34]:
tfidf_X_test=tfidf.transform(X_test)

In [35]:
tfidf_X_test

<265597x58647 sparse matrix of type '<class 'numpy.float64'>'
	with 10233698 stored elements in Compressed Sparse Row format>

* It's 265597x58647 sparse matrix. It has same amount of features, again, it should be about 1000.

In [78]:
mean = df['count_word'].mean()
std = df['count_word'].std()
(df['count_word'] - mean)/std

0         -0.711242
1         -0.646214
2         -0.776270
3         -0.754594
4         -0.949679
             ...   
1770638   -0.321072
1770639   -0.797946
1770640   -0.906327
1770641    0.134125
1770642    1.629775
Name: count_word, Length: 1770643, dtype: float64

In [79]:
(x - mean) / std

NameError: name 'x' is not defined

* ???

### * Reminder * 
* training pipeline: data->preprocessing->f/e->model training->evaluetion ->save model+features+metrics (training flow )
* це симуляція inference pipeline 
* inference pipeline: data+load model+load features->preprocessing->prediction  (production flow)

### Feature engineering

In [None]:
df_temp = X_train.to_frame()# To show up, it must be very small matrix, or cutted one.
df_temp

* It must show ok features from EDA step.

In [39]:
#Word count in each comment:
df.loc[:,'count_word']=df["comment_text"].apply(lambda x: len(str(x).split()))
#Unique word count
df.loc[:,'count_unique_word']=df["comment_text"].apply(lambda x: len(set(str(x).split())))
#Letter count
df.loc[:,'count_letters'] = df["comment_text"].apply(lambda x: len(str(x)))# воно рахує з пробілами... так і треба?
#punctuation count
df.loc[:,"count_punctuations"] = df["comment_text"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))# I'll -всередині це ж не пунктуація?

In [41]:
#upper case words count
df.loc[:,"count_words_upper"] = df["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
#title case words count
df.loc[:,"count_words_title"] = df["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
#Number of stopwords
df.loc[:,"count_stopwords"] = df["comment_text"].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))
#Average length of the words
df.loc[:,"mean_word_len"] = df["comment_text"].apply(lambda x: round(np.mean([len(w) for w in str(x).split()]),2))

In [43]:
#derived features
#Word count percent in each comment:
df.loc[:,'word_unique_percent']=df.loc[:,'count_unique_word']*100/df['count_word']
#Punct percent in each comment:
df.loc[:,'punct_percent']=df.loc[:,'count_punctuations']*100/df['count_word']

In [58]:
# direct_features=list(df.columns).remove('comment_text')
print(list(df.columns).remove('comment_text'))
print(list(df.columns))

None
['id', 'target', 'comment_text', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'target_class', 'count_word', 'count_unique_word', 'count_letters', 'count_punctuations', 'count_words_upper', 'count_words_title', 'count_stopwords', 'mean_word_len', 'word_unique_percent', 'punct_percent']


In [64]:
direct_features = (list(df.columns))
direct_features.remove('comment_text')

['id', 'target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'target_class', 'count_word', 'count_unique_word', 'count_letters', 'count_punctuations', 'count_words_upper', 'count_words_title', 'count_stopwords', 'mean_word_len', 'word_unique_percent', 'punct_percent']


In [77]:
x_train_direct_f=df[direct_features].values

In [None]:
# array + sparse matrix

#  train = tfidf_X_train  + x_train_direct_f

In [None]:
# Scale norm, minmax

In [None]:
train_sc = scale.fit_transfrom(train

In [None]:
# svm
svm.fit( train_sc, y_train)
pred_sc = svm.predict(test_sc)

In [80]:
mae=10 
print("  MAE: %s" % mae)

  MAE: 10


In [None]:
pred_sc

In [None]:
scale.inverse(pred_sc)

In [None]:
# auc, conf_matrix, report - metrics

In [None]:
# df_temp
df_temp2=df.loc[:,['id','target','comment_text','target_class']]
df_temp=pd.concat([df_temp2, df_temp], axis=1, join='inner')
df_temp

### CSV Export

In [None]:
os.makedirs('D:/Programming/Repositories/toxic_detection_classification/Model', exist_ok=True)
df_temp.to_csv('D:/Programming/Repositories/toxic_detection_classification/Model/tox_train_featurefull')