### Baseline Model for a toxic classifier

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.dummy import DummyClassifier


In [None]:
# Load the merged dataset and keep only the comment text and its toxicity label.
df = pd.read_csv('../data/merged_data.csv')
data = df.loc[:, ['comment_text', 'toxic']]

In [None]:
# Show the column labels of the reduced frame (the bare last expression is rendered as the cell output).

data.columns

In [None]:
# Report the overall size of the dataset (rows x columns).
n_rows, n_cols = data.shape
print("Dataset contains {} instances of {} variables.".format(n_rows, n_cols))

# Report how many rows carry the label toxic == 1, and their share of all rows.
n_toxic = data[data.toxic == 1].shape[0]
toxic_share = n_toxic / n_rows
print("It contains {} toxic messages ({:.1%} of all).".format(n_toxic, toxic_share))

In [None]:
# Split the comments and labels into a training and a test partition
# (train_test_split holds out 25% for testing by default).
#
# Stratifying on the label keeps the share of toxic comments the same in both
# partitions; without it a random split can leave the test set with a different
# class balance than the training set, which skews the comparison of models.
# NOTE(review): presumably toxic comments are the minority class -- confirm
# against the percentage printed in the dataset summary cell.
#
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    data['comment_text'],
    data['toxic'],
    stratify=data['toxic'],
    random_state=42,
)



In [None]:
# Sanity check: print the shapes of the training texts and the training labels
train_shapes = (X_train.shape, Y_train.shape)
print(*train_shapes)


## CountVectorizer or Bag of Words - Uni-grams

In [None]:
# Binary bag-of-words over single words: with binary=True every non-zero count
# is stored as 1, so a feature only records whether the word occurs in a comment.
vect = CountVectorizer(binary=True)

# Learn the vocabulary and build the sparse document-term matrix in a single
# pass over the training comments (fit followed by transform would tokenize
# the whole training set twice and produce the same matrix).
X_train_vectorized = vect.fit_transform(X_train)


In [None]:
# Fit a logistic-regression classifier on the vectorized training comments
# (max_iter=1500 caps the number of solver iterations).
model = LogisticRegression(max_iter=1500).fit(X_train_vectorized, Y_train)
model  # last expression: display the fitted estimator as the cell output

In [None]:
# Vectorize the test comments with the vocabulary learned on the training set
X_test_vectorized = vect.transform(X_test)

# Hard class-label predictions (kept available for threshold-based metrics)
predictions = model.predict(X_test_vectorized)

# ROC AUC measures how well the model ranks comments, so it must be computed
# from a continuous score per comment. Passing thresholded labels collapses the
# ROC curve to a single operating point and misreports the model's AUC.
# Column 1 of predict_proba is the probability of the class with the greater
# label (model.classes_[1]), which is what roc_auc_score expects for binary targets.
toxic_probabilities = model.predict_proba(X_test_vectorized)[:, 1]
print(roc_auc_score(Y_test, toxic_probabilities))


## CountVectorizer or Bag of Words - Uni-grams and Bi-grams

In [None]:
# Binary bag-of-words over unigrams AND bigrams: ngram_range=(1, 2) extracts
# every single word and every pair of consecutive words.
vect = CountVectorizer(binary=True, ngram_range=(1, 2))

# Learn the vocabulary and build the sparse document-term matrix in a single
# pass over the training comments (fit followed by transform would tokenize
# the whole training set twice and produce the same matrix).
X_train_vectorized = vect.fit_transform(X_train)

# Train the model
model = LogisticRegression(max_iter=1500)
model.fit(X_train_vectorized, Y_train)

# Vectorize the test comments with the vocabulary learned on the training set
X_test_vectorized = vect.transform(X_test)

# Hard class-label predictions (kept available for threshold-based metrics)
predictions = model.predict(X_test_vectorized)

# ROC AUC measures how well the model ranks comments, so it must be computed
# from a continuous score per comment. Passing thresholded labels collapses the
# ROC curve to a single operating point and misreports the model's AUC.
# Column 1 of predict_proba is the probability of the class with the greater
# label (model.classes_[1]), which is what roc_auc_score expects for binary targets.
toxic_probabilities = model.predict_proba(X_test_vectorized)[:, 1]
print(roc_auc_score(Y_test, toxic_probabilities))

## CountVectorizer or Bag of Words - Uni-grams, Bi-grams and Tri-grams

In [None]:
# Binary bag-of-words over unigrams, bigrams AND trigrams: ngram_range=(1, 3)
# extracts every run of one, two or three consecutive words.
vect = CountVectorizer(binary=True, ngram_range=(1, 3))

# Learn the vocabulary and build the sparse document-term matrix in a single
# pass over the training comments (fit followed by transform would tokenize
# the whole training set twice and produce the same matrix).
X_train_vectorized = vect.fit_transform(X_train)

# Train the model
model = LogisticRegression(max_iter=1500)
model.fit(X_train_vectorized, Y_train)

# Vectorize the test comments with the vocabulary learned on the training set
X_test_vectorized = vect.transform(X_test)

# Hard class-label predictions (kept available for threshold-based metrics)
predictions = model.predict(X_test_vectorized)

# ROC AUC measures how well the model ranks comments, so it must be computed
# from a continuous score per comment. Passing thresholded labels collapses the
# ROC curve to a single operating point and misreports the model's AUC.
# Column 1 of predict_proba is the probability of the class with the greater
# label (model.classes_[1]), which is what roc_auc_score expects for binary targets.
toxic_probabilities = model.predict_proba(X_test_vectorized)[:, 1]
print(roc_auc_score(Y_test, toxic_probabilities))

In [None]:
# Vocabulary terms as a numpy array, so they can be indexed with an index array
feature_names = np.array(vect.get_feature_names_out())

# Positions of the model coefficients, ordered from the lowest to the highest value
sorted_coef_index = np.argsort(model.coef_[0])

# The first 10 positions point at the 10 smallest coefficients.
# The [:-11:-1] slice walks backwards from the end, so the 10 largest
# coefficients come out ordered from largest to smallest.
lowest_ten = sorted_coef_index[:10]
highest_ten = sorted_coef_index[:-11:-1]

print("Smallest Coefs:\n{}\n".format(feature_names[lowest_ten]))
print("Largest Coefs: \n{}".format(feature_names[highest_ten]))

## TF-IDF

**Term Frequency - Inverse Document Frequency**

Main idea: TF-IDF measures how important a word is to a document within a collection of texts.

Term Frequency (TF): This is the number of times a word appears in a document, divided by the total number of words in that document. It gives higher value to terms that appear more frequently in a particular document.

Inverse Document Frequency (IDF): This measures the importance of the term across the corpus. It is calculated as the logarithm of the number of documents divided by the number of documents that contain the word. This means common words like 'the', which appear in many documents, will have a lower IDF.

In [None]:
# Minimum document frequency: an integer min_df makes the vectorizer ignore
# terms that appear in fewer than that many training comments.
MIN_DOCUMENT_FREQUENCY = 30
tfidf_vect = TfidfVectorizer(min_df=MIN_DOCUMENT_FREQUENCY)

# Learn the vocabulary and IDF weights from the training comments and turn
# them into a document-term matrix in one step
X_train_tfidf = tfidf_vect.fit_transform(X_train)

# Apply the already-fitted vocabulary and weights to the test comments
X_test_tfidf = tfidf_vect.transform(X_test)


In [None]:
# Initialize and train a logistic-regression model on the TF-IDF features
# (max_iter=1500 caps the number of solver iterations).
model = LogisticRegression(max_iter=1500).fit(X_train_tfidf, Y_train)
model  # last expression: display the fitted estimator as the cell output


In [None]:
# Hard class-label predictions (kept available for threshold-based metrics)
predictions = model.predict(X_test_tfidf)

# ROC AUC measures how well the model ranks comments, so it must be computed
# from a continuous score per comment. Passing thresholded labels collapses the
# ROC curve to a single operating point and misreports the model's AUC.
# Column 1 of predict_proba is the probability of the class with the greater
# label (model.classes_[1]), which is what roc_auc_score expects for binary targets.
toxic_probabilities = model.predict_proba(X_test_tfidf)[:, 1]
print(roc_auc_score(Y_test, toxic_probabilities))


## Dummy classifier (majority-class baseline)

In [None]:
# Majority-class baseline: with strategy="most_frequent" the classifier always
# predicts the most frequent label seen during fit and ignores the input text.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train, Y_train)

# Predict on the held-out comments and score those predictions with ROC AUC
dummy_predictions = dummy_clf.predict(X_test)
dummy_auc = roc_auc_score(Y_test, dummy_predictions)
print(dummy_auc)
