# Get data Ready!

## N_rows and percentage

In [1]:
# Decide how many lines you want to run and the % of it you want to use
# Total lines in the file: 360835
n_rows = 360000
percentage_rows = 10

## Importing and setting up

In [2]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm # progress bar

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.svm import SVC

import matplotlib.pyplot as plt
import seaborn as sns

# Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import spacy # (object oriented)
import nltk # natural language tool kit (string oriented)
from nltk.tokenize import word_tokenize
from nltk import ngrams
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

from gensim.models import Word2Vec

tqdm.pandas()

#nltk.download('stopwords')
#nltk.download('wordnet')

#Run this lines only once
#!python -m spacy download en_core_web_sm
#!python -m spacy download en_core_web_md
#!python -m spacy download en_core_web_lg #587.7 MB

In [3]:
# Load data
data = pd.read_csv('data/undersampled_data_60_40.csv', nrows=n_rows)
# copy data
df = data.copy()
# Using only # % of datset
df = df.sample(frac=percentage_rows / 100, random_state=42)
# before train_split:
df = df.dropna(subset=['stopwords_punct_lemma'])

In [4]:
# Get counts and percentages
counts, percentages = df['toxic'].value_counts(), df['toxic'].value_counts(normalize=True) * 100
# Display counts and percentages
result_df = pd.DataFrame({'Count': counts, 'Percentage': percentages})
result_df['Percentage'] = result_df['Percentage'].map('{:.2f}%'.format)
print(result_df)

       Count Percentage
toxic                  
0      21523     59.87%
1      14426     40.13%


In [5]:
df.head()

Unnamed: 0,comment_text,toxic,stopwords_punct_lemma,vector_spacy,pos_tags,pos_tags_str
76637,If only the clintons were held to the same sta...,0,clinton hold standard press hold trump agree,[-1.2594286 1.6080885 -2.1067002 2.376314...,"[('If', 'IN'), ('only', 'RB'), ('the', 'DT'), ...",IN RB DT NNS VBD VBN TO DT JJ NN IN DT NN VBZ ...
64372,Time to bake the doughboy.,0,time bake doughboy,[ 2.0744269e-01 2.2579332e-01 -4.8536677e-02 ...,"[('Time', 'NN'), ('to', 'TO'), ('bake', 'VB'),...",NN TO VB DT NN .
252387,"Yeah, but they're white. It's totally cool and...",1,yeah white totally cool okay discriminate whit...,[-2.457021 0.44777998 -3.158751 0.608898...,"[('Yeah', 'UH'), (',', ','), ('but', 'CC'), ('...","UH , CC PRP VBP JJ . PRP VBZ RB JJ CC JJ TO VB..."
251573,"I agree with you, but his choice of language i...",1,agree choice language damnable find way expres...,[-0.39756107 1.039007 -0.66458005 -1.212819...,"[('I', 'PRP'), ('agree', 'VBP'), ('with', 'IN'...","PRP VBP IN PRP , CC PRP$ NN IN NN VBZ JJ . PRP..."
318742,The false theory of evolution is facts? I am...,0,false theory evolution fact advocate school...,[-0.7768647 0.87667793 -1.2914572 -0.622957...,"[('The', 'DT'), ('false', 'JJ'), ('theory', 'N...",DT JJ NN IN NN VBZ NNS . PRP VBP VBG IN NN NN ...


## Evaluation Func that fits, predict and evaluate

In [6]:
# initialize dataframe that will include the results
results_table = pd.DataFrame()

In [7]:
def evaluate_model(model, X_train,y_train,X_test,y_test,results_df,model_name="", parameters='', comments=''):
    start_time = time.time()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    predict_probab = model.predict_proba(X_test)[:,1]
    duration = time.time() - start_time
    duration_format = f"{int(duration // 60)} minutes and {round(duration % 60, 2)} seconds"

    # Calculating all metrics
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predict_probab)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Create a dictionary including the results
    results = {
        'Name': model_name if model_name else model.__class__.__name__,
        'Parameters': parameters,
        'F1-Score': f1,
        'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    # Convert the dictionary to a DataFrame
    new_row_df = pd.DataFrame([results])
    # don't forget to append the result to the results dataframe
    results_df = pd.concat([results_df, new_row_df], ignore_index=True)

    return results_df

In [8]:
def evaluate_model_clean(model, X_train,y_train,X_test,y_test,results_df,model_name="", parameters='', comments=''):
    start_time = time.time()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    #predict_probab = model.predict_proba(X_test)[:,1]
    duration = time.time() - start_time
    duration_format = f"{int(duration // 60)} minutes and {round(duration % 60, 2)} seconds"

    # Calculating all metrics
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    #roc_auc = roc_auc_score(y_test, predict_probab)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Create a dictionary including the results
    results = {
        'Name': model_name if model_name else model.__class__.__name__,
        'Parameters': parameters,
        'F1-Score': f1,
        #'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    # Convert the dictionary to a DataFrame
    new_row_df = pd.DataFrame([results])
    # don't forget to append the result to the results dataframe
    results_df = pd.concat([results_df, new_row_df], ignore_index=True)

    return results_df

## Conversion of spacy vector from string to numerical value

In [1]:
def convert_string_to_array(string):
    try:
        # Trying to convert using ast.literal_eval
        return np.array(ast.literal_eval(string))
    except:
        # If ast.literal_eval fails, use an alternative method
        # Remove brackets and split the string by space
        cleaned_string = string.strip('[]')
        split_strings = cleaned_string.split()
        # Convert each split string to float and then to a numpy array
        return np.array([float(i) for i in split_strings])

In [10]:
'''# Apply the function
df['vector_spacy'] = df['vector_spacy'].apply(convert_string_to_array)
# Check the result
print(df['vector_spacy'][0][0], type(df['vector_spacy'][0][0]))'''

"# Apply the function\ndf['vector_spacy'] = df['vector_spacy'].apply(convert_string_to_array)\n# Check the result\nprint(df['vector_spacy'][0][0], type(df['vector_spacy'][0][0]))"

# -------------

# OK - SVM - stopwords_punct_lemma - Tfidf

In [11]:
X_stop = df['stopwords_punct_lemma']
y_stop = df['toxic']

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform text data into TF-IDF features
X_tfidf = tfidf_vectorizer.fit_transform(X_stop)

# Split the data into training and testing sets
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_tfidf, y_stop, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
model = SVC(kernel='rbf', C=10, gamma=0.7)#, probability=True)

# Call the evaluate_model function
results_table = evaluate_model_clean(model, X_train_s, y_train_s, X_test_s, y_test_s,
                                     results_table, model_name="SVC",parameters="", 
                                     comments="SVM - stopwords_punct_lemma - TFIDf")

In [12]:
results_table

Unnamed: 0,Name,Parameters,F1-Score,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,SVC,,0.805718,0.858594,0.758978,0.852573,[[3932 362]\n [ 698 2198]],6 minutes and 19.54 seconds,SVM - stopwords_punct_lemma - TFIDf


# OK -  SVM -  stopwords_punct_lemma - word2vec

In [None]:
sentences = [text.split() for text in df['stopwords_punct_lemma']]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Transform each text into an average Word2Vec vector
word2vec_vectors = []
for text in sentences:
    vectors = [word2vec_model.wv[word] for word in text if word in word2vec_model.wv]
    if vectors:
        text_vector = np.mean(vectors, axis=0)
        word2vec_vectors.append(text_vector)
    else:
        # Handle cases where there are no words found in the Word2Vec model
        word2vec_vectors.append(np.zeros(word2vec_model.vector_size))

# Convert the list of Word2Vec vectors into a matrix
X_word2vec = np.vstack(word2vec_vectors)

# Split the data into training and testing sets
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(X_word2vec, y_stop, 
                                                                    test_size=0.2, random_state=42)

# Initialize the model
svm_model_w2v = SVC(kernel='rbf', C=10, gamma=0.7)#, probability=True)

# Call the evaluate_model function for Word2Vec
results_table = evaluate_model_clean(svm_model_w2v, X_train_w2v, y_train_w2v, X_test_w2v, y_test_w2v,
                                     results_table, model_name="SVC",parameters="",
                                     comments="SVM - stopwords_punct_lemma - Word2Vec")

In [None]:
results_table

# OK - SVM - stopwords_punct_lemma - count vec


In [None]:
X_c = df['stopwords_punct_lemma']
y_c = df['toxic']

# Initialize CountVectorizer
count_vectorizer = CountVectorizer()

# Transform text data into CountVectorized features
X_count = count_vectorizer.fit_transform(X_c)

# Split the data into training and testing sets
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_count, y_c, test_size=0.2, random_state=42)

# Initialize the model
svm_model_c = SVC(kernel='rbf', C=10, gamma=0.7)#, probability=True)

# Call the evaluate_model function for CountVectorizer
results_table = evaluate_model_clean(svm_model_c, X_train_c, y_train_c, X_test_c, y_test_c,
                                     results_table, model_name="SVM", parameters="", 
                                     comments="SVM - stopwords_punct_lemma - CountVec")

In [None]:
results_table

# OK - SVM + POS + count vectorizer

In [None]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['pos_tags_str'])
y = df['toxic']

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
svm_model = SVC(kernel='rbf', C=10, gamma=0.7)#, probability=True)

# Use the evaluate_model function 
results_table = evaluate_model_clean(svm_model, X_train, y_train, X_test, y_test,
                          results_table,model_name="SVM",parameters="",
                          comments="POS + RFC")

In [None]:
results_table

# OK - SVM + POS + TFIDF

In [None]:
vectorizer = TfidfVectorizer()
X_pos_tf = vectorizer.fit_transform(df['pos_tags_str'])
y_pos_tf = df['toxic']

In [None]:
# Split the data into training and testing sets
X_train_pos_tf, X_test_pos_tf, y_train_pos_tf, y_test_pos_tf = train_test_split(X_pos_tf, y_pos_tf, test_size=0.2, random_state=42)

# Initialize the model
svm_model = SVC(kernel='rbf', C=10, gamma=0.7)#, probability=True)

# Use the evaluate_model function to assess the model
results_table = evaluate_model_clean(svm_model, X_train_pos_tf, y_train_pos_tf, X_test_pos_tf, y_test_pos_tf,
                         results_table, model_name="SVM", parameters="", 
                         comments="POS + using TF-IDF")

In [None]:
results_table

# OK - TF-IDF - SVM ( standard + best param 1 and 2)

### Train - test

In [None]:
# Separate data into features (X) and labels (y)
X = df['comment_text']
y = df['toxic']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### SVM standart + TFIDF

In [None]:
# apply tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=8000, lowercase=True, stop_words='english')

X_train_vectorized = tfidf_vectorizer.fit_transform(X_train)
X_test_vectorized = tfidf_vectorizer.transform(X_test)

# Instantiate
svm_model = SVC(kernel='linear')#, probability=True)

# Fit, predict and evaluate
results_table = evaluate_model_clean(svm_model, X_train_vectorized, y_train, X_test_vectorized, y_test,results_table, parameters="", comments="SVM_tfidf" )

In [None]:
results_table

### SVM best param 1 + TFIDF

In [None]:
#Best Parameters 1: {'svm__C': 10, 'svm__gamma': 0.1, 'svm__kernel': 'rbf', 'tfidf__max_df': 0.85, 'tfidf__max_features': 3000, 'tfidf__min_df': 5}

# apply tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=3000, min_df=5, max_df=0.85, lowercase=True, stop_words='english')

X_train_vectorized = tfidf_vectorizer.fit_transform(X_train)
X_test_vectorized = tfidf_vectorizer.transform(X_test)

# Instantiate
svm_model = SVC(kernel='rbf', C=10, gamma=0.1)#, probability=True)

# Fit, predict and evaluate
results_table = evaluate_model_clean(svm_model, X_train_vectorized, y_train, X_test_vectorized, y_test,results_table, parameters="", comments="Best 1 SVM_tfidf" )

In [None]:
results_table

### SVM best param 2 + TFIDF

In [None]:
#Best Parameters 2: {'svm__C': 10, 'svm__gamma': 0.7, 'svm__kernel': 'rbf', 'tfidf__max_df': 0.85, 'tfidf__max_features': 3000, 'tfidf__min_df': 10}

# apply tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=3000, min_df=10, max_df=0.85, lowercase=True, stop_words='english')

X_train_vectorized = tfidf_vectorizer.fit_transform(X_train)
X_test_vectorized = tfidf_vectorizer.transform(X_test)

# Instantiate
svm_model = SVC(kernel='rbf', C=10, gamma=0.7)#, probability=True)

# Fit, predict and evaluate
results_table = evaluate_model_clean(svm_model, X_train_vectorized, y_train, X_test_vectorized, y_test,results_table, parameters="", comments="Best 2 SVM_tfidf" )

In [None]:
results_table

# OK - SVM + spacy vectorizer (takes too long to run)

In [None]:
corpus_vect = df['vector_spacy'].str.strip('[]').str.split(expand=True)
corpus_vect = corpus_vect.astype('float')

df['new_vector_spacy'] = corpus_vect.values.tolist()

In [None]:
X_sp = np.array(df['new_vector_spacy'].tolist())
y_sp = df['toxic']

# Split the data into training and testing sets
X_train_sp, X_test_sp, y_train_sp, y_test_sp = train_test_split(X_sp, y_sp, test_size=0.2, random_state=42)

# Initialize the model
svm_model = SVC(kernel='rbf', C=10, gamma=0.7)#, probability=True)

# Call the evaluate_model
results_table = evaluate_model_clean(svm_model, X_train_sp, y_train_sp, X_test_sp, y_test_sp, 
                                     results_table, model_name="SVM", parameters="", 
                                     comments="SVM - SpaCy vec")

In [None]:
results_table

# -------------------

# Working but not being used

## Bag of Words (baseline)

In [None]:
'''
#Fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)

# Prepare X_train for the function, transforming the different comments in the training data to a sparse matrix
X_train_vectorized = vect.transform(X_train)
# Prepare X_test for the function
X_test_vectorized = vect.transform(X_test)

# Instantiate
svm_model = SVC(kernel='rbf', probability=True, C=10, gamma=0.7,)

# Call the function and store the row in the variable result
results_table = evaluate_model(svm_model, X_train_vectorized, y_train, X_test_vectorized, y_test,results_table, parameters="", comments="Bag - Baseline" )
'''

## Bag of words Binary

In [None]:
'''#Fit the CountVectorizer to the training data
vect = CountVectorizer(binary=True).fit(X_train)

# Prepare X_train for the function, transforming the different comments in the training data to a sparse matrix
X_train_vectorized = vect.transform(X_train)
# Prepare X_test for the function
X_test_vectorized = vect.transform(X_test)

# Instantiate
svm_model = SVC(kernel='rbf', probability=True, C=10, gamma=0.7,)

# Call the function and store the row in the variable result
results_table = evaluate_model(svm_model, X_train_vectorized, y_train, X_test_vectorized, y_test,results_table, parameters="", comments="Bag - Binary" )'''

## Bag of Words (Binary + Stop Words)

In [None]:
'''stop_words = set(stopwords.words('english'))'''

In [None]:
'''#Fit the CountVectorizer to the training data
vect = CountVectorizer(binary=True, stop_words=list(stop_words)).fit(X_train)

# Prepare X_train for the function, transforming the different comments in the training data to a sparse matrix
X_train_vectorized = vect.transform(X_train)
# Prepare X_test for the function
X_test_vectorized = vect.transform(X_test)

# Instantiate
svm_model = SVC(kernel='rbf', probability=True, C=10, gamma=0.7,)

# Call the function and store the row in the variable result
results_table = evaluate_model(svm_model, X_train_vectorized, y_train, X_test_vectorized, y_test,results_table, parameters="", comments="Bag - Binary/StopWords" )'''

## Stemming(Bag of words)

In [None]:
'''# Initializing stemmer and countvectorizer
stemmer = nltk.PorterStemmer()
cv_analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    
    #In this function the text is first passed through the build_analyzer() and then each word in the text is stemmed to its base form
    
    return (stemmer.stem(w) for w in cv_analyzer(doc))

# define CountVectorizer with stemming function
stem_vectorizer = CountVectorizer(analyzer = stemmed_words)

# Prepare X_train for the function
X_train_stem_vectorized = stem_vectorizer.fit_transform(X_train)

# Prepare X_test for the function
X_test_stem_vectorized = stem_vectorizer.transform(X_test)

# Instantiate
svm_model = SVC(kernel='rbf', C=10, gamma=0.7, probability=True)

# Fit, predict and evaluate
results_table = evaluate_model(svm_model, X_train_stem_vectorized, y_train, X_test_stem_vectorized, y_test,results_table, parameters="", comments="Stem + bag + svm")'''

## Stemming(Bag of words(stopwords))

In [None]:
'''# Initializing stemmer and countvectorizer with Stop Words
stemmer = nltk.PorterStemmer()
cv_analyzer = CountVectorizer(stop_words=list(stop_words)).build_analyzer()

def stemmed_words(doc):
    
    #In this function the text is first passed through the build_analyzer() and then each word in the text is stemmed to its base form
    
    return (stemmer.stem(w) for w in cv_analyzer(doc))

# define CountVectorizer with stemming function
stem_vectorizer = CountVectorizer(analyzer = stemmed_words)

# Prepare X_train for the function
X_train_stem_vectorized = stem_vectorizer.fit_transform(X_train)

# Prepare X_test for the function
X_test_stem_vectorized = stem_vectorizer.transform(X_test)

# Instantiate
svm_model = SVC(kernel='rbf', C=10, gamma=0.7, probability=True)

# Fit, predict and evaluate
results_table = evaluate_model(svm_model, X_train_stem_vectorized, y_train, X_test_stem_vectorized, y_test,results_table, parameters="", comments="Stem + bag + stop")'''

## Stemming with TF - IDF and stopwords

In [None]:
'''stop_words = set(stopwords.words('english'))

# stop_words contains a list of 179 words that we want to remove from our comments

# Initializing stemmer and countvectorizer with Stop Words
stemmer = nltk.PorterStemmer()
tfidf_analyzer = TfidfVectorizer(min_df=30, stop_words=list(stop_words)).build_analyzer()

def stemmed_words(doc):
    
    #In this function the text is first passed through the build_analyzer() and then each word in the text is stemmed to its base form
    
    return (stemmer.stem(w) for w in cv_analyzer(doc))

# define CountVectorizer with stemming function
stem_vectorizer = CountVectorizer(analyzer = stemmed_words)

# Prepare X_train for the function
X_train_stem_vectorized = stem_vectorizer.fit_transform(X_train)

# Prepare X_test for the function
X_test_stem_vectorized = stem_vectorizer.transform(X_test)

# Instantiate
svm_model = SVC(kernel='rbf', C=10, gamma=0.7, probability=True)

# Fit, predict and evaluate
results_table = evaluate_model(svm_model, X_train_stem_vectorized, y_train, X_test_stem_vectorized, y_test,results_table, parameters="", comments="Stem - tfidf - stop")'''

## Lemmatization with Bag of Words

In [None]:
'''# Initialization
WNlemma = nltk.WordNetLemmatizer()
cv_analyzer = CountVectorizer().build_analyzer()

def lemmatize_word(doc):
    
    #In this function the text is first passed through the build_analyzer() and then each word in the text is stemmed to its base form
    
    return (WNlemma.lemmatize(t) for t in cv_analyzer(doc))

# define CountVectorizer with Lemmatization function
lemm_vectorizer = CountVectorizer(analyzer = lemmatize_word)

# Prepare X_train for the function
X_train_lemm_vectorized = lemm_vectorizer.fit_transform(X_train)
# Prepare X_test for the function
X_test_lemm_vectorized  = lemm_vectorizer.transform(X_test)

# Instantiate
svm_model = SVC(kernel='rbf', C=10, gamma=0.7, probability=True)

# Fit, predict and evaluate
results_table = evaluate_model(svm_model, X_train_lemm_vectorized, y_train, X_test_lemm_vectorized, y_test,results_table, parameters="", comments="lem + bag")

'''

## Lemmatization with TF-IDF

In [None]:
'''# Initialization
WNlemma = nltk.WordNetLemmatizer()
cv_analyzer = TfidfVectorizer(min_df=30).build_analyzer()

def lemmatize_word(doc):
    
    
    #In this function the text is first passed through the build_analyzer() and then each word in the text is stemmed to its base form
    
    return (WNlemma.lemmatize(t) for t in cv_analyzer(doc))

# define CountVectorizer with Lemmatization function
lemm_vectorizer = CountVectorizer(analyzer = lemmatize_word)

# Prepare X_train for the function
X_train_lemm_vectorized = lemm_vectorizer.fit_transform(X_train)
# Prepare X_test for the function
X_test_lemm_vectorized  = lemm_vectorizer.transform(X_test)

# Instantiate
svm_model = SVC(kernel='rbf', C=10, gamma=0.7, probability=True)

# Fit, predict and evaluate
results_table = evaluate_model(svm_model, X_train_lemm_vectorized, y_train, X_test_lemm_vectorized, y_test,results_table, parameters="", comments="lem - tfidf")
'''

## Lemm with StopWords

In [None]:
'''# Initialization

stop_words = set(stopwords.words('english'))

WNlemma = nltk.WordNetLemmatizer()
cv_analyzer = CountVectorizer(stop_words=list(stop_words)).build_analyzer()

def lemmatize_word(doc):
    
    #In this function the text is first passed through the build_analyzer() and then each word in the text is stemmed to its base form
    
    return (WNlemma.lemmatize(t) for t in cv_analyzer(doc))

# define CountVectorizer with Lemmatization function
lemm_vectorizer = CountVectorizer(analyzer = lemmatize_word)

# Prepare X_train for the function
X_train_lemm_vectorized = lemm_vectorizer.fit_transform(X_train)
# Prepare X_test for the function
X_test_lemm_vectorized  = lemm_vectorizer.transform(X_test)

# Instantiate
svm_model = SVC(kernel='rbf', C=10, gamma=0.7, probability=True)

# Fit, predict and evaluate
results_table = evaluate_model(svm_model, X_train_lemm_vectorized, y_train, X_test_lemm_vectorized, y_test,results_table, parameters="", comments="lem - stop")'''

## Word Vectors - Spacy library - Small

In [None]:
'''# This initialize a pre-trained model (the small version) that uses Neural Networks to build word vectors
nlp = spacy.load("en_core_web_sm")

# convert words into vectors and Prepare X_train for the function
docs = [nlp(text) for text in X_train]
X_train_word_vectors = [x.vector for x in docs]

# Prepare X_test for the function
docs_test = [nlp(text) for text in X_test]
X_test_word_vectors = [x.vector for x in docs_test]

# Instantiate
svm_model = SVC(kernel='rbf', C=10, gamma=0.7, probability=True)

# Fit, predict and evaluate
results_table = evaluate_model(svm_model, X_train_word_vectors, y_train, X_test_word_vectors, y_test,results_table, parameters="", comments="word_vectors_spacy_sm")'''

## Word Vectors - Spacy library - Medium

In [None]:
'''# convert vector string to df and cast all cols as float
corpus_vect = df['vector_spacy'].str.strip('[]').str.split(expand=True)
corpus_vect = corpus_vect.astype('float')'''

In [None]:
'''# This initialize a pre-trained model (the medium version) that uses Neural Networks to build word vectors
nlp = spacy.load("en_core_web_md")

# convert words into vectors and Prepare X_train for the function
docs = [nlp(text) for text in X_train]
X_train_word_vectors = [x.vector for x in docs]

# Prepare X_test for the function
docs_test = [nlp(text) for text in X_test]
X_test_word_vectors = [x.vector for x in docs_test]

# Instantiate
svm_model = SVC(kernel='rbf', C=10, gamma=0.7, probability=True)

# Fit, predict and evaluate
results_table = evaluate_model(svm_model, X_train_word_vectors, y_train, X_test_word_vectors, y_test,results_table, parameters="", comments="word_vectors_spacy_md")'''

In [None]:
'''# Separate data into features (X) and labels (y)
X = df['vector_spacy']
y = df['toxic']

# Split data into training and test sets
X_train_word_vectors, X_test_word_vectors, y_train, y_test = train_test_split(X.values, y, test_size=0.2, random_state=42)

# Instantiate
svm_model = SVC(kernel='rbf', C=10, gamma=0.7, probability=True)

# Fit, predict and evaluate
results_table = evaluate_model(svm_model, X_train_word_vectors, y_train, X_test_word_vectors, y_test,results_table, parameters="", comments="word_vectors_spacy_md")'''

## Word Vectors - Spacy library - Large

In [None]:
'''# This initialize a pre-trained model (the large version) that uses Neural Networks to build word vectors
nlp = spacy.load("en_core_web_lg")

# convert words into vectors and Prepare X_train for the function
docs = [nlp(text) for text in X_train]
X_train_word_vectors = [x.vector for x in docs]

# Prepare X_test for the function
docs_test = [nlp(text) for text in X_test]
X_test_word_vectors = [x.vector for x in docs_test]

# Instantiate
svm_model = SVC(kernel='rbf', C=10, gamma=0.7, probability=True)

# Fit, predict and evaluate
results_table = evaluate_model(svm_model, X_train_word_vectors, y_train, X_test_word_vectors, y_test,results_table, parameters="", comments="word_vectors_spacy_lg")'''

In [None]:
# results_table