In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Load the Jigsaw challenge training data from a CSV file into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
train_file_path = 'train_clean.csv'
train_df = pd.read_csv(train_file_path)

In [5]:
# Derive a 'non_toxic' flag: 1 when the six toxicity labels of a row sum to zero, 0 otherwise
train_df['non_toxic'] = (
    train_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
    .sum(axis=1)
    .eq(0)
    .astype(int)
)

In [10]:
# Targets: the six toxicity label columns, one column per category
y = train_df[
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
]

# Features: TF-IDF representation of the comment text, limited to 5000 features
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(train_df['comment'])

In [14]:
# Display the repr of the TF-IDF feature matrix (shape, dtype and sparsity summary)
X

<159571x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 5926788 stored elements in Compressed Sparse Row format>

In [1]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that come from a trusted source.

# Load the logistic regression model
with open('logreg_model.pkl', 'rb') as file:
    loaded_logreg_model = pickle.load(file)

# Load the SVM model
with open('svm_model.pkl', 'rb') as file:
    loaded_svm_model = pickle.load(file)

# Load the TF-IDF vectorizer (this pickled object, not a freshly fitted one,
# is what gets passed to classify_comment_into_df in this notebook)
with open('tfidf_vectorizer.pkl', 'rb') as file:
    loaded_tfidf_vectorizer = pickle.load(file)

In [13]:
def classify_comment_into_df(tfidf_vectorizer, logreg_model, svm_model, categories, comment=None):
    """Classify one comment with both models and tabulate the result per category.

    Parameters
    ----------
    tfidf_vectorizer : object
        Vectorizer exposing ``transform``; it is called with a one-element
        list containing the comment.
    logreg_model, svm_model : object
        Models exposing ``predict``. Only the first row of each prediction is
        used, and it must yield one value per category.
    categories : iterable
        Category names, in the same order as the models' outputs.
    comment : str, optional
        Text to classify. When ``None`` (the default) the user is prompted
        for it on standard input, which is the original interactive behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per category with the columns ``Category``,
        ``Logistic Regression`` and ``SVM``; the model columns hold booleans.

    Notes
    -----
    ``zip`` stops at the shortest input, so surplus categories or surplus
    predictions are silently left out of the table.
    """
    # Prompt only when no text was supplied; an explicit empty string is
    # classified as-is rather than triggering the prompt.
    if comment is None:
        comment = input("Enter your comment: ")

    # Vectorize with the supplied (already fitted) vectorizer
    features = tfidf_vectorizer.transform([comment])

    # Single input comment -> keep only the first prediction row of each model
    logreg_row = logreg_model.predict(features)[0]
    svm_row = svm_model.predict(features)[0]

    rows = [
        {'Category': category,
         'Logistic Regression': bool(logreg_flag),
         'SVM': bool(svm_flag)}
        for category, logreg_flag, svm_flag in zip(categories, logreg_row, svm_row)
    ]

    # Explicit columns keep the table's header even when there are no rows
    return pd.DataFrame(rows, columns=['Category', 'Logistic Regression', 'SVM'])

# Demo: classify one interactively entered comment with the pickled artifacts
categories = y.columns  # toxicity category names, taken from the label frame

predictions_df = classify_comment_into_df(
    tfidf_vectorizer=loaded_tfidf_vectorizer,
    logreg_model=loaded_logreg_model,
    svm_model=loaded_svm_model,
    categories=categories,
)
predictions_df

Enter your comment: you mother fucker, fuck you 


Unnamed: 0,Category,Logistic Regression,SVM
0,toxic,True,True
1,severe_toxic,True,True
2,obscene,True,True
3,threat,False,False
4,insult,True,True
5,identity_hate,False,False
