In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score
from time import time

In [3]:
# Load the comma-separated transcription table from the project's data folder
# into a DataFrame.
transcriptions = pd.read_csv(
    filepath_or_buffer='data/transcriptions_with_sex.csv',
    sep=',',
)

In [5]:
# Compare several scikit-learn classifiers on predicting the 'sex' column from
# the TF-IDF representation of the 'prediction' text column, scoring each one
# with leave-one-out cross-validation and printing its mean accuracy and the
# wall-clock time spent.

# Fixed seed so the stochastic ensemble models give the same score on every run.
SEED = 42

# Explicit (display name, factory) table instead of eval() on class-name
# strings: no dynamic code execution, a typo fails immediately with a NameError
# on this line, and individual models can receive constructor arguments.
model_factories = [
    ("LogisticRegression", LogisticRegression),
    ("SVC", SVC),
    ("RandomForestClassifier", lambda: RandomForestClassifier(random_state=SEED)),
    ("GradientBoostingClassifier", lambda: GradientBoostingClassifier(random_state=SEED)),
    ("MultinomialNB", MultinomialNB),
    ("KNeighborsClassifier", KNeighborsClassifier),
]

# Features, labels and the splitter do not depend on the model, so they are
# built once here rather than once per model. As a consequence the reported
# time covers the cross-validation only, not the TF-IDF fitting.
# NOTE(review): the vectorizer is fitted on every row, including the row that
# is held out in each fold, so its vocabulary/IDF weights have seen the test
# document. Fitting it inside each fold would remove that leakage - confirm
# whether that matters for this comparison.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(transcriptions['prediction'])
y = transcriptions['sex']
loo = LeaveOneOut()

for model_name, make_model in model_factories:
    start = time()

    accuracies = []
    for train_index, test_index in loo.split(X):
        X_train, X_test = X[train_index], X[test_index]
        # The splitter yields row positions, so select labels positionally;
        # plain y[...] would look the integers up as index labels and break
        # (or misalign) as soon as the frame has a non-default index.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # A fresh, unfitted estimator for every fold.
        model = make_model()
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        # With a single held-out sample this is 1.0 (hit) or 0.0 (miss).
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)

    mean_accuracy = sum(accuracies) / len(accuracies)
    print(f"Précision de {model_name}: {round(100*mean_accuracy,2)}% / Temps: {round(time() - start,2)}s")


Précision de LogisticRegression: 79.67% / Temps: 4.19s
Précision de SVC: 80.08% / Temps: 2.97s
Précision de RandomForestClassifier: 78.01% / Temps: 104.03s
Précision de GradientBoostingClassifier: 78.42% / Temps: 511.11s
Précision de MultinomialNB: 78.84% / Temps: 0.87s
Précision de KNeighborsClassifier: 77.59% / Temps: 1.76s
