In [28]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score
from time import time

In [16]:
transcriptions = pd.read_csv('transcriptions_with_sex.csv', sep=',')

In [30]:
for model_name in ["LogisticRegression", "SVC", "RandomForestClassifier", "GradientBoostingClassifier", "MultinomialNB", "MLPClassifier", "KNeighborsClassifier"]:
    start = time()

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(transcriptions['prediction'])
    y = transcriptions['sex']

    loo = LeaveOneOut()

    accuracies = []
    for train_index, test_index in loo.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model = eval(model_name)()
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)

    mean_accuracy = sum(accuracies) / len(accuracies)
    print(f"Mean Accuracy of {model_name}: {round(100*mean_accuracy,2)}% / Time spent : {round(time() - start,2)}s")


Mean Accuracy of LogisticRegression: 79.67% / Time spent : 4.99s
Mean Accuracy of SVC: 80.08% / Time spent : 5.27s
Mean Accuracy of RandomForestClassifier: 78.42% / Time spent : 111.95s
Mean Accuracy of GradientBoostingClassifier: 78.01% / Time spent : 593.37s
Mean Accuracy of MultinomialNB: 78.84% / Time spent : 1.01s




In [23]:
type(model).__name__

'LogisticRegression'