In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, KFold, LeaveOneGroupOut
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from google.colab import files
import pandas as pd

# Open Colab's browser upload dialog; the returned mapping is keyed by the
# name of each uploaded file.
uploaded = files.upload()

# A cancelled dialog leaves the mapping empty. Fail with an explicit message
# instead of an opaque IndexError when picking the first filename below.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose the CSV file.")

# Only the first uploaded file is used; any additional uploads are ignored.
file_name = next(iter(uploaded))

# Colab saves the upload into the working directory under the same name,
# so the file can be read back by filename.
df = pd.read_csv(file_name)

# Basic preprocessing
# Small hand-picked set of function words removed before vectorization.
stopwords = {
    "a", "an", "the", "this", "is", "of", "with", "and", "or", "for",
    "on", "to", "in", "at", "as", "by", "it", "from",
}


def preprocess(text):
    """Normalize one sentence for bag-of-words vectorization.

    Steps: lowercase the text, delete every character that is neither a
    word character nor whitespace (so "well-known" becomes "wellknown"),
    then drop the tokens listed in ``stopwords``.

    Args:
        text: The raw sentence. ``None`` and float NaN (how pandas
            represents an empty CSV cell) are treated as missing.

    Returns:
        The remaining tokens joined by single spaces; ``""`` when the
        input is missing or nothing survives the filtering.
    """
    # NaN is the only float that compares unequal to itself, so `text != text`
    # detects it without needing an extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    return " ".join(word for word in text.split() if word not in stopwords)

# Cleaned sentence text is what the vectorizer will see.
df['processed'] = df['Sentence'].map(preprocess)
# Binary target: a Quality of exactly 1.0 becomes 'show', anything else 'tell'.
df['Label'] = df['Quality'].eq(1.0).map({True: 'show', False: 'tell'})

# Model input, target, and the per-row Plot_Name used as the grouping key.
X, y, groups = df['processed'], df['Label'], df['Plot_Name']


Saving data_stories_one_shot.csv to data_stories_one_shot.csv


In [2]:

# One seed shared by every stochastic component so that Restart & Run All
# reproduces the same scores.
SEED = 42

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Naive Bayes": MultinomialNB(),
    # Seeded: an unseeded forest yields different scores on every run.
    "Random Forest": RandomForestClassifier(random_state=SEED),
}

# Plain shuffled 5-fold split. It ignores `groups`, so rows that share a
# Plot_Name can land on both the training and the test side of a fold.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
cv_results = {}

for name, model in models.items():
    # The vectorizer sits inside the pipeline so that cross_val_score fits it
    # on each training fold only, never on the held-out rows.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', model)
    ])
    scores = cross_val_score(pipeline, X, y, cv=kf)
    cv_results[name] = scores

# One column per model, one row per fold, plus a final row of column means.
cv_df = pd.DataFrame(cv_results)
cv_df.loc['Mean'] = cv_df.mean()
cv_df


Unnamed: 0,Logistic Regression,SVM,Naive Bayes,Random Forest
0,0.884615,0.923077,0.884615,0.923077
1,0.884615,0.923077,0.884615,0.923077
2,0.769231,0.807692,0.769231,0.807692
3,0.961538,1.0,0.961538,1.0
4,0.923077,0.923077,0.923077,0.923077
Mean,0.884615,0.915385,0.884615,0.915385


In [3]:

# Leave-one-group-out: every fold holds out all rows of a single group
# (here the groups are the Plot_Name values).
logo = LeaveOneGroupOut()
logo_results = {}

for name, model in models.items():
    pipeline = Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', model)])
    # Handing over the splitter together with `groups` makes cross_val_score
    # produce the same folds as calling logo.split(X, y, groups=groups).
    scores = cross_val_score(pipeline, X, y, groups=groups, cv=logo)
    logo_results[name] = scores

# One column per model, one row per held-out group, plus a row of column means.
logo_df = pd.DataFrame(logo_results)
logo_df.loc['Mean'] = logo_df.mean()
logo_df


Unnamed: 0,Logistic Regression,SVM,Naive Bayes,Random Forest
0,0.9,1.0,0.9,1.0
1,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0
3,0.727273,0.818182,0.727273,0.818182
4,0.9,1.0,0.9,1.0
5,1.0,1.0,1.0,1.0
6,0.923077,0.923077,0.923077,0.923077
7,1.0,1.0,1.0,1.0
8,0.75,0.833333,0.75,0.833333
9,0.909091,0.909091,0.909091,0.909091
