In [15]:
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np

In [51]:
def change_min_format(minute):
    if len(minute) <= 6:
        return "00:" + minute
    else:
        return minute

df = pd.read_csv('./Data/2020/us_election_2020_1st_presidential_debate.csv')
df.dropna(inplace=True)

#Parse 'minute' column to DateTime format
df['minute'] = df['minute'].apply(change_min_format)
df['minute'] = pd.to_datetime(df['minute'])

## Can we predict who says what given a speech in a debate?

In [122]:
df.speaker.value_counts()

President Donald J. Trump    314
Vice President Joe Biden     249
Chris Wallace                225
Name: speaker, dtype: int64

In [125]:
#Encode speaker to integer
speaker_dict = {'President Donald J. Trump':0, 'Vice President Joe Biden':1, 'Chris Wallace':2} 

def speaker_encoding(speaker):
    return speaker_dict[speaker]

df['speaker_enc'] = df['speaker'].apply(speaker_encoding)

In [157]:
# Training framework
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

tfidf = TfidfVectorizer(lowercase=True)

X = tfidf.fit_transform(df.text)
y = df['speaker_enc']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.1)

model = LinearSVC(multi_class='ovr', random_state=42)
model.fit(X_train, y_train)
(model.predict(X_test) == y_test).mean()

0.7341772151898734