In [35]:
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from collections import defaultdict
import matplotlib.animation as animation

%matplotlib inline


mpl.rcParams['font.size'] = 15
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Helvetica'

plt.rcParams['axes.edgecolor']='#333F4B'
plt.rcParams['axes.linewidth']=0.8
plt.rcParams['xtick.color']='#333F4B'
plt.rcParams['ytick.color']='#333F4B'

sns.set(style="darkgrid", font_scale=1.5)

In [24]:
debate_filenames = ["july1_transcript.csv", "july2_transcript.csv",  \
    "september_transcript.csv", "october_transcript.csv", "november_transcript.csv", "december_transcript.csv", \
    "january_transcript.csv"]

candidates = ['Klobuchar', 'Yang', 'Sanders', 'Biden', 'Harris', \
            'Buttigieg', 'Booker', 'Warren', 'Steyer', 'Gabbard']

def load_debate_file(filename):
    debate_df = pd.read_csv(filename)
    
    regexed_time = debate_df['time'].str.extract(r'\((?P<hours>[0-9]*)[:]*(?P<mins>[0-9]*)[:](?P<seconds>[0-9]*)\)')
    
    def f(row):
        if len(row['mins']) == 0:
            # Means there is no hour. The column hour is actually minutes.
            val = int(row['hours']) * 60 + int(row['seconds'])
        else:
            # Means there is a hour column too.
            val = int(row['hours']) * 60 * 60 + int(row['mins']) * 60 + int(row['seconds'])
        return val
    
    debate_df['seconds_since_start'] = regexed_time.apply(f, axis=1)
    debate_df['duration'] = - debate_df['seconds_since_start'] + debate_df['seconds_since_start'].shift(-1)
    
    
    debate_df['flag'] = np.where(debate_df.name.str.contains('|'.join(candidates)),1,0)
    
    filtered_df = debate_df[debate_df['flag'] == 1]
    #filtered_df['short'] = filtered_df.name.str.contains('|'.join(candidates))
    
    def change_name(name):
        for c in candidates:
            if c in name:
                return c
        return name
    
    filtered_df['name'] = filtered_df['name'].apply(change_name)
    
    return filtered_df

In [36]:
mean_durations = []
total_durations = []


for debate_filename in enumerate(debate_filenames):
    current_df = load_debate_file("../data/" + debate_filename[1])
    
    mean_duration_df = current_df.groupby('name')['duration'].mean().to_dict()
    total_duration_df = current_df.groupby('name')['duration'].sum().to_dict()
    
    mean_durations.append(mean_duration_df)
    total_durations.append(total_duration_df)
    


all_candidates_mean_times = defaultdict(list)
all_candidates_total_times = defaultdict(list)
for i in [0, 2, 3, 4, 5, 6]:
    for candidate in candidates:
        
        if (i == 0) and (candidate not in mean_durations[i].keys()):
            if (candidate in mean_durations[1].keys()):
                all_candidates_mean_times[candidate].append(mean_durations[i + 1][candidate])
                all_candidates_total_times[candidate].append(total_durations[i + 1][candidate])
            else:
                all_candidates_mean_times[candidate].append(0)
                all_candidates_total_times[candidate].append(0)
        else:
            if candidate in mean_durations[i].keys():
                all_candidates_mean_times[candidate].append(mean_durations[i][candidate])
                all_candidates_total_times[candidate].append(total_durations[i][candidate])
            else:
                all_candidates_mean_times[candidate].append(0)
                all_candidates_total_times[candidate].append(0)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [123]:
from IPython.display import HTML

labels = ["July 30/31, 2019",  \
    "September 12, 2019", "October 15, 2019", "November 20, 2019", "December 19, 2019", \
    "January 14, 2020"]

def barlist(n): 
    return [times[n] for cand, times in all_candidates_mean_times.items()]

fig, ax = plt.subplots()
fig.set_figwidth(12)
fig.set_figheight(8)

n = len(all_candidates_mean_times['Sanders'])


x = [cand for cand, times in all_candidates_mean_times.items()]

barcollection = plt.bar(x, barlist(0))

def set_axis_style(ax, labels):
    ax.get_xaxis().set_tick_params(direction='out')
    ax.xaxis.set_ticks_position('bottom')
    ax.set_xticks(np.arange(0, len(labels)))
    ax.set_xticklabels(labels, rotation=90, ha='right')
    ax.set_xlim(0.25, len(labels) + 0.25)
    
    ax.set_ylabel("Mean Time (s)", fontsize=20)
    
    ax.set_yticks(np.arange(0, 50, 5))
    ax.set_yticklabels(np.arange(0, 50, 5))
    
    ax.set_ylim(0, 45)
    
def animate(i):
    y = barlist(i)
    ax.clear()
    set_axis_style(ax, candidates)
    ax.set_title(labels[i])
    for i, b in enumerate(barcollection):
        ax.vlines(x=i, ymin=0, ymax=y[i], color='#70c5ff', alpha=0.4, linewidth=20)
        ax.plot(i, y[i], "o", markersize=20, color='#007ACC', alpha=0.4)
    
    ax.plot([2.5], [45], "o", markersize=0, color='#007ACC', alpha=0)

ax.spines['top'].set_color('none')
ax.spines['right'].set_color('none')
ax.spines['left'].set_smart_bounds(True)
ax.spines['bottom'].set_smart_bounds(True)
    
ani = animation.FuncAnimation(fig, animate, repeat=True, blit=False, frames=n, interval=1000)

fig.tight_layout(rect=[0.1, 0.13, 1, 0.95])
plt.close()

HTML(ani.to_html5_video())


#ani.save('mean_times.mp4')
ani.save('mean_times.gif', writer='imagemagick', fps=3)



In [124]:
def barlist(n): 
    return [times[n] / 60 for cand, times in all_candidates_total_times.items()]


fig = plt.figure(figsize=(12, 8))
fig, ax = plt.subplots()
fig.set_figwidth(12)
fig.set_figheight(8)

n = len(all_candidates_total_times['Sanders'])

x = [cand for cand, times in all_candidates_total_times.items()]

barcollection = plt.bar(x, barlist(0))


def set_axis_style(ax, labels):
    ax.get_xaxis().set_tick_params(direction='out')
    ax.xaxis.set_ticks_position('bottom')
    ax.set_xticks(np.arange(0, len(labels)))
    ax.set_xticklabels(labels, rotation=90, ha='right')
    ax.set_xlim(0.25, len(labels) + 0.25)
    ax.set_ylabel("Total Time (s)", fontsize=20)
    
    ax.set_ylim(0, 25)
    
def animate(i):
    y = barlist(i)
    ax.clear()
    set_axis_style(ax, candidates)
    ax.set_title(labels[i])
    for i, b in enumerate(barcollection):
        #b.set_height(y[i])
        ax.vlines(x=i, ymin=0, ymax=y[i], color='#70c5ff', alpha=0.4, linewidth=20)
        ax.plot(i, y[i], "o", markersize=20, color='#007ACC', alpha=0.4)
        
    ax.plot([2.5], [25], "o", markersize=0, color='#007ACC', alpha=0)
        
ax.spines['top'].set_color('none')
ax.spines['right'].set_color('none')
ax.spines['left'].set_smart_bounds(True)
ax.spines['bottom'].set_smart_bounds(True)

ani = animation.FuncAnimation(fig, animate, repeat=True, blit=False, frames=n, interval=1000)


fig.tight_layout(rect=[0.1, 0.13, 1, 0.95])
plt.close()


# Show the animation
HTML(ani.to_html5_video())

ani.save('total_times.gif', writer='imagemagick', fps=3)

<Figure size 864x576 with 0 Axes>

In [None]:
barlist(1)

In [None]:
barlist(2)

In [112]:
np.arange(0, 50, 5)

array([ 0,  5, 10, 15, 20, 25, 30, 35, 40, 45])