In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Load the ticket-level records and the per-performance table.
df = pd.read_csv('ums_viz.csv')
# NOTE(review): unpickling can execute arbitrary code, so 'descriptions.pkl'
# must come from a trusted source. A pickle written by a different pandas
# version may also fail to load -- TODO confirm the writer's version.
df_perf = pd.read_pickle('descriptions.pkl')
# Rename the first column through the public API. Assigning into
# `columns.values` mutates the Index's backing array in place, which pandas
# does not support and which can leave label lookups stale.
df_perf = df_perf.rename(columns={df_perf.columns[0]: 'perf_name'})

# Strip surrounding whitespace from the join key on both tables so the merges
# below match; `.str.strip()` also passes missing values through untouched.
df['perf_name'] = df['perf_name'].str.strip()
df_perf['perf_name'] = df_perf['perf_name'].str.strip()

df['per_seat'] = df['tck_amt'] / df['num_seats']
df_group = df.groupby('perf_name')

# Add performance-level features to df_perf, one merge per aggregate.
# pd.merge defaults to an inner join, so a performance missing from either
# table is dropped from the result.
value_cols = ['tck_amt', 'num_seats', 'per_seat']

# mean ticket amount, number of seats, price per seat
# (columns are selected with a list; tuple selection on a groupby is deprecated)
df_tmp = df_group[value_cols].mean().add_prefix('mean_').reset_index()
df_perf = pd.merge(df_perf, df_tmp, on='perf_name')

# max ticket amount, number of seats, price per seat
df_tmp = df_group[value_cols].max().add_prefix('max_').reset_index()
df_perf = pd.merge(df_perf, df_tmp, on='perf_name')

# number of rows with a non-null num_seats per performance (a row count, not a
# sum of seats); the column name 'count_tck_amt' is kept for the plots below
df_tmp = df_group['num_seats'].count().reset_index()
df_tmp = df_tmp.rename(columns={'num_seats': 'count_tck_amt'})
df_perf = pd.merge(df_perf, df_tmp, on='perf_name')

# Show only the first rows instead of dumping the whole frame.
print(df_perf.head())

ImportError: No module named indexes.base

In [5]:
# Extract the term-count matrix. ngram_range=(1, 1) keeps single words only
# (unigrams); widen it to (1, 2) to include bigrams as well.
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# stop_words='english' selects scikit-learn's built-in English stop-word list.
# The string form is accepted by every release, whereas passing the frozenset
# itself is rejected by recent parameter validation.
vect = CountVectorizer(ngram_range=(1, 1), stop_words='english', min_df=5)
X = vect.fit_transform(df_perf.description.values)

# Parenthesised so the line runs on both Python 2 and Python 3.
print(X.shape)

NameError: name 'df_perf' is not defined

In [None]:
# Latent Dirichlet Allocation

from sklearn.decomposition import LatentDirichletAllocation

# `n_components` is the current name of the topic-count parameter; the older
# `n_topics` spelling was deprecated and later removed from scikit-learn.
# A fixed random_state makes the fitted topics, and therefore the topic labels
# assigned later, reproducible across kernel restarts.
lda = LatentDirichletAllocation(n_components=5, max_iter=10, random_state=0)
# y holds the fitted model's output for X; a later cell takes the argmax along
# axis 1 to pick one topic per row.
y = lda.fit_transform(X)

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: object whose ``components_`` attribute yields one weight
            vector per topic; each vector must support ``argsort``.
        feature_names: sequence mapping a column index to its word.
        n_top_words: how many words to print for each topic.

    Output is written to stdout: a "Topic #k:" header and one line of
    space-separated words per topic, followed by a single blank line.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending, so slice it backwards to take the
        # n_top_words largest weights, biggest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join([feature_names[i] for i in top_indices]))
    # A bare `print` emits a newline only as a Python 2 statement; on Python 3
    # it is a no-op expression. print("") emits the blank line on both.
    print("")

# Newer scikit-learn exposes the vocabulary through get_feature_names_out()
# and no longer provides get_feature_names(); use whichever this version has.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()
print_top_words(lda, feature_names, 20)

# Label each row of df_perf with the index of its largest value in y.
# This relies on y's rows lining up with df_perf's rows (X was built from
# df_perf.description in an earlier cell).
df_perf['topic'] = np.argmax(y, axis=1)

In [None]:
# Summary statistics of df_perf, split by assigned topic.
by_topic = df_perf.groupby('topic')
by_topic.describe()

## Plots

### Distribution of the number of performances for each topic

In [None]:
import seaborn as sns
# One entry per fitted topic, taken from the model so the plots stay in sync
# with the LDA cell instead of repeating its topic count by hand.
topic_range = range(lda.components_.shape[0])
# catplot is the current name of factorplot, and `height` of its `size`
# argument; the old spellings were deprecated and later removed from seaborn.
sns.catplot(x='topic', data=df_perf, kind='count', palette="BuPu", height=5, aspect=1.5, order=topic_range)

### Mean ticket amount per seat for each topic

In [None]:
# Box plot of mean_per_seat for each topic, with every row of df_perf drawn
# on top as a jittered point.
sns.set(style="ticks", palette="muted", color_codes=True)
# Arguments shared by both layers so they use the same axes mapping and order.
shared_kw = dict(x="topic", y="mean_per_seat", data=df_perf, order=topic_range)
ax = sns.boxplot(whis=np.inf, color="c", **shared_kw)
sns.stripplot(jitter=True, size=3, color=".3", linewidth=0, **shared_kw)

### Max ticket amount per seat for each topic

In [None]:
# Box plot of max_per_seat for each topic, with every row of df_perf drawn
# on top as a jittered point.
sns.set(style="ticks", palette="muted", color_codes=True)
# Arguments shared by both layers so they use the same axes mapping and order.
shared_kw = dict(x="topic", y="max_per_seat", data=df_perf, order=topic_range)
ax = sns.boxplot(whis=np.inf, color="c", **shared_kw)
sns.stripplot(jitter=True, size=3, color=".3", linewidth=0, **shared_kw)

### Number of ticket records per performance for each topic

In [None]:
# Box plot of count_tck_amt for each topic, with every row of df_perf drawn
# on top as a jittered point.
sns.set(style="ticks", palette="muted", color_codes=True)
# Arguments shared by both layers so they use the same axes mapping and order.
shared_kw = dict(x="topic", y="count_tck_amt", data=df_perf, order=topic_range)
ax = sns.boxplot(whis=np.inf, color="c", **shared_kw)
sns.stripplot(jitter=True, size=3, color=".3", linewidth=0, **shared_kw)

In [None]:
# Preview the first five rows of the enriched performance table.
df_perf.head(5)