In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind

In [None]:
# Path to the cleaned dataset, expected in the same folder as 'eda.ipynb' (presumably this notebook).
# realpath() resolves the relative name against the current working directory, so run the
# notebook from the folder that contains it. The components are passed to os.path.join
# separately so the platform's separator is used and no doubled separator is produced.
fp = os.path.join(os.path.dirname(os.path.realpath('eda.ipynb')), 'cleaned_runway.csv')

In [None]:
# Load the CSV at `fp` into the frame used by every later cell, then preview the first rows
runway = pd.read_csv(fp)
runway.head()

In [None]:
# dtype of every column
runway.dtypes

In [None]:
# (number of rows, number of columns)
runway.shape

In [None]:
# number of missing values in each column
runway.isna().sum()

In [None]:
# NLP techniques can be used on review_summary and review_text!

In [None]:
# summary statistics (count, mean, std, quartiles, min/max) as produced by DataFrame.describe()
runway.describe()

In [None]:
# Candidate label outcome #1: the clothing category.
# Print how many distinct categories exist, then display the row count of each one.
category_col = runway['category']
print(category_col.nunique())
category_col.value_counts()
# with this many categories, the attribute may be better used in something like word2vec
# than as the label outcome itself

In [None]:
# 68 unique categories would be difficult to predict, so explore other categorical attributes
runway['rented for'].value_counts()
# only 8 categories! For simplicity, we'll attempt to build a model to predict "rented for"!

In [None]:
# Bar chart of the share of rows belonging to each "rented for" value
rented_for_share = runway['rented for'].value_counts(normalize=True)
rented_for_share.plot.bar(title="Rented For Proportions")

In [None]:
# are there general differences in ages between rentals for different event types?
# (one box of 'age' per "rented for" value)
runway.boxplot(column = 'age', by = 'rented for')
# very slight differences in ages of renters, likely not worth keeping in model dev

In [None]:
# Pearson chi-square test of independence between age group and "rented for".
# chi2_contingency expects a table of observed counts, so ages are binned into
# quantile groups and cross-tabulated against the event type (feeding it group
# means instead of counts does not give a meaningful p-value).
n_age_bins = 4
# quantile bins keep every bin populated; duplicates='drop' merges bins whose edges coincide
age_groups = pd.qcut(runway['age'], q=n_age_bins, duplicates='drop')
age_by_event = pd.crosstab(runway['rented for'], age_groups)
stat, p, dof, expected = chi2_contingency(age_by_event)
p
# NOTE(review): a small p-value means the age-group mix differs between event types;
# re-run this cell and compare the result with the boxplot impression before dropping age

In [None]:
# it might make more sense to test each event type against all others using hypothesis testing
def hypothesis_test(att, event_type, alpha_level, data=None):
    '''
    runs Welch's t-test (unequal variances) on a numeric attribute, comparing the rows
    of one event type with the rows of all other event types combined, and prints
    the p-value together with the decision at the given significance level.
    Note that the t-test compares the two group means.

    att: name of the numeric column to compare
    event_type: "rented for" value tested against all others
    alpha_level: significance level; the null hypothesis is rejected when p < alpha_level
    data: optional DataFrame holding a 'rented for' column and the `att` column;
          defaults to the notebook-level `runway` frame

    returns None; results are printed
    '''
    if data is None:
        data = runway
    is_event = data['rented for'] == event_type
    # ttest_ind propagates NaN by default, which would yield a NaN p-value that
    # silently falls into the "do not reject" branch, so drop missing values first
    filtered = data.loc[is_event, att].dropna()
    others = data.loc[~is_event, att].dropna()
    p_val = ttest_ind(filtered, others, equal_var = False).pvalue
    print(event_type + " vs. all other event types for " + att)
    print("-------------------------------------")
    print("p-value: " + str(p_val))
    if np.isnan(p_val):
        # e.g. a group with too few non-missing values: no decision can be made
        print("p-value is undefined (too few non-missing values); cannot test\n")
    elif p_val < alpha_level:
        print("reject the null hypothesis; likely different distributions\n")
    else:
        print("do not reject null hypothesis; likely same distributions\n")

In [None]:
# distinct "rented for" values; iterated by the per-event tests and plots in later cells
rented_for = runway['rented for'].unique()

In [None]:
# Welch t-test of age for each event type against all other event types, at alpha = 0.01
for event in rented_for:
    hypothesis_test('age', event, 0.01)
# significance seen in almost all event types

In [None]:
# do people who rent for certain events tend to wear larger/smaller sizes?
# (one box of 'size' per "rented for" value)
runway.boxplot(column = 'size', by = 'rented for')
# There are large enough visual differences to motivate including size in model

In [None]:
# Welch t-test of size for each event type against all other event types, at alpha = 0.01
for event in rented_for:
    hypothesis_test('size', event, 0.01)

In [None]:
# one box of 'height' per "rented for" value
runway.boxplot(column = 'height', by = 'rented for')
# heights look very similar

In [None]:
# Welch t-test of height for each event type against all other event types, at alpha = 0.01
for event in rented_for:
    hypothesis_test('height', event, 0.01)

In [None]:
# one box of 'weight' per "rented for" value
runway.boxplot(column = 'weight', by = 'rented for')
# weights also look very similar

In [None]:
# Welch t-test of weight for each event type against all other event types, at alpha = 0.01
for event in rented_for:
    hypothesis_test('weight', event, 0.01)

In [None]:
# One scatter panel per event type: renters' weight (x axis) against height (y axis).
# The same tick positions are used on every panel so the panels are comparable.
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(15, 12))
height_ticks = list(range(55, 80, 5))
weight_ticks = list(range(50, 350, 50))
for ax, event in zip(axs.ravel(), rented_for):
    subset = runway.loc[runway['rented for'] == event]
    ax.scatter(subset['weight'], subset['height'])
    ax.set_title(event + " rental weights vs. heights")
    ax.set_yticks(height_ticks)
    ax.set_xticks(weight_ticks)
    ax.set_xlabel('weight')
    ax.set_ylabel('height')
# several event types show a visibly wider spread, although that may be driven by outliers

In [None]:
# how much of each type of clothing is rented for each event type?
# rows: "rented for" values, columns: clothing categories; each cell counts the
# non-null 'fit' entries, and combinations that never occur are filled with 0
clothing_event = runway.groupby(['rented for', 'category'])['fit'].count().unstack().fillna(0)
clothing_event

In [None]:
# normalise each clothing-category column by its column total: for every clothing type,
# this gives the share of its rentals that went to each event type
# (clothing_event.sum() sums over the event-type rows, so the division is column-wise)
clothing_props = clothing_event / clothing_event.sum()
clothing_props

In [None]:
# which clothing types don't have any proportion above the threshold for any event type?
threshold = 0.3
# True for a clothing type (column) when every one of its event-type proportions is below
# the threshold; the mask is computed once and then used to keep only the True entries
all_below = (clothing_props < threshold).all()
all_below[all_below]

In [None]:
# if the event-type proportions of these clothing types are too uniform,
# it might be worth converting them to an "other" clothing type
clothing_props['shift']
# consists mainly of 2 event types

In [None]:
# event-type proportions for the 'skirts' clothing type
clothing_props['skirts']
# consists of 4 event types split evenly, this is not so bad

In [None]:
# how has renting for each event increased/decreased over the years?
# A single groupby covers all event types. unstack() moves the years into columns and
# fill_value=0 fills event/year combinations without rentals, so every event type gets a
# series of the same length without padding individual entries by position.
year_counts = runway.groupby(['rented for', 'year'])['fit'].count().unstack(fill_value=0)
# the years observed in the data, in sorted order
yrs = list(year_counts.columns)
vals = [year_counts.loc[event_name].values for event_name in rented_for]
for event_name, counts in zip(rented_for, vals):
    plt.plot(yrs, counts, label = event_name)
plt.title("Counts per Year for Event Types")
plt.legend()
plt.show()
# rentals for certain events jump and dip in certain years (more/less rented)
# this could be useful as a feature in model dev

In [None]:
# Monthly rental counts, one bar chart per event type
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(15, 12))
month_counts = runway.groupby(['rented for', 'month']).count().reset_index()
month_ticks = list(range(1, 13))
for ax, event in zip(axs.ravel(), rented_for):
    is_event = month_counts['rented for'] == event
    # non-null 'fit' count per month for this event type
    per_month = month_counts.loc[is_event].set_index('month')['fit']
    ax.bar(per_month.index, per_month.values)
    ax.set_title(event + " rental month counts")
    ax.set_xticks(month_ticks)
    ax.set_xlabel("month number")
# clothing for certain event types appears to be rented more often during certain months,
# therefore month is a useful attribute

In [None]:
# In order to test if distributions are significantly different, we'll use
# a modified form of permutation testing
def _group_proportions(frame, key):
    # share of the non-null 'category' rows of `frame` that fall into each value of `key`
    counts = frame.groupby(key)['category'].count()
    return counts / counts.sum()


def _total_abs_difference(props_a, props_b):
    # sum of absolute differences; a value present on only one side counts as proportion 0
    return props_a.sub(props_b, fill_value=0).abs().sum()


def permutation_test(n_trials, att, event_type, alpha_level, data=None, random_state=None, show_plots=True):
    '''
    tests proportional distribution of an attribute for one event type against the
    proportional distribution of that attribute for all other event types combined
    to test if distributions are significantly different

    The test statistic is the sum of absolute differences between the two sets of
    proportions. The p-value is the fraction of permutations (attribute values shuffled
    across all rows) whose statistic is at least as large as the observed one.

    n_trials: number of permutation iterations
    att: attribute to test distributions between event types
    event_type: event type to test against all others
    alpha_level: significance level; the null hypothesis is rejected when p < alpha_level
    data: optional DataFrame with 'rented for', 'category' and `att` columns;
          defaults to the notebook-level `runway` frame
    random_state: optional seed (or anything accepted by np.random.default_rng) to make
          the shuffles reproducible
    show_plots: when True (default), draw the two observed distributions side by side

    returns None; results are printed
    '''
    if data is None:
        data = runway
    rng = np.random.default_rng(random_state)
    # hoisted out of the trial loop: the event mask and the only columns that are needed
    is_event = (data['rented for'] == event_type).to_numpy()
    base = data[['rented for', 'category']]
    att_values = data[att].to_numpy()

    def split_proportions(values):
        # proportions of `values` within the event type and within everything else
        frame = base.assign(p=values)
        return _group_proportions(frame[is_event], 'p'), _group_proportions(frame[~is_event], 'p')

    filtered_props, not_event_props = split_proportions(att_values)
    if show_plots:
        fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(6, 3))
        axs[0].bar(filtered_props.index, filtered_props.values)
        # title with the event type being tested (the function argument, not a notebook global)
        axs[0].set_title(event_type)
        axs[1].bar(not_event_props.index, not_event_props.values)
        axs[1].set_title("other events")
        plt.show()
    # calculate observed sum of absolute differences
    obs_error = _total_abs_difference(filtered_props, not_event_props)
    trials = [0] * n_trials
    for i in range(n_trials):
        # shuffle the attribute across all rows, breaking any link with the event type
        permuted_event, permuted_other = split_proportions(rng.permutation(att_values))
        trials[i] = _total_abs_difference(permuted_event, permuted_other)
    print(event_type + " vs. all other event types for " + att)
    print("-------------------------------------")
    p_val = np.count_nonzero(np.array(trials) >= obs_error) / n_trials
    print("p-value: " + str(p_val))
    if p_val < alpha_level:
        print("reject the null hypothesis; likely different distributions")
    else:
        print("do not reject null hypothesis; likely same distributions")
    

In [None]:
# permutation test of the month distribution for each event type against all others
# (200 permutations per event type, alpha = 0.01)
for event in rented_for:
    permutation_test(200, 'month', event, 0.01)

In [None]:
# In theory the rating should not depend on the event type that was rented for, but this
# rental company may, for example, have worse quality for certain clothing types.
# Draw one histogram of ratings per event type to check.
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(15, 12))
rating_ticks = list(range(0, 11))
for ax, event in zip(axs.ravel(), rented_for):
    ratings = runway.loc[runway['rented for'] == event, 'rating']
    ax.hist(ratings.values)
    ax.set_title(event + " rental ratings")
    ax.set_xticks(rating_ticks)
    ax.set_xlabel("rating")
# every event type shows a very similar distribution of ratings; rating is likely not important

In [None]:
# permutation test of the rating distribution for each event type against all others
# (200 permutations per event type, alpha = 0.01)
for event in rented_for:
    permutation_test(200, 'rating', event, 0.01)

In [None]:
# Body-type counts, one bar chart per event type
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(15, 12))
body_counts = runway.groupby(['rented for', 'body type']).count().reset_index()
for ax, event in zip(axs.ravel(), rented_for):
    is_event = body_counts['rented for'] == event
    # non-null 'fit' count per body type for this event type
    per_body_type = body_counts.loc[is_event].set_index('body type')['fit']
    ax.bar(per_body_type.index, per_body_type.values)
    ax.set_title(event + " rental body type counts")
# the distributions differ only very slightly, so body type is not worth including

In [None]:
# permutation test of the body type distribution for each event type against all others
# (200 permutations per event type, alpha = 0.01)
for event in rented_for:
    permutation_test(200, 'body type', event, 0.01)

In [None]:
# Bust-size counts, one bar chart per event type
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(15, 12))
bust_counts = runway.groupby(['rented for', 'bust size']).count().reset_index()
for ax, event in zip(axs.ravel(), rented_for):
    is_event = bust_counts['rented for'] == event
    # non-null 'fit' count per bust size for this event type
    per_bust_size = bust_counts.loc[is_event].set_index('bust size')['fit']
    ax.bar(per_bust_size.index, per_bust_size.values)
    ax.set_title(event + " rental bust size counts")

In [None]:
# permutation test of the bust size distribution for each event type against all others
# (200 permutations per event type, alpha = 0.01)
for event in rented_for:
    permutation_test(200, 'bust size', event, 0.01)