In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [2]:
# Source: the 'Cleaned_Super_Hero' sheet of a Google Sheets workbook, requested as CSV.
url = 'https://docs.google.com/spreadsheets/d/1EvADR_JB1Y7tQDGvu-5aBoCEPPS-Dqyca3YY_Jjakdw/gviz/tq?tqx=out:csv&sheet=Cleaned_Super_Hero'

# Read the sheet into a DataFrame, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height,Weight,...,Weather Control,Omnipresent,Omniscient,Hair Manipulation,Nova Force,Odin Force,Phoenix Force,Intuitive aptitude,Melting,Changing Armor
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441,...,False,False,False,False,False,False,False,False,False,False
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65,...,False,False,False,False,False,False,False,False,False,False
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0,90,...,False,False,False,False,False,False,False,False,False,False
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0,441,...,False,False,False,False,False,False,False,False,False,False
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0,122,...,False,False,False,False,False,False,False,False,False,False


Explore the data:
How many heroes are in each group (with and without super strength)?

In [3]:
# Tally how many rows have 'Super Strength' set to True vs. False
df.loc[:, 'Super Strength'].value_counts()

True     274
False    189
Name: Super Strength, dtype: int64

Split the data into two groups

In [4]:
# Split df into two groups on the 'Super Strength' flag.
# .copy() gives each group its own data, independent of df.
strength_df = df[df['Super Strength'].eq(True)].copy()
no_strength_df = df[df['Super Strength'].eq(False)].copy()

define our feature of interest.

In [5]:
# Feature of interest: the 'Weight' column of each group
strength_weight, no_strength_weight = strength_df['Weight'], no_strength_df['Weight']

## Check for and remove outliers

We will check each group separately for outliers. We will start with the strength_weight group.

In [6]:
# Strength group: z-score every weight and flag those with |z| > 3
zscores = stats.zscore(strength_weight)
outliers = np.abs(zscores) > 3

# Number of flagged observations
outliers.sum()

6

remove the outliers from strength

In [7]:
# Remove outliers from the strength group: keep only weights with |z| < 3.
# The filter is built from the unfiltered strength_df['Weight'] column rather
# than from strength_weight itself, so this cell is idempotent: re-running it
# cannot strip a second round of points using z-scores of already-trimmed data.
strength_weight = strength_df['Weight'].loc[lambda w: np.abs(stats.zscore(w)) < 3]

check no strength weight for outliers

In [8]:
# No-strength group: z-score every weight and flag those with |z| > 3
zscores = stats.zscore(no_strength_weight)
outliers = np.abs(zscores) > 3

# Number of flagged observations
outliers.sum()

1

remove no strength outliers

In [9]:
# Remove outliers from the no-strength group: keep only weights with |z| < 3.
# The filter is built from the unfiltered no_strength_df['Weight'] column rather
# than from no_strength_weight itself, so this cell is idempotent: re-running it
# cannot strip a second round of points using z-scores of already-trimmed data.
no_strength_weight = no_strength_df['Weight'].loc[lambda w: np.abs(stats.zscore(w)) < 3]

Check for normality

In [10]:
# Normality test for the strength group's weights
# (a small p-value is evidence against a normal distribution)
result_strength_weight = stats.normaltest(strength_weight, axis=0)
result_strength_weight

NormaltestResult(statistic=105.002416481251, pvalue=1.5813025374989874e-23)

In [11]:
# Normality test for the no-strength group's weights
# (a small p-value is evidence against a normal distribution)
result_no_strength_weight = stats.normaltest(no_strength_weight, axis=0)
result_no_strength_weight

NormaltestResult(statistic=10.034466263303866, pvalue=0.006622825869229763)

The p-values for both groups are well below 0.05, which means neither group's weights are normally distributed. However, our sample sizes are large enough to proceed without satisfying this assumption: for an independent t-test, 15 or more observations per group is considered sufficient, and each of our groups has well over 100.

Check for equal Variance

In [12]:
# Levene's test for equal variances between the two weight samples
# (center='median' is the function's default, spelled out here)
result = stats.levene(
    strength_weight,
    no_strength_weight,
    center='median',
)
result

LeveneResult(statistic=54.1012395290633, pvalue=8.971388915444231e-13)

Looks like we don't have equal variances, but that won't stop us! We just need to be sure to include "equal_var = False" when we perform our t-test.



## Perform and evaluate the t-test

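We will perform the independent t-test. Because the groups do not have equal variances, we pass `equal_var=False`, which runs Welch's version of the test.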

In [13]:
# Independent two-sample t-test on weight; equal_var=False selects
# Welch's t-test, which does not assume equal group variances.
result = stats.ttest_ind(
    strength_weight,
    no_strength_weight,
    equal_var=False,
)
result

Ttest_indResult(statistic=9.12425300025232, pvalue=1.0881200502443155e-17)