In [74]:
# H0 : No significant difference in mean agility between the two periods.
# H1: Mean agility has significantly increased in the second period (2022-2023 to 2023-2024).

In [75]:
import pandas as pd
from scipy.stats import shapiro, ttest_ind , stats

In [76]:
# Load the data
data = pd.read_csv("D:/Bootcamp/mini project/top50players_data.csv")
data.head()

Unnamed: 0,rank,player_name,player_id,team_id,year,height_cm,weight_kg
0,1,James Harden,h/hardeja01,HOU,2020,196,99
1,2,Damian Lillard,l/lillada01,POR,2020,188,88
2,3,Devin Booker,b/bookede01,PHO,2020,198,93
3,4,Giannis Antetokounmpo,a/antetgi01,MIL,2020,211,109
4,5,Trae Young,y/youngtr01,ATL,2020,185,74


In [77]:
# Create a copy of the data to avoid modifying the original dataset
data_copy = data.copy()

In [78]:
# Filter data for top 25 players and exclude the year 2020
filtered_data = data_copy[(data_copy['rank'] <= 20) & (data_copy['year'] != 2020)]
filtered_data.head()

Unnamed: 0,rank,player_name,player_id,team_id,year,height_cm,weight_kg
50,1,Stephen Curry,c/curryst01,GSW,2021,188,83
51,2,Damian Lillard,l/lillada01,POR,2021,188,88
52,3,Nikola Jokić,j/jokicni01,DEN,2021,211,128
53,4,Bradley Beal,b/bealbr01,WAS,2021,193,93
54,5,Luka Dončić,d/doncilu01,DAL,2021,198,104


In [79]:
# Calculate agility as height divided by weight
filtered_data.loc[:, 'agility'] = filtered_data['height_cm'] / filtered_data['weight_kg']
filtered_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.loc[:, 'agility'] = filtered_data['height_cm'] / filtered_data['weight_kg']


Unnamed: 0,rank,player_name,player_id,team_id,year,height_cm,weight_kg,agility
50,1,Stephen Curry,c/curryst01,GSW,2021,188,83,2.26506
51,2,Damian Lillard,l/lillada01,POR,2021,188,88,2.136364
52,3,Nikola Jokić,j/jokicni01,DEN,2021,211,128,1.648438
53,4,Bradley Beal,b/bealbr01,WAS,2021,193,93,2.075269
54,5,Luka Dončić,d/doncilu01,DAL,2021,198,104,1.903846


In [80]:
# Divide data into two periods
period_1 = filtered_data[filtered_data['year'].between(2021, 2022)]  # 2020-2021 to 2021-2022
period_1.head()

Unnamed: 0,rank,player_name,player_id,team_id,year,height_cm,weight_kg,agility
50,1,Stephen Curry,c/curryst01,GSW,2021,188,83,2.26506
51,2,Damian Lillard,l/lillada01,POR,2021,188,88,2.136364
52,3,Nikola Jokić,j/jokicni01,DEN,2021,211,128,1.648438
53,4,Bradley Beal,b/bealbr01,WAS,2021,193,93,2.075269
54,5,Luka Dončić,d/doncilu01,DAL,2021,198,104,1.903846


In [81]:
period_2 = filtered_data[filtered_data['year'].between(2023, 2024)]  # 2022-2023 to 2023-2024
period_2.head()

Unnamed: 0,rank,player_name,player_id,team_id,year,height_cm,weight_kg,agility
150,1,Jayson Tatum,t/tatumja01,BOS,2023,203,95,2.136842
151,2,Joel Embiid,e/embiijo01,PHI,2023,213,127,1.677165
152,3,Luka Dončić,d/doncilu01,DAL,2023,198,104,1.903846
153,4,Shai Gilgeous-Alexander,g/gilgesh01,OKC,2023,198,90,2.2
154,5,Giannis Antetokounmpo,a/antetgi01,MIL,2023,211,109,1.93578


In [82]:
# Calculate mean agility for both periods
mean_agility_1 = period_1['agility'].mean()
mean_agility_2 = period_2['agility'].mean()

In [83]:
# Test for normality in both periods
shapiro_1 = shapiro(period_1['agility'])
print(f"Shapiro-Wilk for Period 1:{shapiro_1}")
shapiro_2 = shapiro(period_2['agility'])
print(f"Shapiro-Wilk for Period 2:", shapiro_2)

Shapiro-Wilk for Period 1:ShapiroResult(statistic=np.float64(0.9745229825531916), pvalue=np.float64(0.349908702863736))
Shapiro-Wilk for Period 2: ShapiroResult(statistic=np.float64(0.9765830979718725), pvalue=np.float64(0.4184945510825269))


In [88]:
# Print the normality test results
print(f"Shapiro-Wilk Test for Period 1:")

print(f"  Statistic: {shapiro_1.statistic:.4f}")

print(f"  P-value: {shapiro_1.pvalue:.4f}")

print(f"  Normality: {'Accepted' if shapiro_1.pvalue > 0.05 else 'Rejected'}")

print(f"Shapiro-Wilk Test for Period 2:")

print(f"  Statistic: {shapiro_2.statistic:.4f}")

print(f"  P-value: {shapiro_2.pvalue:.4f}")

print(f"  Normality: {'Accepted' if shapiro_2.pvalue > 0.05 else 'Rejected'}")

Shapiro-Wilk Test for Period 1:
  Statistic: 0.9745
  P-value: 0.3499
  Normality: Accepted
Shapiro-Wilk Test for Period 2:
  Statistic: 0.9766
  P-value: 0.4185
  Normality: Accepted


In [91]:
# Perform independent T-test to compare means of the two periods
t_stat, p_value = ttest_ind(period_1['agility'], period_2['agility'], equal_var=False)

# Print T-test results
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")

# Hypothesis Testing
# H0 (Null Hypothesis): There is no difference in the mean agility between the two periods.
# H1 (Alternative Hypothesis): There is a significant increase in agility in the second period compared to the first period.

# Check the hypothesis test result
if p_value < 0.05:
    print("The null hypothesis (H0) is rejected.")
else:
    print("The null hypothesis (H0) is not rejected.")

T-statistic: -0.476
P-value: 0.635
The null hypothesis (H0) is not rejected.
