In [1]:
# TITLE	Descriptive Statistics - Measures of Central Tendency and Variability

# PROBLEM STATEMENT /DEFINITION	Perform the following operations on any open-source dataset (e.g., data.csv)
# 1.	Provide summary statistics (mean, median, minimum, maximum, standard deviation) for a dataset (age, income, etc.)
#       with numeric variables grouped by one of the qualitative (categorical) variables. For example, if your categorical variable is age group and your quantitative variable is income,
#       then provide summary statistics of income grouped by age group. Create a list that contains a numeric value for each response to the categorical variable.
# 2.	Write a Python program to display some basic statistical details, such as percentiles,
#       mean, standard deviation, etc., for the species ‘Iris-setosa’, ‘Iris-versicolor’ and ‘Iris-virginica’ in the iris.csv dataset.



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load 'nba.csv' into a DataFrame; the path is relative, so the file must sit in
# the notebook's working directory.
df = pd.read_csv('nba.csv')

In [4]:
# Show the loaded frame as the cell output (pandas elides the middle rows of a long frame).
df

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns only.
df.describe()

Unnamed: 0,Number,Age,Weight,Salary
count,457.0,457.0,457.0,446.0
mean,17.678337,26.938731,221.522976,4842684.0
std,15.96609,4.404016,26.368343,5229238.0
min,0.0,19.0,161.0,30888.0
25%,5.0,24.0,200.0,1044792.0
50%,13.0,26.0,220.0,2839073.0
75%,25.0,30.0,240.0,6500000.0
max,99.0,40.0,307.0,25000000.0


In [6]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(458, 9)

In [7]:
# Data type pandas inferred for each column.
df.dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [8]:
# Summary statistics for the object-dtype (text) columns only: number of non-null
# values, number of distinct values, most frequent value and its frequency.
df.describe(include=['object'])

Unnamed: 0,Name,Team,Position,Height,College
count,457,457,457,457,373
unique,457,30,5,18,118
top,Avery Bradley,New Orleans Pelicans,SG,6-9,Kentucky
freq,1,19,102,59,22


In [10]:
# Summary statistics for every column, numeric and non-numeric together.
# A statistic that does not apply to a column's dtype is reported as missing (NaN).
df.describe(include='all')

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
count,457,457,457.0,457,457.0,457,457.0,373,446.0
unique,457,30,,5,,18,,118,
top,Avery Bradley,New Orleans Pelicans,,SG,,6-9,,Kentucky,
freq,1,19,,102,,59,,22,
mean,,,17.678337,,26.938731,,221.522976,,4842684.0
std,,,15.96609,,4.404016,,26.368343,,5229238.0
min,,,0.0,,19.0,,161.0,,30888.0
25%,,,5.0,,24.0,,200.0,,1044792.0
50%,,,13.0,,26.0,,220.0,,2839073.0
75%,,,25.0,,30.0,,240.0,,6500000.0


In [18]:
# Descriptive statistics for the 'Age' column: central tendency (mean, median,
# mode) followed by dispersion (min, max, range, variance, standard deviation).
age = df['Age']

# Central tendency.
mean = age.mean()
median = age.median()
mode = age.mode()  # always a Series, because several values can tie for most frequent

# Dispersion.
min_age, max_age = age.min(), age.max()
range_age = max_age - min_age
variance_age = age.var()   # pandas default is the sample variance (ddof=1)
std_dev_age = age.std()    # pandas default is the sample standard deviation (ddof=1)

# Report everything in a fixed order; only the first mode is shown if there are ties.
report = [
    ("Mean:", mean),
    ("Median:", median),
    ("Mode:", mode.iloc[0]),
    ("Minimum Age:", min_age),
    ("Maximum Age:", max_age),
    ("Range of Age:", range_age),
    ("Variance:", variance_age),
    ("Standard Deviation:", std_dev_age),
]
for label, value in report:
    print(label, value)

Mean: 26.938730853391686
Median: 26.0
Mode: 24.0
Minimum Age: 19.0
Maximum Age: 40.0
Range of Age: 21.0
Variance: 19.395360666436332
Standard Deviation: 4.404016424405833


In [19]:
# Position and shape of the 'Age' distribution: quartiles, interquartile range,
# skewness and kurtosis.
age = df['Age']

# Quartiles; Q2 is the median.
Q1, Q2, Q3 = (age.quantile(q) for q in (0.25, 0.5, 0.75))
IQR = Q3 - Q1  # width of the middle 50% of the values

# Shape of the distribution. pandas reports excess (Fisher) kurtosis, so a
# normal distribution scores 0.
skewness_age = age.skew()
kurtosis_age = age.kurt()

# The trailing space in the "Kurtosis: " label is kept so the printed text is unchanged.
for label, value in (
    ("Q1:", Q1),
    ("Q2 (Median):", Q2),
    ("Q3:", Q3),
    ("IQR:", IQR),
    ("Skewness:", skewness_age),
    ("Kurtosis: ", kurtosis_age),
):
    print(label, value)


Q1: 24.0
Q2 (Median): 26.0
Q3: 30.0
IQR: 6.0
Skewness: 0.6263487611614392
Kurtosis:  -0.05111908306655044


In [20]:
# Per-team salary summary: one row per team holding the mean, median, minimum,
# maximum and standard deviation of 'Salary'.
salary_by_team = df.groupby('Team')['Salary']
summary_stats = salary_by_team.agg(['mean', 'median', 'min', 'max', 'std'])
summary_stats = summary_stats.reset_index()  # move 'Team' from the index back to a regular column
print(summary_stats)

                      Team          mean     median        min         max  \
0            Atlanta Hawks  4.860197e+06  2854940.0   525093.0  18671659.0   
1           Boston Celtics  4.181505e+06  3021242.5  1148640.0  12000000.0   
2            Brooklyn Nets  3.501898e+06  1335480.0   134215.0  19689000.0   
3        Charlotte Hornets  5.222728e+06  4204200.0   189455.0  13500000.0   
4            Chicago Bulls  5.785559e+06  2380440.0   525093.0  20093064.0   
5      Cleveland Cavaliers  7.642049e+06  4975000.0   111196.0  22970500.0   
6         Dallas Mavericks  4.746582e+06  3950313.0   525093.0  16407500.0   
7           Denver Nuggets  4.294424e+06  2907000.0   258489.0  14000000.0   
8          Detroit Pistons  4.477884e+06  2891760.0   111444.0  16000000.0   
9    Golden State Warriors  5.924600e+06  3815000.0   289755.0  15501000.0   
10         Houston Rockets  5.018868e+06  2288205.0   200600.0  22359364.0   
11          Indiana Pacers  4.450122e+06  4000000.0   211744.0  

In [23]:
# Per-team descriptive statistics of 'Age' (count, mean, std, min, quartiles, max).
age_by_team = df.groupby('Team')['Age']
grouped_stats = age_by_team.describe()
print(grouped_stats)

# describe() yields one row per team, so the row count is the number of groups.
num_groups = grouped_stats.shape[0]
print("Number of groups: ", num_groups)

                        count       mean       std   min    25%   50%    75%  \
Team                                                                           
Atlanta Hawks            15.0  28.200000  4.229151  22.0  24.00  27.0  31.00   
Boston Celtics           15.0  24.733333  2.840188  20.0  22.00  25.0  27.00   
Brooklyn Nets            15.0  25.600000  3.018988  21.0  23.50  26.0  27.50   
Charlotte Hornets        15.0  26.133333  3.159265  21.0  23.50  27.0  28.50   
Chicago Bulls            15.0  27.400000  4.188419  21.0  24.50  27.0  30.50   
Cleveland Cavaliers      15.0  29.533333  4.120795  24.0  25.00  30.0  33.00   
Dallas Mavericks         15.0  29.733333  3.712271  22.0  28.00  31.0  31.50   
Denver Nuggets           15.0  25.733333  4.742915  20.0  22.00  25.0  28.00   
Detroit Pistons          15.0  26.200000  4.443294  20.0  23.00  25.0  28.50   
Golden State Warriors    15.0  27.666667  3.848314  20.0  25.50  28.0  30.50   
Houston Rockets          15.0  26.866667