### Statistical Analysis in Python with Datasets

##### Import Libraries

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from scipy import stats
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")

In [5]:
# Load the dataset CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where this exact file location exists.
csv_path = r"C:\Users\bevar\Downloads\medicine_Estimation_cost.csv"
df = pd.read_csv(csv_path)

In [6]:
# Display the loaded DataFrame; the rendered output elides the middle rows.
df

Unnamed: 0,AGE,SEX,BMI,STUDENTS,SMOKING,FINE
0,18,1,33.770,1,0,1725.55230
1,18,1,34.100,0,0,1137.01100
2,18,0,26.315,0,0,2198.18985
3,18,0,38.665,2,0,3393.35635
4,18,0,35.625,0,0,2211.13075
...,...,...,...,...,...,...
1333,64,0,31.825,2,0,16069.08475
1334,64,0,26.885,0,1,29330.98315
1335,64,1,26.410,0,0,14394.55790
1336,64,1,36.960,2,1,49577.66240


##### First Five Rows

In [7]:
# Preview the first five rows of the dataset.
df.head(n=5)

Unnamed: 0,AGE,SEX,BMI,STUDENTS,SMOKING,FINE
0,18,1,33.77,1,0,1725.5523
1,18,1,34.1,0,0,1137.011
2,18,0,26.315,0,0,2198.18985
3,18,0,38.665,2,0,3393.35635
4,18,0,35.625,0,0,2211.13075


### Mean

In [34]:
# Arithmetic mean of every column.
# NOTE(review): this cell's execution count (34) is out of sequence with the
# neighbouring cells (7 and 9), i.e. it was run out of order; re-run the
# notebook top to bottom to confirm the outputs still match.
df.mean()

AGE            39.207025
SEX             0.505232
BMI            30.663397
STUDENTS        1.094918
SMOKING         0.204783
FINE        13270.422265
dtype: float64

### Median

In [9]:
# Median (middle value) of every column.
df.median()

AGE           39.000
SEX            1.000
BMI           30.400
STUDENTS       1.000
SMOKING        0.000
FINE        9382.033
dtype: float64

### Mode

In [10]:
# Most frequent value of every column. mode() returns a DataFrame (not a
# Series) because a column may have several modes; here only row 0 is shown.
df.mode()

Unnamed: 0,AGE,SEX,BMI,STUDENTS,SMOKING,FINE
0,18,1,32.3,0,0,1639.5631


### Variance

In [11]:
# Variance of every column (sample variance: pandas defaults to ddof=1).
df.var()

AGE         1.974014e+02
SEX         2.501596e-01
BMI         3.718788e+01
STUDENTS    1.453213e+00
SMOKING     1.629689e-01
FINE        1.466524e+08
dtype: float64

### Standard Deviation

In [12]:
# Standard deviation of every column (sample version: pandas defaults to ddof=1).
df.std()

AGE            14.049960
SEX             0.500160
BMI             6.098187
STUDENTS        1.205493
SMOKING         0.403694
FINE        12110.011237
dtype: float64

### Kurtosis

In [13]:
# Kurtosis of every column. pandas reports excess (Fisher) kurtosis, so a
# normal distribution scores 0 and negative values mean lighter tails.
df.kurtosis()

AGE        -1.245088
SEX        -2.002557
BMI        -0.050732
STUDENTS    0.202454
SMOKING     0.145756
FINE        1.606299
dtype: float64

### Skewness

In [14]:
# Skewness of every column; positive values indicate a longer right tail.
df.skew()

AGE         0.055673
SEX        -0.020951
BMI         0.284047
STUDENTS    0.938380
SMOKING     1.464766
FINE        1.515880
dtype: float64

### Range

In [15]:
# Range of each column: largest value minus smallest value.
df.max().sub(df.min())

AGE            46.00000
SEX             1.00000
BMI            37.17000
STUDENTS        5.00000
SMOKING         1.00000
FINE        62648.55411
dtype: float64

### Inferential Statistics

In [18]:
# Pull out the BMI column as a Series for the inferential tests, then display it.
BMI=df["BMI"]
BMI

0       33.770
1       34.100
2       26.315
3       38.665
4       35.625
         ...  
1333    31.825
1334    26.885
1335    26.410
1336    36.960
1337    23.760
Name: BMI, Length: 1338, dtype: float64

In [19]:
# Hypothesised population mean, used as the null value of the one-sample t-test.
# NOTE(review): 0.03 is nowhere near the observed BMI values (sample mean is
# about 30.66), so the test rejects trivially and the p-value prints as 0.0.
# Confirm the intended reference value; this looks like a possible typo.
population_mean = 0.03

### One-Sample T-Test (T-Statistic and P-Value)

In [20]:
# One-sample t-test of the BMI sample against the hypothesised mean.
ttest_result = stats.ttest_1samp(BMI, popmean=population_mean)
t_stat = ttest_result.statistic
p_value = ttest_result.pvalue

In [21]:
# Report the t-test results, one value per line.
print(
    f"T-Statistic: {t_stat}",
    f"P-Value: {p_value}",
    sep="\n",
)

T-Statistic: 183.7479415238182
P-Value: 0.0


### Confidence Intervals

In [23]:
# Inputs for the confidence intervals: the standard error of the mean
# (sample standard deviation with ddof=1, divided by sqrt(n)) and the
# sample mean of BMI.
standard_error = stats.sem(BMI, ddof=1)
sample_mean = np.mean(BMI)

#### 95% Confidence Interval

In [26]:
# 95% interval for the mean BMI, taken from a normal distribution centred on
# the sample mean with the standard error as its scale (a z-interval).
bmi_mean_dist = stats.norm(loc=sample_mean, scale=standard_error)
confidence_interval = bmi_mean_dist.interval(0.95)

In [27]:
# Show the 95% interval as a (lower, upper) tuple.
print(f"95% Confidence Interval for BMI: {confidence_interval}")

95% Confidence Interval for BMI: (30.336642971534822, 30.990150750438275)


#### 99% Confidence Interval

In [43]:
# 99% interval for the mean BMI from the same normal model (a z-interval).
# This rebinds `confidence_interval`, replacing the 95% interval computed earlier.
confidence_interval = stats.norm(
    loc=sample_mean,
    scale=standard_error,
).interval(0.99)

In [44]:
# Show the 99% interval as a (lower, upper) tuple.
print(f"99% Confidence Interval for BMI: {confidence_interval}")

99% Confidence Interval for BMI: (30.233969458168303, 31.092824263804793)


### Regression Analysis

In [30]:
# Design matrix for the regression: the BMI column preceded by an intercept
# column named "const".
bmi_column = df['BMI']
X = sm.add_constant(bmi_column, prepend=True)

In [37]:
# Inspect the design matrix (const and BMI columns).
X

Unnamed: 0,const,BMI
0,1.0,33.770
1,1.0,34.100
2,1.0,26.315
3,1.0,38.665
4,1.0,35.625
...,...,...
1333,1.0,31.825
1334,1.0,26.885
1335,1.0,26.410
1336,1.0,36.960


In [31]:
# Response variable for the regression: the FINE column.
y = df['FINE']

In [38]:
# Inspect the response Series.
y

0        1725.55230
1        1137.01100
2        2198.18985
3        3393.35635
4        2211.13075
           ...     
1333    16069.08475
1334    29330.98315
1335    14394.55790
1336    49577.66240
1337    26926.51440
Name: FINE, Length: 1338, dtype: float64

In [41]:
# Fit an ordinary least squares regression of FINE (y) on const + BMI (X).
# NOTE(review): this cell's execution count (41) is higher than the display
# cells that follow it (39, 40), so those outputs may come from an earlier
# fit; re-run the notebook top to bottom to confirm.
ols_spec = sm.OLS(endog=y, exog=X)
model = ols_spec.fit()

In [39]:
# Displaying the fitted results object shows only its repr (type and memory
# address); the estimates themselves come from model.summary().
model

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x26b7de18a50>

In [40]:
# Full regression report: fit statistics, the coefficient table, and
# residual diagnostics.
model.summary()

0,1,2,3
Dep. Variable:,FINE,R-squared:,0.039
Model:,OLS,Adj. R-squared:,0.039
Method:,Least Squares,F-statistic:,54.71
Date:,"Sun, 08 Sep 2024",Prob (F-statistic):,2.46e-13
Time:,22:58:14,Log-Likelihood:,-14451.0
No. Observations:,1338,AIC:,28910.0
Df Residuals:,1336,BIC:,28920.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1192.9372,1664.802,0.717,0.474,-2072.974,4458.849
BMI,393.8730,53.251,7.397,0.000,289.409,498.337

0,1,2,3
Omnibus:,261.03,Durbin-Watson:,1.85
Prob(Omnibus):,0.0,Jarque-Bera (JB):,431.091
Skew:,1.297,Prob(JB):,2.45e-94
Kurtosis:,4.004,Cond. No.,160.0
