In [3]:
#Confidence Intervals Using the t Distribution
#The following example shows how to calculate a confidence interval for the true population mean height (in inches) of a certain species of plant, using a sample of 15 plants:
#If we’re working with a small sample (n <30), we can use the t.interval() function from the scipy.stats library to calculate a confidence interval for a population mean.

import numpy as np
import scipy.stats as st

# Heights (in inches) recorded for the 15 sampled plants.
data = [12, 12, 13, 13, 15, 16, 17, 22, 23, 25, 26, 27, 28, 28, 29]

# 95% confidence interval for the population mean height. Because n < 30, a
# Student t distribution with n - 1 degrees of freedom is frozen around the
# sample mean (loc) and standard error (scale) and asked for its central 95%
# interval. Raising the confidence level widens the interval.
st.t(df=len(data) - 1, loc=np.mean(data), scale=st.sem(data)).interval(0.95)



(17.408981355522524, 23.391018644477473)

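That is, we can be 95% confident that the true population mean height of the plants lies between 16.758 inches and 24.042 inches; these are the bounds the code above produces with `confidence = 0.95`. Note that the output displayed above, (17.409, 23.391), matches the narrower 90% interval and appears to be left over from an earlier run, so re-run the cell to refresh it.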

In [5]:
#Confidence Intervals Using the Normal Distribution
#If we’re working with larger samples (n≥30), we can assume that the sampling distribution of the sample mean is normally distributed
#(thanks to the Central Limit Theorem) and can instead use the norm.interval() function from the scipy.stats library.


# Reproducible sample: 50 integers drawn from the half-open range [10, 30).
np.random.seed(0)
data = np.random.randint(low=10, high=30, size=50)

# 95% normal-based confidence interval for the population mean, centered on
# the sample mean with the standard error of the mean as the scale.
st.norm(loc=np.mean(data), scale=st.sem(data)).interval(0.95)


(17.695874465475686, 20.78412553452431)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the "Data" sheet of the home-price workbook into a DataFrame.
# Given a path, pandas opens and closes the workbook itself, so no file
# handle is left open after the read.
df = pd.read_excel("Descriptive-statistics-HomePrice.xlsx", sheet_name="Data")

# Preview the first rows to confirm the columns were parsed as expected.
df.head()

  warn(msg)


Unnamed: 0,ID,Building,Year of sale,Month of sale,Type of property,Property #,Area (ft.),Price,Status,Customer ID,...,Y,M,D,Gender,Country,State,Purpose,Deal satisfaction,Mortgage,Source
0,1030,1,2005.0,11.0,Apartment,30,743.0856,246172.676,Sold,C0028,...,1986.0,6.0,21.0,F,USA,California,Home,5.0,No,Website
1,1029,1,2005.0,10.0,Apartment,29,756.2128,246331.904,Sold,C0027,...,1983.0,2.0,24.0,F,USA,California,Home,5.0,No,Website
2,2002,2,2007.0,7.0,Apartment,2,587.2808,209280.9104,Sold,C0112,...,1985.0,12.0,27.0,M,USA,California,Home,1.0,Yes,Client
3,2031,2,2007.0,12.0,Apartment,31,1604.7464,452667.0064,Sold,C0160,...,1985.0,12.0,27.0,M,USA,California,Investment,3.0,Yes,Website
4,1049,1,2004.0,11.0,Apartment,49,1375.4508,467083.3132,Sold,C0014,...,1979.0,5.0,15.0,F,USA,California,Home,4.0,No,Agency


In [None]:
# 99% t-based confidence interval for the mean of the 'Price' column:
# n - 1 degrees of freedom, centered on the sample mean, scaled by the
# standard error of the mean.
data = df['Price']
st.t(df=data.size - 1, loc=data.mean(), scale=st.sem(data)).interval(0.99)

(254640.55365368252, 286667.61461461004)

In [None]:
# Choose the rows to analyze. Set SAMPLE_SIZE to an integer (e.g. 200) to work
# on a reproducible random sample of the population; None keeps the whole
# dataset, which is what this analysis uses.
SAMPLE_SIZE = None

if SAMPLE_SIZE is None:
    # Copy so that later clean-up of sampled_df can never alter df itself.
    sampled_df = df.copy()
else:
    # df.sample already returns a new DataFrame; random_state fixes the draw.
    sampled_df = df.sample(n=SAMPLE_SIZE, random_state=100)

In [None]:
# Drop every row that has a missing value in any column.
# The result is bound to the name instead of using dropna(inplace=True), so
# any other reference to the same DataFrame object is left untouched and the
# cell gives the same result when re-run.
sampled_df = sampled_df.dropna()

In [None]:
# Ages at time of purchase for the rows whose Source is 'Website' and for the
# rows whose Source is 'Client', selected with a single .loc[rows, column].
WebsiteBuyer = sampled_df.loc[sampled_df['Source'] == 'Website', 'Age at time of purchase']
ClientBuyer = sampled_df.loc[sampled_df['Source'] == 'Client', 'Age at time of purchase']

# Report the size of each group, one shape per line.
print(WebsiteBuyer.shape, ClientBuyer.shape, sep="\n")

(95,)
(17,)


In [None]:
# Display the ages at time of purchase selected for Source == 'Website'.
# NOTE(review): the recorded output reports dtype object rather than a numeric
# dtype -- confirm the column holds numbers before using it in statistical tests.
WebsiteBuyer

0      19
1      22
3      22
8      27
11     26
       ..
170    69
171    69
172    71
173    71
177    76
Name: Age at time of purchase, Length: 95, dtype: object

In [None]:
# Ages at time of purchase of the Source == 'Website' rows, split by Gender.
femaleBuyer = sampled_df.loc[
    (sampled_df['Source'] == 'Website') & (sampled_df['Gender'] == 'F'),
    'Age at time of purchase',
]
maleBuyer = sampled_df.loc[
    (sampled_df['Source'] == 'Website') & (sampled_df['Gender'] == 'M'),
    'Age at time of purchase',
]

# Group sizes: female first, then male, one shape per line.
print(femaleBuyer.shape, maleBuyer.shape, sep="\n")

(38,)
(57,)


In [None]:
# Frozen Student t distribution for the two-sample comparison of ages:
# degrees of freedom = n_female + n_male - 2.
rv = st.t(df=len(femaleBuyer) + len(maleBuyer) - 2)

In [None]:
# Independent two-sample t-test comparing femaleBuyer and maleBuyer ages.
# equal_var=True is SciPy's default (pooled-variance Student test); it is
# spelled out here to make the assumption visible.
t_stat, pvalue = st.ttest_ind(femaleBuyer, maleBuyer, equal_var=True)

# p-value first, then the test statistic.
print(pvalue, t_stat)

0.9345456425437244 0.08234966517675023


In [None]:
# Sale prices split by the buyer's Gender, selected with .loc[rows, column].
FemaleBuyer = sampled_df.loc[sampled_df['Gender'] == 'F', 'Price']
MaleBuyer = sampled_df.loc[sampled_df['Gender'] == 'M', 'Price']

# Group sizes: female first, then male, one shape per line.
print(FemaleBuyer.shape, MaleBuyer.shape, sep="\n")

(65,)
(99,)


In [None]:
# Frozen Student t distribution with n_female + n_male - 2 degrees of freedom.
rv = st.t(df=len(FemaleBuyer) + len(MaleBuyer) - 2)

# Independent two-sample t-test on sale prices; equal_var=True is SciPy's
# default (pooled variance) and is written out to make the assumption visible.
t_stat, pvalue = st.ttest_ind(FemaleBuyer, MaleBuyer, equal_var=True)

# Show the p-value as the cell result.
pvalue

0.6170586622511833