In [1]:
import pandas as pd
import scipy.stats

#Importing the dataset
cars_data=pd.read_csv('../Datasets/clean_cars_sampled.csv')

In [2]:
#Creating a copy of dataset
cars=cars_data.copy()

#Setting working range
cars=cars[(cars['yearOfRegistration']<=2018) & (cars['yearOfRegistration']>=1950) & (cars['price']>=100) & (cars['price']<=150000) & (cars['powerPS']>=10) & (cars['powerPS']<=500)]

In [3]:
#One sample test for mean (has price changed from $6000 since last 3 years)
#Taking a sample of 1000 and alpha=0.05
sample_size=1000
sample1=cars.sample(sample_size,random_state=0)

In [4]:
#Postulated mean
pos_mean=6000
#Sample calculated mean
print(sample1['price'].mean())

6557.105


In [5]:
#Getting test staistic value and p value
from scipy.stats import ttest_1samp
statistic,pvalue=ttest_1samp(sample1['price'], pos_mean)

In [6]:
#Calculating critical values
#Getting degrees of freedom
n=len(cars['price'])
df=n-1
print(n,df)
alpha=0.05
#t distribution
from scipy.stats import t
cv=t.ppf([alpha/2,1-alpha/2],df)
print('Critical Values:',cv)
print('Test statistic:',statistic)
print('pvalue:',pvalue)
print('Do not reject null hypotheisis as test statistic lie in critical range and pvalue is greater than alpha (0.05)')

43153 43152
Critical Values: [-1.96001896  1.96001896]
Test statistic: 1.8435932299054856
pvalue: 0.06553847556585954
Do not reject null hypotheisis as test statistic lie in critical range and pvalue is greater than alpha (0.05)


In [7]:
#One sample test for proportion (has automatic transmission changed from 23% since last 3 years)
from statsmodels.stats.proportion import proportions_ztest
p0=0.23

In [8]:
#Count of automatic transmission
count=sample1['gearbox'].value_counts()[1]
#Proportion of different transmissions
nobs=len(sample1['gearbox'])
sample1['gearbox'].value_counts()/nobs

  count=sample1['gearbox'].value_counts()[1]


gearbox
manual       0.783
automatic    0.217
Name: count, dtype: float64

In [9]:
#Calculating test statistic and pvalue
statistic_oneprop,pvalue_oneprop=proportions_ztest(count=count, nobs=nobs, value=p0, alternative='two-sided', prop_var=False)
print(statistic_oneprop, pvalue_oneprop)

-0.9973155816988561 0.31861135280892805


In [10]:
#Getting critical values from normal distribution
from scipy.stats import norm
cv_norm=norm.ppf([alpha/2,1-alpha/2])
print(cv_norm)
print('Do not reject null hypotheisis as test statistic lie in critical range and pvalue is greater than alpha (0.05)')

[-1.95996398  1.95996398]
Do not reject null hypotheisis as test statistic lie in critical range and pvalue is greater than alpha (0.05)


In [11]:
#Two sample test for mean (is the mean price for 30k-60k KM same as 70k-90k KM)
#Subsetting the data
km_70_90=cars[(cars.kilometer <= 90000) & (cars.kilometer>=70000)]
km_30_60=cars[(cars.kilometer <= 60000) & (cars.kilometer>=30000)]
sample_70_90_km=km_70_90.sample(500,random_state=0)
sample_30_60_km=km_30_60.sample(500,random_state=0)

In [12]:
#Sample Variance
print(sample_30_60_km.price.var())
print(sample_70_90_km.price.var())
#Sample Mean
print(sample_30_60_km.price.mean())
print(sample_70_90_km.price.mean())

155442577.9462085
86753098.35060078
14515.678
9450.59


In [13]:
#Computing f statistic
from scipy.stats import f
F=sample_70_90_km.price.var()/sample_30_60_km.price.var()
print(F)

0.5581038316324245


In [14]:
#Calculating degrees of freedom
df1=len(sample_30_60_km)-1
df2=len(sample_70_90_km)-1

In [15]:
#Getting true f value
fvalue=scipy.stats.f.cdf(F, df1, df2)
print(fvalue)

5.04982680053929e-11


In [16]:
#Critical values
f.ppf([alpha/2,1-alpha/2],df1,df2)
print('Reject null hypotheisis as test statistic do not lie in critical range and fvalue is less than alpha (0.05)')

Reject null hypotheisis as test statistic do not lie in critical range and fvalue is less than alpha (0.05)


In [17]:
#Welch t test for unqeual variances
from scipy.stats import ttest_ind
statistic_twomean, pvalue_twomean=ttest_ind(sample_30_60_km.price, sample_70_90_km.price,equal_var=False)
print(statistic_twomean,pvalue_twomean)

7.277610434526923 7.258473522297715e-13


In [18]:
#Calculating degree of freedom
N1=len(sample_30_60_km)
N2=len(sample_70_90_km)
s12=sample_30_60_km.price.var()
s22=sample_70_90_km.price.var()
df=(((s12/N1)+(s22/N2))**2)/((((s12/N1)**2)/(N1-1))+(((s22/N2)**2)/(N2-1)))
print(df)

923.7016134521454


In [19]:
#Critical values
cv_t=t.ppf([alpha/2,1-alpha/2],df)
print(cv_t)
print('Reject null hypotheisis as test statistic do not lie in critical range and pvalue is less than alpha (0.05)')

[-1.96253552  1.96253552]
Reject null hypotheisis as test statistic do not lie in critical range and pvalue is less than alpha (0.05)


In [20]:
#Two sample test for proportion (are proportion of petrol cars from 2009-2013 and 2014-2018 different)
#Subsetting based on year
year_14_18=cars[(cars.yearOfRegistration<=2018) & (cars.yearOfRegistration>=2014)]
year_09_13=cars[(cars.yearOfRegistration<=2013) & (cars.yearOfRegistration>=2009)]

In [21]:
#Taking 1000 samples
sample_14_18=year_14_18.sample(1000,random_state=3)
sample_09_13=year_09_13.sample(1000,random_state=3)

In [22]:
#Calculating the proportion of both
from statsmodels.stats.proportion import proportions_ztest
count=[(sample_14_18['fuelType']=='petrol').sum(),(sample_09_13['fuelType']=='petrol').sum()]
nobs=[len(sample_14_18),len(sample_09_13)]
print(count[0]/nobs[0])
print(count[1]/nobs[1])

0.734
0.522


In [23]:
#Calculating statistic and pvalue
statistic,pvalue=proportions_ztest(count=count,nobs=nobs,value=0,alternative='two-sided',prop_var=False)
print(statistic,pvalue)

9.807754158877733 1.0426316599022217e-22


In [24]:
#Getting normal critical values
cv=norm.ppf([alpha/2,1-alpha/2])
print(cv)
print('Reject null hypotheisis as test statistic do not lie in critical range and pvalue is less than alpha (0.05)')

[-1.95996398  1.95996398]
Reject null hypotheisis as test statistic do not lie in critical range and pvalue is less than alpha (0.05)


In [25]:
#Chi-square test of independence (is vehicleType dependent on fuelType)
#Setting crosstab between fueltype and vehicletype
cross_table=pd.crosstab((cars['fuelType']),cars['vehicleType'])

In [26]:
#Applying function chi2_contigency
cont=scipy.stats.chi2_contingency(cross_table)
print(cont)

Chi2ContingencyResult(statistic=8223.500448347113, pvalue=0.0, dof=42, expected_freq=array([[5.96259820e+00, 4.62331703e+00, 3.74472227e+00, 2.42781730e+01,
        5.39661205e-01, 1.53951521e+01, 1.34174913e+01, 3.03888490e+00],
       [1.09417876e+03, 8.48411234e+02, 6.87182907e+02, 4.45521572e+03,
        9.90316316e+01, 2.82511883e+03, 2.46220413e+03, 5.57656779e+02],
       [8.39802563e-01, 6.51171413e-01, 5.27425671e-01, 3.41946099e+00,
        7.60086205e-02, 2.16833129e+00, 1.88978750e+00, 4.28011957e-01],
       [3.02328923e+00, 2.34421709e+00, 1.89873242e+00, 1.23100596e+01,
        2.73631034e-01, 7.80599263e+00, 6.80323500e+00, 1.54084305e+00],
       [5.85342386e+01, 4.53866475e+01, 3.67615693e+01, 2.38336431e+02,
        5.29780085e+00, 1.51132691e+02, 1.31718189e+02, 2.98324334e+01],
       [5.03881538e-01, 3.90702848e-01, 3.16455403e-01, 2.05167659e+00,
        4.56051723e-02, 1.30099877e+00, 1.13387250e+00, 2.56807174e-01],
       [2.46095743e+03, 1.90819271e+03, 1.545

In [27]:
#Calculating degrees of freedom
df=(cross_table.shape[0]-1)*(cross_table.shape[1]-1)
print(df)

42


In [28]:
#Getting critical values
from scipy.stats import chi2
chi2.ppf(q=[alpha/2,1-alpha/2],df=42)
print('Reject null hypotheisis as chi2 statistic do not lie in critical range and pvalue is less than alpha (0.05) conclude vehicleType is not dependent on fuelType')

Reject null hypotheisis as chi2 statistic do not lie in critical range and pvalue is less than alpha (0.05) conclude vehicleType is not dependent on fuelType
