In [77]:
import os
os.environ['PYSPARK_PYTHON']='python'

In [78]:
import pandas as pd
import numpy as np
from scipy import stats  # corrected from 'states' to 'stats'
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import col, desc
import matplotlib.pyplot as plt
import seaborn as sns


In [79]:
# Create a Spark session
spark = SparkSession.builder.appName("worksheetsignificance").getOrCreate()

In [80]:
state_df = spark.read.csv("state.csv", header=True, inferSchema=True)
state_df.show(4)

+----------+------+----------+--------+------+-------+-----+------+
|Population|Income|Illiteracy|Life Exp|Murder|HS Grad|Frost|  Area|
+----------+------+----------+--------+------+-------+-----+------+
|      3615|  3624|       2.1|   69.05|  15.1|   41.3|   20| 50708|
|       365|  6315|       1.5|   69.31|  11.3|   66.7|  152|566432|
|      2212|  4530|       1.8|   70.55|   7.8|   58.1|   15|113417|
|      2110|  3378|       1.9|   70.66|  10.1|   39.9|   65| 51945|
+----------+------+----------+--------+------+-------+-----+------+
only showing top 4 rows


In [81]:
state_pd = state_df.toPandas()
state_pd.head(4)

Unnamed: 0,Population,Income,Illiteracy,Life Exp,Murder,HS Grad,Frost,Area
0,3615,3624,2.1,69.05,15.1,41.3,20,50708
1,365,6315,1.5,69.31,11.3,66.7,152,566432
2,2212,4530,1.8,70.55,7.8,58.1,15,113417
3,2110,3378,1.9,70.66,10.1,39.9,65,51945


In [82]:
state_pd.shape

(50, 8)

Hypothesis Testing

In [83]:
# Test whether the mean murder rate is 8.5

murder_data = state_pd["Murder"].values
print(murder_data)

[15.1 11.3  7.8 10.1 10.3  6.8  3.1  6.2 10.7 13.9  6.2  5.3 10.3  7.1
  2.3  4.5 10.6 13.2  2.7  8.5  3.3 11.1  2.3 12.5  9.3  5.   2.9 11.5
  3.3  5.2  9.7 10.9 11.1  1.4  7.4  6.4  4.2  6.1  2.4 11.6  1.7 11.
 12.2  4.5  5.5  9.5  4.3  6.7  3.   6.9]


In [84]:
np.mean(murder_data)

np.float64(7.377999999999999)

In [85]:
t_stat = (np.mean(murder_data) - 8.5)/ (np.std(murder_data, ddof=1)/np.sqrt(len(murder_data)))

In [86]:
df = len(murder_data)

In [87]:
p_value = 2* stats.t.sf(np.abs(t_stat), df)

In [88]:
ci = stats.t.interval(0.95,df,
                      loc= np.mean(murder_data),
                      scale = np.std(murder_data, ddof =1)/np.sqrt(len(murder_data)))

In [89]:
print("Mean", np.mean(murder_data))
print("sample standard deviation: ", np.std(murder_data))

Mean 7.377999999999999
sample standard deviation:  3.6544378500666825


In [90]:
print("Test statistics: ", t_stat)

Test statistics:  -2.1491677577323403


IMPORTANT NOTE:

t_stat = 2.149 this means:

*the sample mean -2.149 standard errors awai from 8.5

*negative sign means the sample means is velow 8.5

*the larger the |t|, the more unusual our sample is 

|t| = 0.5 ----> very close to hypothesized mean (not unusual)

|t| = 1.5 ----> moderately far from hypothesized mean

|t| = 2.1 far from hypothesized mean

|t| = 5.0 ----> very far from hypothesized mean




In [91]:
print("confidence interval", ci)

confidence interval (np.float64(6.3294065080918065), np.float64(8.426593491908193))


In [92]:
print("p_value: ", p_value)

p_value:  0.03648613377210172


In [93]:
print(f'we {"REJECT H0" if p_value <= 0.05 else "FAIL TO REJECT H0"}')


we REJECT H0
