# In [None]:
import math
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum
from scipy.stats import ttest_ind

# Compare the per-record crime values of the two boroughs with the highest
# total crime using Welch's t-test (unequal variances).

# Threshold the p-value is compared against when reporting significance.
SIGNIFICANCE_LEVEL = 0.05


def _top_two_boroughs(crime_df):
    """Return the names of (up to) the two boroughs with the highest total.

    The total is the sum of the ``value`` column per ``borough``. Rows with a
    null borough are skipped because a null key cannot be matched when the
    samples are selected afterwards. Ties on the total are broken by borough
    name so repeated runs pick the same pair.
    """
    totals = (
        crime_df.filter(col("borough").isNotNull())
        .groupBy("borough")
        .agg(_sum("value").alias("total_crimes"))
    )
    ranked = (
        totals.orderBy(col("total_crimes").desc(), col("borough").asc())
        .limit(2)
        .collect()
    )
    return [row["borough"] for row in ranked]


def _collect_samples(crime_df, boroughs):
    """Return ``{borough: [value, ...]}`` for the given borough names.

    Null values are dropped because they cannot take part in the t-test.
    Both boroughs are fetched in a single pass over the data.
    """
    rows = (
        crime_df.filter(col("borough").isin(*boroughs) & col("value").isNotNull())
        .select("borough", "value")
        .collect()
    )
    samples = {name: [] for name in boroughs}
    for row in rows:
        samples[row["borough"]].append(row["value"])
    return samples


def _significance_message(p_value):
    """Return the human-readable verdict for ``p_value``."""
    if math.isnan(p_value):
        # NaN compares False against any threshold, which would otherwise be
        # reported as "not significant" even though no test was computed.
        return (
            "The t-test result is undefined (NaN); check that each borough "
            "has at least two non-null values."
        )
    if p_value < SIGNIFICANCE_LEVEL:
        return "The difference in mean total crimes between the two boroughs is statistically significant."
    return "The difference in mean total crimes between the two boroughs is not statistically significant."


def main(argv=None):
    """Run the analysis and return a process exit code.

    Args:
        argv: Argument vector in ``sys.argv`` layout (program name followed
            by the path to the JSON file). Defaults to ``sys.argv``.

    Returns:
        0 on success, 1 on a usage error or when the data contains fewer
        than two boroughs.
    """
    argv = sys.argv if argv is None else argv

    # Validate arguments before starting Spark so a usage error is cheap.
    if len(argv) != 2:
        print("Usage: python_script.py <path-to-json>")
        return 1

    json_file_path = argv[1]  # Path to JSON file on GCS

    spark = SparkSession.builder.appName("TTestCrimeMeans").getOrCreate()
    try:
        crime_df = spark.read.json(json_file_path)

        boroughs = _top_two_boroughs(crime_df)
        if len(boroughs) < 2:
            print(
                "Error: need at least two boroughs in the data to run a t-test.",
                file=sys.stderr,
            )
            return 1
        top_borough_1, top_borough_2 = boroughs

        samples = _collect_samples(crime_df, boroughs)

        # equal_var=False selects Welch's t-test.
        t_stat, p_value = ttest_ind(
            samples[top_borough_1], samples[top_borough_2], equal_var=False
        )

        print("T-Test Results:")
        print(f"T-Statistic: {t_stat}")
        print(f"P-Value: {p_value}")
        print(f"Top Borough 1: {top_borough_1}")
        print(f"Top Borough 2: {top_borough_2}")
        print(_significance_message(p_value))
        return 0
    finally:
        # Release the session even if reading or the test fails.
        spark.stop()


if __name__ == "__main__":
    exit_code = main()
    # Only exit explicitly on failure, as the original script did.
    if exit_code:
        sys.exit(exit_code)
