In [None]:
import sys

from google.api_core.retry import Retry
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import *
from pyspark.sql.types import DoubleType


In [None]:
# Get the JSON file path from arguments.
# The command line is validated *before* Spark is started so that a bad
# invocation fails fast instead of creating a session that is never stopped.
# NOTE(review): inside an interactive Jupyter kernel sys.argv presumably holds
# the kernel's own arguments, so this check targets script-style execution
# (e.g. spark-submit) -- confirm how this notebook is meant to be run.
if len(sys.argv) != 2:
    print("Usage: python_script.py <path-to-json>")
    sys.exit(1)

json_file_path = sys.argv[1]  # Path to JSON file on GCS

# Initialize Spark
spark = SparkSession.builder.appName("Chi-Squared").getOrCreate()

# Read the JSON file from GCS
crime_df = spark.read.json(json_file_path)

In [None]:
# Create a contingency table: one row per borough, one column per
# major_category value, each cell holding the number of matching records.
# Combinations that never occur come out of the pivot as null and are
# replaced with 0.
contingency_df = crime_df.groupBy("borough", "major_category").count() \
                         .groupBy("borough") \
                         .pivot("major_category") \
                         .sum("count") \
                         .fillna(0)

# Convert all feature columns to DoubleType ("borough" is left untouched)
contingency_df = contingency_df.select(
    [col(c).cast(DoubleType()).alias(c) if c != "borough" else col(c) for c in contingency_df.columns]
)

# Create a dictionary mapping boroughs to numeric values.
# The boroughs are ordered inside Spark before collecting so that the
# borough -> index assignment is the same on every run; the order returned by
# distinct() alone is not guaranteed.
boroughs = [
    row["borough"]
    for row in contingency_df.select("borough").distinct().orderBy("borough").collect()
]
borough_mapping = {borough: idx for idx, borough in enumerate(boroughs)}

# Split a row dictionary into its numeric label and its feature values
def row_to_labeled_point(row, borough_mapping):
    """Turn one contingency-table row into a ``(label, features)`` pair.

    Args:
        row: Mapping of column name to value; must contain a "borough" key.
        borough_mapping: Mapping of borough value to its numeric label.

    Returns:
        A tuple ``(label, features)`` where ``label`` is
        ``borough_mapping[row["borough"]]`` and ``features`` is a list of the
        values of every other key, in the iteration order of ``row``.

    Raises:
        KeyError: If ``row`` has no "borough" key or the borough is not
            present in ``borough_mapping``.
    """
    label = borough_mapping[row["borough"]]

    # Everything except the borough column is treated as a feature.
    features = []
    for name in row:
        if name == "borough":
            continue
        features.append(row[name])

    return (label, features)



In [None]:
# Map every row of the contingency DataFrame to a (label, features) tuple;
# each Row is converted to a dict first because row_to_labeled_point expects one.
labeled_rdd = contingency_df.rdd.map(
    lambda spark_row: row_to_labeled_point(spark_row.asDict(), borough_mapping)
)

In [None]:
# Convert the data to RDD of LabeledPoint.
# Kept so that existing references to labeled_rdd2 keep working; it is not
# used by the independence test below.
labeled_rdd2 = labeled_rdd.map(lambda x: LabeledPoint(x[0], Vectors.dense(x[1])))

# Perform the chi-squared test of independence between borough and major_category.
#
# Statistics.chiSqTest on an RDD of LabeledPoint treats every feature value as
# a category level and tests each feature against the label. The data here is
# already an aggregated contingency table (one row per borough, cells are
# counts), so that form would treat the counts themselves as categories.
# The matrix form of chiSqTest is the one that takes a contingency table of
# observed counts.
observed_rows = [features for _, features in labeled_rdd.collect()]
if not observed_rows:
    raise ValueError("Contingency table is empty; cannot run the chi-squared test.")

n_rows = len(observed_rows)
n_cols = len(observed_rows[0])

# Matrices.dense expects the values in column-major order.
observed = Matrices.dense(
    n_rows,
    n_cols,
    [observed_rows[r][c] for c in range(n_cols) for r in range(n_rows)],
)

chi_sq_result = Statistics.chiSqTest(observed)

# Print the results
print(f"Chi-squared statistic: {chi_sq_result.statistic}")
print(f"p-value: {chi_sq_result.pValue}")
print(f"Degrees of freedom: {chi_sq_result.degreesOfFreedom}")
print(f"Method: {chi_sq_result.method}\n")
