## __1. Setup Spark and load other libraries__

In [21]:
import pyspark 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
np.random.seed(60)

import pandas as pd
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

In [22]:
spark = pyspark.sql.SparkSession.builder \
    .appName("Crime_Classification") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()
sc = spark.sparkContext

## __2. Data Extraction__

In [23]:
crime_dataset = pd.read_csv('train.csv')
crime_dataset['Latlong'] = crime_dataset['X']*crime_dataset['Y']
crime_dataset.drop(['X','Y','Dates'],axis='columns',inplace=True)

In [24]:
crime_dataset

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,Latlong
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-4624.588916
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-4624.588916
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-4627.691645
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-4627.847257
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-4624.699819
...,...,...,...,...,...,...,...
878044,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-4618.426865
878045,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-4620.177499
878046,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-4624.432596
878047,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-4623.988577


In [25]:
new_data = crime_dataset.to_csv('preproccesing_data.csv', index=False)

# __3.Define Structure to build Pipeline__

In [26]:
df = spark.read.format('csv')\
          .option('header','true')\
          .option('inferSchema', 'true')\
          .option('timestamp', 'true')\
          .load('preproccesing_data.csv')


In [27]:
df.columns

['Category',
 'Descript',
 'DayOfWeek',
 'PdDistrict',
 'Resolution',
 'Address',
 'Latlong']

In [28]:
print('Dataframe Structure')
print('----------------------------------')
print(df.printSchema())
print(' ')
print('Dataframe preview')
print(df.show(5))
print(' ')
print('----------------------------------')
print('Total number of rows', df.count())

Dataframe Structure
----------------------------------
root
 |-- Category: string (nullable = true)
 |-- Descript: string (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- PdDistrict: string (nullable = true)
 |-- Resolution: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Latlong: double (nullable = true)

None
 
Dataframe preview
+--------------+--------------------+---------+----------+--------------+--------------------+------------------+
|      Category|            Descript|DayOfWeek|PdDistrict|    Resolution|             Address|           Latlong|
+--------------+--------------------+---------+----------+--------------+--------------------+------------------+
|      WARRANTS|      WARRANT ARREST|Wednesday|  NORTHERN|ARREST, BOOKED|  OAK ST / LAGUNA ST|-4624.588915745816|
|OTHER OFFENSES|TRAFFIC VIOLATION...|Wednesday|  NORTHERN|ARREST, BOOKED|  OAK ST / LAGUNA ST|-4624.588915745816|
|OTHER OFFENSES|TRAFFIC VIOLATION...|Wednesday|  NORTHERN|ARR

In [29]:
def top_n_list(df,name_column, N):
    print("Total number of unique value of"+' '+name_column+''+':'+' '+str(df.select(name_column).distinct().count()))
    print(' ')
    print('Top'+' '+str(N)+' '+'Crime'+' '+name_column)
    df.groupBy(name_column).count().withColumnRenamed('count','totalValue').orderBy(col('totalValue').desc()).show(N)
    
    
top_n_list(df, 'Resolution',12)


Total number of unique value of Resolution: 17
 
Top 12 Crime Resolution
+--------------------+----------+
|          Resolution|totalValue|
+--------------------+----------+
|                NONE|    526790|
|      ARREST, BOOKED|    206403|
|       ARREST, CITED|     77004|
|             LOCATED|     17101|
|   PSYCHOPATHIC CASE|     14534|
|           UNFOUNDED|      9585|
|     JUVENILE BOOKED|      5564|
|COMPLAINANT REFUS...|      3976|
|DISTRICT ATTORNEY...|      3934|
|      NOT PROSECUTED|      3714|
|      JUVENILE CITED|      3332|
|PROSECUTED BY OUT...|      2504|
+--------------------+----------+
only showing top 12 rows



## __4. Partition the dataset into Training and Test dataset__

In [30]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

In [31]:
# Creando una lista de columnas categóricas, excluyendo 'Category'
categorical_columns = ['Descript','DayOfWeek','PdDistrict','Resolution','Address']

In [32]:
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(df) for column in categorical_columns]

encoder = OneHotEncoder(inputCols=[indexer.getOutputCol() for indexer in indexers],
                        outputCols=[column+"_ohe" for column in categorical_columns])

# Agregar un StringIndexer solo para 'Category'
category_indexer = StringIndexer(inputCol="Category", outputCol="Category_index").fit(df)

pipeline = Pipeline(stages=indexers + [encoder, category_indexer])


In [33]:
df = pipeline.fit(df).transform(df)

In [34]:
# Updating feature_columns
feature_columns = [column+"_ohe" for column in categorical_columns] + ['Latlong']

(train_data, test_data) = df.randomSplit([0.7, 0.3])

In [35]:
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)

# Updating labelCol in LogisticRegression
lr = LogisticRegression(featuresCol='scaledFeatures', labelCol='Category_index', maxIter=10)

pipeline = Pipeline(stages=[assembler, scaler, lr])

In [36]:
model = pipeline.fit(train_data)

In [64]:
predictions = model.transform(test_data)
predictions = predictions.select('Category_index', 'prediction')
#predictions.select('Category_index', 'prediction')
predictions.show()

+--------------+----------+
|Category_index|prediction|
+--------------+----------+
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
|          27.0|      27.0|
+--------------+----------+
only showing top 20 rows



In [65]:
predictions.printSchema()

root
 |-- Category_index: double (nullable = false)
 |-- prediction: double (nullable = false)



In [66]:
evaluator = MulticlassClassificationEvaluator(labelCol="Category_index").setPredictionCol("prediction").evaluate(predictions)
print(' ')
print('--------------------------Accuracy-----------------------------')
print(' ')
print('               accuracy:{}'.format(evaluator))


 
--------------------------Accuracy-----------------------------
 
               accuracy:0.9968266523455473


In [79]:
from pyspark.mllib.evaluation import MulticlassMetrics
predictions.groupBy('Category_index', 'prediction').count().show()

# Calculate the elements of the confusion matrix
TN = predictions.filter('prediction = 0 AND Category_index  = prediction').count()
TP = predictions.filter('prediction = 1 AND Category_index  = prediction').count()
FN = predictions.filter('prediction = 0 AND Category_index  = 1').count()
FP = predictions.filter('prediction = 1 AND Category_index  = 0').count()

+--------------+----------+-----+
|Category_index|prediction|count|
+--------------+----------+-----+
|          22.0|       5.0|  234|
|          36.0|      36.0|   42|
|           7.0|       7.0|12695|
|          35.0|      35.0|   39|
|          12.0|      12.0| 4959|
|           3.0|      23.0|   90|
|           1.0|       1.0|37641|
|          31.0|      31.0|  134|
|          17.0|       1.0|    1|
|          10.0|      10.0| 7813|
|          28.0|      28.0|  377|
|          14.0|      14.0| 3052|
|          27.0|      27.0|  441|
|          21.0|      21.0| 1245|
|          17.0|      17.0| 2205|
|          26.0|      26.0|  556|
|           2.0|       2.0|27892|
|          24.0|      24.0|  644|
|          23.0|      23.0|  683|
|          19.0|      19.0| 1272|
+--------------+----------+-----+
only showing top 20 rows



AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `label` cannot be resolved. Did you mean one of the following? [`prediction`, `Category_index`].; line 1 pos 19;
'Filter ((prediction#2848 = cast(0 as double)) AND ('label = prediction#2848))
+- Project [Category_index#1877, prediction#2848]
   +- Project [Category#1205, Descript#1206, DayOfWeek#1207, PdDistrict#1208, Resolution#1209, Address#1210, Latlong#1211, Descript_index#1720, DayOfWeek_index#1735, PdDistrict_index#1751, Resolution_index#1768, Address_index#1786, Descript_ohe#1822, DayOfWeek_ohe#1823, PdDistrict_ohe#1824, Resolution_ohe#1825, Address_ohe#1826, Category_index#1877, features#2745, scaledFeatures#2768, rawPrediction#2794, probability#2819, UDF(rawPrediction#2794) AS prediction#2848]
      +- Project [Category#1205, Descript#1206, DayOfWeek#1207, PdDistrict#1208, Resolution#1209, Address#1210, Latlong#1211, Descript_index#1720, DayOfWeek_index#1735, PdDistrict_index#1751, Resolution_index#1768, Address_index#1786, Descript_ohe#1822, DayOfWeek_ohe#1823, PdDistrict_ohe#1824, Resolution_ohe#1825, Address_ohe#1826, Category_index#1877, features#2745, scaledFeatures#2768, rawPrediction#2794, UDF(rawPrediction#2794) AS probability#2819]
         +- Project [Category#1205, Descript#1206, DayOfWeek#1207, PdDistrict#1208, Resolution#1209, Address#1210, Latlong#1211, Descript_index#1720, DayOfWeek_index#1735, PdDistrict_index#1751, Resolution_index#1768, Address_index#1786, Descript_ohe#1822, DayOfWeek_ohe#1823, PdDistrict_ohe#1824, Resolution_ohe#1825, Address_ohe#1826, Category_index#1877, features#2745, scaledFeatures#2768, UDF(scaledFeatures#2768) AS rawPrediction#2794]
            +- Project [Category#1205, Descript#1206, DayOfWeek#1207, PdDistrict#1208, Resolution#1209, Address#1210, Latlong#1211, Descript_index#1720, DayOfWeek_index#1735, PdDistrict_index#1751, Resolution_index#1768, Address_index#1786, Descript_ohe#1822, DayOfWeek_ohe#1823, PdDistrict_ohe#1824, Resolution_ohe#1825, Address_ohe#1826, Category_index#1877, features#2745, UDF(features#2745) AS scaledFeatures#2768]
               +- Project [Category#1205, Descript#1206, DayOfWeek#1207, PdDistrict#1208, Resolution#1209, Address#1210, Latlong#1211, Descript_index#1720, DayOfWeek_index#1735, PdDistrict_index#1751, Resolution_index#1768, Address_index#1786, Descript_ohe#1822, DayOfWeek_ohe#1823, PdDistrict_ohe#1824, Resolution_ohe#1825, Address_ohe#1826, Category_index#1877, UDF(struct(Descript_ohe, Descript_ohe#1822, DayOfWeek_ohe, DayOfWeek_ohe#1823, PdDistrict_ohe, PdDistrict_ohe#1824, Resolution_ohe, Resolution_ohe#1825, Address_ohe, Address_ohe#1826, Latlong, Latlong#1211)) AS features#2745]
                  +- Sample 0.7, 1.0, false, 7230665535325484349
                     +- Sort [Category#1205 ASC NULLS FIRST, Descript#1206 ASC NULLS FIRST, DayOfWeek#1207 ASC NULLS FIRST, PdDistrict#1208 ASC NULLS FIRST, Resolution#1209 ASC NULLS FIRST, Address#1210 ASC NULLS FIRST, Latlong#1211 ASC NULLS FIRST, Descript_index#1720 ASC NULLS FIRST, DayOfWeek_index#1735 ASC NULLS FIRST, PdDistrict_index#1751 ASC NULLS FIRST, Resolution_index#1768 ASC NULLS FIRST, Address_index#1786 ASC NULLS FIRST, Descript_ohe#1822 ASC NULLS FIRST, DayOfWeek_ohe#1823 ASC NULLS FIRST, PdDistrict_ohe#1824 ASC NULLS FIRST, Resolution_ohe#1825 ASC NULLS FIRST, Address_ohe#1826 ASC NULLS FIRST, Category_index#1877 ASC NULLS FIRST], false
                        +- Project [Category#1205, Descript#1206, DayOfWeek#1207, PdDistrict#1208, Resolution#1209, Address#1210, Latlong#1211, Descript_index#1720, DayOfWeek_index#1735, PdDistrict_index#1751, Resolution_index#1768, Address_index#1786, Descript_ohe#1822, DayOfWeek_ohe#1823, PdDistrict_ohe#1824, Resolution_ohe#1825, Address_ohe#1826, UDF(cast(Category#1205 as string)) AS Category_index#1877]
                           +- Project [Category#1205, Descript#1206, DayOfWeek#1207, PdDistrict#1208, Resolution#1209, Address#1210, Latlong#1211, Descript_index#1720, DayOfWeek_index#1735, PdDistrict_index#1751, Resolution_index#1768, Address_index#1786, UDF(cast(Descript_index#1720 as double), 0) AS Descript_ohe#1822, UDF(cast(DayOfWeek_index#1735 as double), 1) AS DayOfWeek_ohe#1823, UDF(cast(PdDistrict_index#1751 as double), 2) AS PdDistrict_ohe#1824, UDF(cast(Resolution_index#1768 as double), 3) AS Resolution_ohe#1825, UDF(cast(Address_index#1786 as double), 4) AS Address_ohe#1826]
                              +- Project [Category#1205, Descript#1206, DayOfWeek#1207, PdDistrict#1208, Resolution#1209, Address#1210, Latlong#1211, Descript_index#1720, DayOfWeek_index#1735, PdDistrict_index#1751, Resolution_index#1768, UDF(cast(Address#1210 as string)) AS Address_index#1786]
                                 +- Project [Category#1205, Descript#1206, DayOfWeek#1207, PdDistrict#1208, Resolution#1209, Address#1210, Latlong#1211, Descript_index#1720, DayOfWeek_index#1735, PdDistrict_index#1751, UDF(cast(Resolution#1209 as string)) AS Resolution_index#1768]
                                    +- Project [Category#1205, Descript#1206, DayOfWeek#1207, PdDistrict#1208, Resolution#1209, Address#1210, Latlong#1211, Descript_index#1720, DayOfWeek_index#1735, UDF(cast(PdDistrict#1208 as string)) AS PdDistrict_index#1751]
                                       +- Project [Category#1205, Descript#1206, DayOfWeek#1207, PdDistrict#1208, Resolution#1209, Address#1210, Latlong#1211, Descript_index#1720, UDF(cast(DayOfWeek#1207 as string)) AS DayOfWeek_index#1735]
                                          +- Project [Category#1205, Descript#1206, DayOfWeek#1207, PdDistrict#1208, Resolution#1209, Address#1210, Latlong#1211, UDF(cast(Descript#1206 as string)) AS Descript_index#1720]
                                             +- Relation [Category#1205,Descript#1206,DayOfWeek#1207,PdDistrict#1208,Resolution#1209,Address#1210,Latlong#1211] csv


In [67]:
metrics = MulticlassMetrics(predictionAndLabels)

# Confusion Matrix
cm = metrics.confusionMatrix().toArray()

# True Positives
TP = cm[0][0]

# True Negatives
TN = cm[1][1]

# False Positives
FP = cm[0][1]

# False Negatives
FN = cm[1][0]

# Print out the results
print(f"True Positives: {TP}")
print(f"True Negatives: {TN}")
print(f"False Positives: {FP}")
print(f"False Negatives: {FN}")




Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 149.0 failed 1 times, most recent failure: Lost task 0.0 in stage 149.0 (TID 526) (DESKTOP-AG0MV7U executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python310\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 683, in main
RuntimeError: Python in worker has different version 3.9 than that in driver 3.10, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:561)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:767)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:749)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:514)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:179)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2303)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1589)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:179)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:76)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:578)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1589)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python310\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 683, in main
RuntimeError: Python in worker has different version 3.9 than that in driver 3.10, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:561)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:767)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:749)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:514)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:179)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2303)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
