In [13]:
from time import perf_counter

import pandas as pd
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, MultilayerPerceptronClassifier
from pyspark.ml.classification import GBTClassifier, NaiveBayes, DecisionTreeClassifier, LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession


In [2]:
# Obtain the Spark session for this notebook (application name "example").
spark = (
    SparkSession.builder
    .appName("example")
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/27 14:00:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/27 14:00:50 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# HDFS location of the events CSV export.
hdfs_path = "hdfs://localhost:9000/user/aniss/events.csv"

# Read the CSV into a DataFrame: first row is the header, column types are inferred.
events = spark.read.options(header=True, inferSchema=True).csv(hdfs_path)

In [4]:
# Names of the input feature columns, in a fixed order.
FEATURES = [
    # `other_pp` and the `from_*` columns
    'other_pp',
    'from_fk',
    'from_ti',
    'from_corner',
    'from_counter',
    'from_gk',
    'from_keeper',
    'from_ko',
    # `header` and the `*_type` columns
    'header',
    'corner_type',
    'fk_type',
    'pk_type',
    # `*_technique` columns
    'half_volley_technique',
    'volley_technique',
    'lob_technique',
    'overhead_technique',
    'backheel_technique',
    'diving_h_technique',
    # geometry / context columns
    'distance_to_goal',
    'shot_angle',
    'preferred_foot_shot',
    'under_pressure',
    # `shot_*` columns and player count
    'shot_aerial_won',
    'shot_first_time',
    'shot_one_on_one',
    'shot_open_goal',
    'shot_follows_dribble',
    'players_inside_area',
]

In [26]:
import xG_preprocessing as pp

In [27]:
# Produce the train/test DataFrames via the project's preprocessing module.
# NOTE(review): `df` is not defined in any visible cell (the CSV is loaded as `events`),
# so this cell fails on a fresh kernel with Restart & Run All. Confirm which DataFrame
# `pre_training` expects and define it explicitly in an earlier cell.
train_data, test_data = pp.pre_training(df)

### Logistic Regression

In [28]:
# Logistic regression estimator reading the "features_vector" column and predicting "goal".
lr = LogisticRegression(
    featuresCol="features_vector",
    labelCol="goal",
)

In [29]:
# Search grid for `lr`: 3 regParam values x 3 elasticNetParam values = 9 candidate settings.
paramGrid = ParamGridBuilder().addGrid(
    lr.regParam, [0.01, 0.1, 0.5]
).addGrid(
    lr.elasticNetParam, [0.0, 0.5, 1.0]
).build()


In [32]:
# Model-selection metric: area under the ROC curve on the "goal" label.
evaluator = BinaryClassificationEvaluator(labelCol="goal", metricName="areaUnderROC")
# 5-fold cross-validation over `paramGrid`. The fixed seed makes the fold assignment
# repeatable, so the selected hyperparameters do not drift between kernel restarts.
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5,
                          seed=42)

In [33]:
# Run the cross-validated grid search on the training split, then score the
# resulting model on the held-out split.
cvModel = crossval.fit(train_data)
predictions = cvModel.transform(test_data)
# `evaluator` is configured with metricName="areaUnderROC", so this value is an
# AUC, not an accuracy; label it with the evaluator's own metric name.
test_metric = evaluator.evaluate(predictions)
print(f"Test set {evaluator.getMetricName()} = {test_metric}")

25/01/27 14:12:12 WARN CacheManager: Asked to cache already cached data.
25/01/27 14:12:12 WARN CacheManager: Asked to cache already cached data.


Test set accuracy = 0.8045916546331364


In [36]:
# Re-create the train/test DataFrames before the accuracy-based experiment.
# NOTE(review): this repeats an earlier split on the same `df`; if `pre_training`
# splits randomly without a fixed seed, the two experiments are scored on different
# test sets - confirm. `df` is also not defined in any visible cell.
train_data, test_data = pp.pre_training(df)

In [41]:
# Model-selection metric for this experiment: plain classification accuracy on "goal".
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="goal", metricName="accuracy")
# Same estimator and grid as before, selected by accuracy. The fixed seed makes the
# fold assignment repeatable across kernel restarts.
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=accuracy_evaluator,
                          numFolds=5,
                          seed=42)

In [43]:
# Run the accuracy-driven cross-validated search and score the resulting model
# on the held-out split.
cvModel = crossval.fit(train_data)
predictions = cvModel.transform(test_data)
# NOTE(review): if "goal" is heavily imbalanced, accuracy can be high for a model that
# mostly predicts the majority class - compare against a majority-class baseline.
accuracy = accuracy_evaluator.evaluate(predictions)
print(f"Test set accuracy = {accuracy}")

Test set accuracy = 0.9011929247223365


In [None]:
def tune_hyperparameters(self, models=('logistic',), param_grids=None, evaluator=None, num_folds=3, process=False, time=False):
    """
    Perform hyperparameter tuning using CrossValidator for one or more models and return the results as a DataFrame.

    Each row of the result describes one parameter combination together with the
    cross-validation metric averaged over the folds for that combination
    (``CrossValidatorModel.avgMetrics``), so different combinations of the same
    model receive different scores. ``self.test_data`` is not used here.

    Reads ``self.initialize_model()``, ``self.train_data`` and, only when the
    corresponding argument is omitted, ``self.FEATURES`` and ``self.label_col``.

    :param models: Iterable of model names (str) to tune, or a single name. Defaults to ('logistic',).
    :param param_grids: Dictionary mapping model name to a list of param maps. If None, defaults to pre-defined grids.
    :param evaluator: Evaluator to use for the validation. Defaults to BinaryClassificationEvaluator.
    :param num_folds: Number of folds for cross-validation. Default is 3.
    :param process: Boolean flag to control whether to print the process during tuning. Defaults to False.
    :param time: Boolean flag to control whether to print the time taken for tuning. Defaults to False.
    :return: DataFrame with columns ['model', 'params_dict', 'score'] sorted by score (highest to lowest).
             Empty (but with those columns) when no model is tuned.
    :raises KeyError: If a requested model name has no entry in ``param_grids``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(models, str):
        models = [models]

    if param_grids is None:
        n_features = len(self.FEATURES)
        # Define default hyperparameter grids for each model
        param_grids = {
            'logistic': ParamGridBuilder().addGrid(LogisticRegression.regParam, [0.0, 0.1, 0.2]).addGrid(LogisticRegression.maxIter, [10, 50, 100]).build(),
            'rf': ParamGridBuilder().addGrid(RandomForestClassifier.numTrees, [50, 100, 200]).addGrid(RandomForestClassifier.maxDepth, [5, 10, 15]).build(),
            'mlp': ParamGridBuilder().addGrid(MultilayerPerceptronClassifier.maxIter, [50, 100]).addGrid(MultilayerPerceptronClassifier.layers, [[n_features, 10, 2], [n_features, 20, 2]]).build(),
            'gbt': ParamGridBuilder().addGrid(GBTClassifier.maxIter, [50, 100]).addGrid(GBTClassifier.maxDepth, [5, 10]).build(),
            'nb': ParamGridBuilder().addGrid(NaiveBayes.smoothing, [1.0, 1.5, 2.0]).build(),
            'dt': ParamGridBuilder().addGrid(DecisionTreeClassifier.maxDepth, [5, 10, 15]).addGrid(DecisionTreeClassifier.minInstancesPerNode, [1, 2, 3]).build(),
            'svm': ParamGridBuilder().addGrid(LinearSVC.regParam, [0.0, 0.1, 0.2]).addGrid(LinearSVC.maxIter, [50, 100]).build()
        }

    if evaluator is None:
        evaluator = BinaryClassificationEvaluator(labelCol=self.label_col, rawPredictionCol="rawPrediction")

    results = []

    for model_name in models:
        # NOTE(review): initialize_model() is called without model_name - confirm it
        # returns the estimator that corresponds to the model being tuned.
        model = self.initialize_model()

        # Params referenced on the class (e.g. LogisticRegression.regParam) are not owned
        # by the estimator instance, so a grid keyed on them is not applied when the
        # estimator is copied for each candidate. Re-key every entry on the estimator's
        # own Param of the same name; Params the estimator does not have are kept as-is.
        param_grid = [
            {
                (model.getParam(param.name) if model.hasParam(param.name) else param): value
                for param, value in param_map.items()
            }
            for param_map in param_grids[model_name]
        ]

        if process:
            print(f"Tuning hyperparameters for {model_name}...")

        # Perform cross-validation
        started = perf_counter()
        cross_val = CrossValidator(estimator=model, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=num_folds)
        cv_model = cross_val.fit(self.train_data)
        elapsed = perf_counter() - started

        # avgMetrics is aligned with the param maps: one fold-averaged score per combination.
        for i, (params, model_score) in enumerate(zip(cv_model.getEstimatorParamMaps(), cv_model.avgMetrics)):
            # Convert the ParamGrid to a clean dictionary for better readability
            params_dict = {param.name: value for param, value in params.items()}

            # Store the result for this parameter set
            results.append({
                'model': model_name,
                'params_dict': params_dict,
                'score': model_score
            })

            if process:
                print(f"Iteration {i+1}: Score for params {params_dict} is {model_score:.4f}")

        if time:
            print(f"Tuning {model_name} took {elapsed:.2f} seconds.")

    # Explicit columns keep the frame well-formed (and sortable) even when nothing was tuned.
    results_df = pd.DataFrame(results, columns=['model', 'params_dict', 'score'])
    results_df = results_df.sort_values(by='score', ascending=False).reset_index(drop=True)

    if process:
        print("Hyperparameter tuning completed.")

    return results_df
