# Advertisements Dataset

## --------- Parallel Approach ---------

In [0]:
from pyspark.sql import SparkSession

#Spark Context initialization
spark = SparkSession.builder.appName("FeatureSelection").getOrCreate()

# Reading the dataset files
df = spark.read.format('csv').option("header", 'true').option("inferSchema", 'true').load("/mnt/team22/preprocessed_Ads_dataset.csv")
spark.conf.set("spark.sql.shuffle.partitions", 20)

df.cache()
df.limit(5).toPandas().head()

In [0]:
df.printSchema()

In [0]:
from pyspark.ml.feature import VectorAssembler, ChiSqSelector
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from time import time

# Specify the input columns (excluding the target column)
feature_cols = df.columns[:-1]

# Assemble the input columns into a feature vector
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

feature_df = assembler.transform(df).select("features", "target")
# We need to cache feature_df as we will fit it multiple times using different methods
feature_df.cache()

### Filter Method: Chi-Square Parallel Approach

In [0]:
# Create a ChiSqSelector object to select the top 10 features
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                         outputCol="selectedFeatures_CHI", labelCol="target")
start_time = time()
selected_features_target_CHI = selector.fit(feature_df).transform(
    feature_df).select(["selectedFeatures_CHI", "target"])
fit_time = time() - start_time
print("Fitting/Transforming time:", fit_time)

selected_features_target_CHI.show(5)

Fitting/Transforming time: 4.100317001342773
+--------------------+------+
|selectedFeatures_CHI|target|
+--------------------+------+
|          (10,[],[])|     0|
|          (10,[],[])|     0|
|      (10,[5],[1.0])|     0|
|      (10,[5],[1.0])|     0|
|      (10,[5],[1.0])|     0|
+--------------------+------+
only showing top 5 rows



### Wrapper Method: PCA Parallel Approach

In [0]:
from pyspark.ml.feature import PCA

# Create PCA for feature selection
pca_selector = PCA(k=10, inputCol="features",
                   outputCol="selectedFeatures_PCA")

start_time = time()
selected_features_target_PCA = pca_selector.fit(feature_df).transform(
    feature_df).select(["selectedFeatures_PCA", "target"])
fit_time = time() - start_time
print("Fitting/Transforming time:", fit_time)

selected_features_target_PCA.show(5)


Fitting/Transforming time: 20.164050579071045
+--------------------+------+
|selectedFeatures_PCA|target|
+--------------------+------+
|[-0.1155583175626...|     0|
|[-0.0592562360850...|     0|
|[-0.0198863052710...|     0|
|[-0.0450834921910...|     0|
|[-0.0075856550934...|     0|
+--------------------+------+
only showing top 5 rows



### Embedded Method: Decision Tree Parallel Approach

In [0]:
from pyspark.ml.feature import PCA
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorSlicer

# Create Decision Tree for feature selection
dt_selector = DecisionTreeClassifier(
    maxDepth=10, featuresCol="features", labelCol="target", seed=0)

# Fit the model and select top 10 features
start_time = time()
selected_features_target_DT = dt_selector.fit(feature_df)
dt_selector = VectorSlicer(inputCol="features", outputCol="selectedFeatures_DT", indices=selected_features_target_DT.featureImportances.indices.tolist()[:10])
selected_features_target_DT = dt_selector.transform(feature_df).select(["selectedFeatures_DT", "target"])
fit_time = time() - start_time
print("Fitting/Transforming time:", fit_time)

selected_features_target_DT.show(5)


Fitting/Transforming time: 18.768101930618286
+-------------------+------+
|selectedFeatures_DT|target|
+-------------------+------+
|         (10,[],[])|     0|
|         (10,[],[])|     0|
|         (10,[],[])|     0|
|         (10,[],[])|     0|
|         (10,[],[])|     0|
+-------------------+------+
only showing top 5 rows



## --------- Sequential Approach ---------

In [0]:
import pandas as pd
# pyspark is used to load the file first then it's transformed into a pandas dataframe because we cannot read the csv file directly using pd.read_csv('/mnt/team22/preprocessed_Ads_dataset.csv')
df_seq = spark.read.format('csv').option("header", 'true').load("/mnt/team22/preprocessed_Ads_dataset.csv").toPandas()

df_seq.head()

Unnamed: 0,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,...,1519,1520,1521,1522,1523,1524,1525,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535,1536,1537,1538,1539,1540,1541,1542,1543,1544,1545,1546,1547,1548,1549,1550,1551,1552,1553,1554,1555,1556,1557,target
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Filter Method: Chi-Square Sequential Approach

In [0]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from time import time

# Define input features and output variable
X = df_seq.drop('target', axis=1)
y = df_seq['target']

# Create ChiSqSelector for feature selection
chi2_selector = SelectKBest(chi2, k=10)
start_time = time()
X_f = chi2_selector.fit_transform(X, y)
fit_time = time() - start_time
print("Fitting/Transforming time:", fit_time)

# Get the selected feature indices
selected_feature_indices = chi2_selector.get_support(indices=True)

# Retrieve the selected feature names
selected_feature_names = X.columns[selected_feature_indices]

print("Selected features:", selected_feature_names)


Fitting/Transforming time: 0.7135171890258789
Selected features: Index(['351', '968', '1143', '1153', '1243', '1344', '1399', '1435', '1455',
       '1483'],
      dtype='object')


### Wrapper Method: PCA Sequential Approach

In [0]:
from sklearn.decomposition import TruncatedSVD

# Create PCA for feature selection
pca_selector = TruncatedSVD(n_components=10)
start_time = time()
X_w = pca_selector.fit_transform(X, y)
print('PCA fitting time: ', time() - start_time)

PCA fitting time:  1.1810657978057861


### Embedded Method: Decision Tree Sequential Approach

In [0]:
from sklearn.tree import DecisionTreeClassifier as DecisionTreeClassifier_sk

# Create Decision Tree for feature selection
dt = DecisionTreeClassifier_sk(max_depth=10, random_state=0)
start_time = time()
dt.fit(X, y)
imp_scores = pd.Series(dt.feature_importances_)
#top_features = imp_scores.nlargest(10).index
#X_e = X[:, top_features]
print('Decision Tree fitting time: ', time() - start_time)

Decision Tree fitting time:  0.7982912063598633


### Training parallel models

In [0]:
# CONSTANT parameters to use for both parallel and sequential Classifiers
num_trees = 100
max_depth = 5
min_samples_split = 2
min_samples_leaf = 1
max_features = "10"
seed_v = 123

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import numpy as np
import time

output_col = 'target'
# Define the evaluation properties
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', labelCol=output_col, predictionCol='prediction')

# Define an empty search space
param_grid = ParamGridBuilder().build()

# Create RandomForestClassifier for classification
rf_f = RandomForestClassifier(featuresCol='selectedFeatures_CHI', numTrees=num_trees, maxDepth=max_depth, labelCol=output_col, seed=seed_v)
rf_w = RandomForestClassifier(featuresCol='selectedFeatures_PCA', numTrees=num_trees, maxDepth=max_depth, labelCol=output_col, seed=seed_v)
rf_e = RandomForestClassifier(featuresCol='selectedFeatures_DT', numTrees=num_trees, maxDepth=max_depth, labelCol=output_col, seed=seed_v)

# Define the cross validator
cv_f = CrossValidator(estimator=rf_f, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)
cv_w = CrossValidator(estimator=rf_w, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)
cv_e = CrossValidator(estimator=rf_e, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

# Train the model
cv_model_f = cv_f.fit(selected_features_target_CHI)
cv_model_w = cv_w.fit(selected_features_target_PCA)
cv_model_e = cv_e.fit(selected_features_target_DT)

# Print the average performance
print("Chi-Square Accuracy: ", np.mean(cv_model_f.avgMetrics))
print("Chi-Square Precision: ", evaluator.evaluate(cv_model_f.transform(selected_features_target_CHI), {evaluator.metricName: "weightedPrecision"}))
print("Chi-Square Recall: ", evaluator.evaluate(cv_model_f.transform(selected_features_target_CHI), {evaluator.metricName: "weightedRecall"}))
print("Chi-Square F1 score: ", evaluator.evaluate(cv_model_f.transform(selected_features_target_CHI), {evaluator.metricName: "f1"}))

print("PCA Accuracy: ", np.mean(cv_model_w.avgMetrics))
print("PCA Precision: ", evaluator.evaluate(cv_model_w.transform(selected_features_target_PCA), {evaluator.metricName: "weightedPrecision"}))
print("PCA Recall: ", evaluator.evaluate(cv_model_w.transform(selected_features_target_PCA), {evaluator.metricName: "weightedRecall"}))
print("PCA F1 score: ", evaluator.evaluate(cv_model_w.transform(selected_features_target_PCA), {evaluator.metricName: "f1"}))

print("Decision Tree Accuracy: ", np.mean(cv_model_e.avgMetrics))
print("Decision Tree Precision: ", evaluator.evaluate(cv_model_e.transform(selected_features_target_DT), {evaluator.metricName: "weightedPrecision"}))
print("Decision Tree Recall: ", evaluator.evaluate(cv_model_e.transform(selected_features_target_DT), {evaluator.metricName: "weightedRecall"}))
print("Decision Tree F1 score: ", evaluator.evaluate(cv_model_e.transform(selected_features_target_DT), {evaluator.metricName: "f1"}))

Chi-Square Accuracy:  0.8853150982833995
Chi-Square Precision:  0.8919520683571844
Chi-Square Recall:  0.889295516925892
Chi-Square F1 score:  0.8594499956807091
PCA Accuracy:  0.945279556190958
PCA Precision:  0.947345402786504
PCA Recall:  0.9469350411710887
PCA F1 score:  0.9425867432515805
Decision Tree Accuracy:  0.9075877183536363
Decision Tree Precision:  0.9088024817589206
Decision Tree Recall:  0.9085086916742909
Decision Tree F1 score:  0.8919059232993891


### Training sequential models

In [0]:
from sklearn.ensemble import RandomForestClassifier as RandomForestClassifier_sk
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# Create RandomForestClassifier for classification
rf_f = RandomForestClassifier_sk(random_state=0)
rf_w = RandomForestClassifier_sk(random_state=0)
rf_e = RandomForestClassifier_sk(random_state=0)

# Fit the models
rf_f.fit(X_f, y)
rf_w.fit(X_w, y)
#rf_e.fit(X_e, y)

# Calculate the metrics
precision_f, recall_f, f1_f, _ = precision_recall_fscore_support(y, rf_f.predict(X_f), average='weighted')
print("Chi-Square Precision: ", precision_f)
print("Chi-Square Recall: ", recall_f)
print("Chi-Square F1-score: ", f1_f)

precision_w, recall_w, f1_w, _ = precision_recall_fscore_support(y, rf_w.predict(X_w), average='weighted')
print("PCA Precision: ", precision_w)
print("PCA Recall: ", recall_w)
print("PCA F1-score: ", f1_w)

#precision_e, recall_e, f1_e, _ = precision_recall_fscore_support(y, rf_e.predict(X_e), average='weighted')
#print("Decision Tree Precision: ", precision_e)
#print("Decision Tree Recall: ", recall_e)
#print("Decision Tree F1-score: ", f1_e)



Chi-Square Precision:  0.9443878321949887
Chi-Square Recall:  0.945410186032327
Chi-Square F1-score:  0.9416289022122376
PCA Precision:  0.9938885068800959
PCA Recall:  0.9939005794449527
PCA F1-score:  0.9938666609000851


# CovType Dataset

## --------- Parallel Approach ---------

In [0]:
from pyspark.sql import SparkSession

#Spark Context initialization
spark = SparkSession.builder.appName("FeatureSelection").getOrCreate()

df = spark.read.format('csv').option("header", 'true').option("inferSchema", 'true').load("/mnt/team22/covtype.csv")
#spark.conf.set("spark.sql.shuffle.partitions", 4)

df.cache()
df.limit(5).toPandas().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,target
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5


In [0]:
from pyspark.ml.feature import VectorAssembler, ChiSqSelector
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from time import time

# Extract feature columns
feature_cols = df.columns[:-1]

# Assemble the input columns into a feature vector
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

feature_df = assembler.transform(df).select("features", "target")
# We need to cache feature_df as we will fit it multiple times using different methods
feature_df.cache()

Out[16]: DataFrame[features: vector, target: int]

### Filter Method: Chi-Square Parallel Approach

In [0]:
# Create a ChiSqSelector object to select the top 10 features
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                         outputCol="selectedFeatures_CHI", labelCol="target")
start_time = time()
selected_features_target_CHI = selector.fit(feature_df).transform(
    feature_df).select(["selectedFeatures_CHI", "target"])
fit_time = time() - start_time
print("Fitting/Transforming time:", fit_time)

selected_features_target_CHI.limit(5).toPandas().head()

Fitting/Transforming time: 5.095085144042969


Unnamed: 0,selectedFeatures_CHI,target
0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5
1,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5
2,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
3,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
4,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5


### Wrapper Method: PCA Parallel Approach

In [0]:
from pyspark.ml.feature import PCA

# Create PCA for feature selection
pca_selector = PCA(k=10, inputCol="features",
                   outputCol="selectedFeatures_PCA")

start_time = time()
selected_features_target_PCA = pca_selector.fit(feature_df).transform(
    feature_df).select(["selectedFeatures_PCA", "target"])
fit_time = time() - start_time
print("Fitting/Transforming time:", fit_time)

selected_features_target_PCA.show(5)


Fitting/Transforming time: 4.644470930099487
+--------------------+------+
|selectedFeatures_PCA|target|
+--------------------+------+
|[-1.0089296336700...|     5|
|[-1.0089296336700...|     5|
|[-0.7159367723837...|     2|
|[-0.7164087325746...|     2|
|[-1.0089296336700...|     5|
+--------------------+------+
only showing top 5 rows



### Embedded Method: Decision Tree Parallel Approach

In [0]:
from pyspark.ml.feature import PCA
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorSlicer
from time import time

# Create Decision Tree for feature selection
dt_selector = DecisionTreeClassifier(
    maxDepth=10, featuresCol="features", labelCol="target", seed=0)

# Fit the model and select top 10 features
start_time = time()
selected_features_target_DT = dt_selector.fit(feature_df)
dt_selector = VectorSlicer(inputCol="features", outputCol="selectedFeatures_DT", indices=selected_features_target_DT.featureImportances.indices.tolist()[:10])
selected_features_target_DT = dt_selector.transform(feature_df).select(["selectedFeatures_DT", "target"])
fit_time = time() - start_time
print("Fitting/Transforming time:", fit_time)

selected_features_target_DT.show(5)


Fitting/Transforming time: 5.014081001281738
+--------------------+------+
| selectedFeatures_DT|target|
+--------------------+------+
|      (10,[0],[1.0])|     5|
|      (10,[0],[1.0])|     5|
|(10,[0,9],[1.0,1.0])|     2|
|      (10,[0],[1.0])|     2|
|      (10,[0],[1.0])|     5|
+--------------------+------+
only showing top 5 rows



## --------- Sequential Approach ---------

In [0]:
import pandas as pd
df_seq = spark.read.format('csv').option("header", 'true').load("/mnt/team22/covtype.csv").toPandas()
df_seq.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,target
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5


### Filter Method: Chi-Square Sequential Approach

In [0]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from time import time

# Define input features and output variable
X = df_seq.drop('target', axis=1)
y = df_seq['target']

# Create ChiSqSelector for feature selection
chi2_selector = SelectKBest(chi2, k=10)
start_time = time()
X_f = chi2_selector.fit_transform(X, y)
fit_time = time() - start_time
print("Fitting/Transforming time:", fit_time)

# Get the selected feature indices
selected_feature_indices = chi2_selector.get_support(indices=True)

# Retrieve the selected feature names
selected_feature_names = X.columns[selected_feature_indices]

print("Selected features:", selected_feature_names)


Fitting/Transforming time: 5.199588298797607
Selected features: Index(['0', '3', '5', '6', '7', '9', '13', '41', '42', '43'], dtype='object')


### Wrapper Method: PCA Sequential Approach

In [0]:
from sklearn.decomposition import TruncatedSVD
from time import time

# Create PCA for feature selection
pca_selector = TruncatedSVD(n_components=10)
start_time = time()
X_w = pca_selector.fit_transform(X, y)
print('PCA fitting time: ', time() - start_time)

PCA fitting time:  8.272578477859497


### Embedded Method: Decision Tree Sequential Approach

In [0]:
from sklearn.tree import DecisionTreeClassifier as DecisionTreeClassifier_sk

# Create Decision Tree for feature selection
dt = DecisionTreeClassifier_sk(max_depth=10, random_state=0)
start_time = time()
dt.fit(X, y)
imp_scores = pd.Series(dt.feature_importances_)
top_features = imp_scores.nlargest(10).index
X_e = X.iloc[:, top_features]
print('Decision Tree fitting time: ', time() - start_time)

Decision Tree fitting time:  7.075010776519775


### Training parallel models

In [0]:
# CONSTANT parameters to use for both parallel and sequential Classifiers
num_trees = 100
max_depth = 5
min_samples_split = 2
min_samples_leaf = 1
max_features = "10"
seed_v = 123

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import numpy as np
import time

output_col = 'target'
# Define the evaluation properties
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', labelCol=output_col, predictionCol='prediction')

# Define an empty search space
param_grid = ParamGridBuilder().build()

# Create RandomForestClassifier for classification
rf_f = RandomForestClassifier(featuresCol='selectedFeatures_CHI', numTrees=num_trees, maxDepth=max_depth, labelCol=output_col, seed=seed_v)
rf_w = RandomForestClassifier(featuresCol='selectedFeatures_PCA', numTrees=num_trees, maxDepth=max_depth, labelCol=output_col, seed=seed_v)
rf_e = RandomForestClassifier(featuresCol='selectedFeatures_DT', numTrees=num_trees, maxDepth=max_depth, labelCol=output_col, seed=seed_v)

# Define the cross validator
cv_f = CrossValidator(estimator=rf_f, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)
cv_w = CrossValidator(estimator=rf_w, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)
cv_e = CrossValidator(estimator=rf_e, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

# Train the model
cv_model_f = cv_f.fit(selected_features_target_CHI)
cv_model_w = cv_w.fit(selected_features_target_PCA)
cv_model_e = cv_e.fit(selected_features_target_DT)

# Print the average performance
print("Chi-Square Accuracy: ", np.mean(cv_model_f.avgMetrics))
print("Chi-Square Precision: ", evaluator.evaluate(cv_model_f.transform(selected_features_target_CHI), {evaluator.metricName: "weightedPrecision"}))
print("Chi-Square Recall: ", evaluator.evaluate(cv_model_f.transform(selected_features_target_CHI), {evaluator.metricName: "weightedRecall"}))
print("Chi-Square F1 score: ", evaluator.evaluate(cv_model_f.transform(selected_features_target_CHI), {evaluator.metricName: "f1"}))

print("PCA Accuracy: ", np.mean(cv_model_w.avgMetrics))
print("PCA Precision: ", evaluator.evaluate(cv_model_w.transform(selected_features_target_PCA), {evaluator.metricName: "weightedPrecision"}))
print("PCA Recall: ", evaluator.evaluate(cv_model_w.transform(selected_features_target_PCA), {evaluator.metricName: "weightedRecall"}))
print("PCA F1 score: ", evaluator.evaluate(cv_model_w.transform(selected_features_target_PCA), {evaluator.metricName: "f1"}))

print("Decision Tree Accuracy: ", np.mean(cv_model_e.avgMetrics))
print("Decision Tree Precision: ", evaluator.evaluate(cv_model_e.transform(selected_features_target_DT), {evaluator.metricName: "weightedPrecision"}))
print("Decision Tree Recall: ", evaluator.evaluate(cv_model_e.transform(selected_features_target_DT), {evaluator.metricName: "weightedRecall"}))
print("Decision Tree F1 score: ", evaluator.evaluate(cv_model_e.transform(selected_features_target_DT), {evaluator.metricName: "f1"}))

Chi-Square Accuracy:  0.5468080177323286
Chi-Square Precision:  0.5253619616057763
Chi-Square Recall:  0.5468079833118765
Chi-Square F1 score:  0.43374478742334915
PCA Accuracy:  0.6176013756664399
PCA Precision:  0.5575185138163695
PCA Recall:  0.6123384714945647
PCA F1 score:  0.5777748892152771
Decision Tree Accuracy:  0.5401389870430267
Decision Tree Precision:  0.5253619616057763
Decision Tree Recall:  0.5468079833118765
Decision Tree F1 score:  0.43374478742334915


### Training sequential models

In [0]:
from sklearn.ensemble import RandomForestClassifier as RandomForestClassifier_sk
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# Create RandomForestClassifier for classification
rf_f = RandomForestClassifier_sk(random_state=0)
rf_w = RandomForestClassifier_sk(random_state=0)
rf_e = RandomForestClassifier_sk(random_state=0)

# Fit the models
rf_f.fit(X_f, y)
rf_w.fit(X_w, y)
#rf_e.fit(X_e, y)

# Calculate the metrics
precision_f, recall_f, f1_f, _ = precision_recall_fscore_support(y, rf_f.predict(X_f), average='weighted')
print("Chi-Square Precision: ", precision_f)
print("Chi-Square Recall: ", recall_f)
print("Chi-Square F1-score: ", f1_f)

precision_w, recall_w, f1_w, _ = precision_recall_fscore_support(y, rf_w.predict(X_w), average='weighted')
print("PCA Precision: ", precision_w)
print("PCA Recall: ", recall_w)
print("PCA F1-score: ", f1_w)

#precision_e, recall_e, f1_e, _ = precision_recall_fscore_support(y, rf_e.predict(X_e), average='weighted')
#print("Decision Tree Precision: ", precision_e)
#print("Decision Tree Recall: ", recall_e)
#print("Decision Tree F1-score: ", f1_e)



  _warn_prf(average, modifier, msg_start, len(result))


Chi-Square Precision:  0.6257432367878596
Chi-Square Recall:  0.5742635263987663
Chi-Square F1-score:  0.4593477930267492
PCA Precision:  0.6412525800247537
PCA Recall:  0.6508075564704343
PCA F1-score:  0.6252187885251452


  _warn_prf(average, modifier, msg_start, len(result))
