In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create the Spark session used by every later cell, or reuse one that is
# already running.
spark = (
    SparkSession.builder
    .appName("IBM_HR_Attrition")
    .getOrCreate()
)

25/06/09 21:46:09 WARN Utils: Your hostname, Molphie resolves to a loopback address: 127.0.1.1; using 192.168.6.223 instead (on interface enp4s0)
25/06/09 21:46:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/09 21:46:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Read the attrition CSV: the first row supplies the column names and Spark
# infers each column's type from the file contents.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
)

In [3]:
# Feature columns are picked by inferred dtype.
# - string columns (except the target "Attrition") are index-encoded;
# - double/int columns (except "EmployeeNumber") are used as-is.
categorical_cols = [
    name
    for name, kind in data.dtypes
    if kind == "string" and name != "Attrition"
]
numeric_cols = [
    name
    for name, kind in data.dtypes
    if kind in ("double", "int") and name != "EmployeeNumber"
]

# The target column "Attrition" is index-encoded into "label".
attrition_indexer = StringIndexer(inputCol="Attrition", outputCol="label")

# One indexer per categorical column, writing to "<column>_indexed".
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_indexed")
    for name in categorical_cols
]

# Assemble indexed categoricals first, then the numeric columns, into a
# single "features" vector.
assembler_inputs = [stage.getOutputCol() for stage in indexers] + numeric_cols
assembler = VectorAssembler(outputCol="features", inputCols=assembler_inputs)

In [4]:
# Preprocessing pipeline: label indexer, then the categorical indexers, then
# the vector assembler. It is fitted on the whole of `data` and applied to the
# same frame, so `dataset` carries the "label" and "features" columns.
pipeline = Pipeline(stages=[attrition_indexer, *indexers, assembler])
model = pipeline.fit(data)
dataset = model.transform(data)

In [5]:
# 70/30 train/test split of the preprocessed rows; the seed is fixed so the
# split can be repeated.
train_data, test_data = dataset.randomSplit(weights=[0.7, 0.3], seed=42)

# Классификация: предсказание увольнения сотрудников.

In [6]:
# Random forest of 100 trees trained on the training split.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    seed=42,
)
rf_model = rf.fit(train_data)

In [7]:
# Score the held-out split with the random forest.
predictions = rf_model.transform(test_data)

# Two evaluators over the same label/prediction columns, differing only in
# the metric they report.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
f1_evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)

accuracy = evaluator.evaluate(predictions)
f1_score = f1_evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1_score:.4f}")

Accuracy: 0.8546
F1 Score: 0.8090


# Кластеризация: сегментация сотрудников по характеристикам.


In [8]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [9]:
# Segment all employees into three clusters using the assembled "features"
# vector.
# NOTE(review): the preprocessing pipeline has no scaling stage, so columns
# with large numeric ranges may dominate the distance metric. Confirm this
# is intended.
kmeans = KMeans(k=3, seed=42, featuresCol="features")
kmeans_model = kmeans.fit(dataset)

25/06/09 21:46:20 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [10]:
# Assign every row to its cluster, then score the clustering with a
# ClusteringEvaluator left at its default settings.
cluster_predictions = kmeans_model.transform(dataset)
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(cluster_predictions)

print(f"Silhouette Score: {silhouette:.4f}")

Silhouette Score: 0.6757


In [11]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import Row

In [12]:
# Logistic regression (at most 50 iterations) trained on the same training
# split as the random forest, then applied to the held-out split.
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
    maxIter=50,
)
lr_model = lr.fit(train_data)
lr_predictions = lr_model.transform(test_data)

In [13]:
# Accuracy and F1 are computed from the hard predictions; ROC AUC is computed
# from the "rawPrediction" column.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
f1_eval = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)
binary_eval = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="label",
    rawPredictionCol="rawPrediction",
)

accuracy = evaluator.evaluate(lr_predictions)
f1 = f1_eval.evaluate(lr_predictions)
roc_auc = binary_eval.evaluate(lr_predictions)

print(f"[Logistic Regression] Accuracy: {accuracy:.4f}")
print(f"[Logistic Regression] F1 Score: {f1:.4f}")
print(f"[Logistic Regression] ROC AUC: {roc_auc:.4f}")

[Logistic Regression] Accuracy: 0.8750
[Logistic Regression] F1 Score: 0.8553
[Logistic Regression] ROC AUC: 0.7999


In [14]:
# Pair every assembler input column with its fitted logistic-regression
# coefficient and list the ten coefficients with the largest absolute value.
coef_array = lr_model.coefficients.toArray().tolist()

# zip() stops at the shorter sequence, so a length mismatch would silently
# drop or mislabel coefficients. That could happen if a vector-valued column
# is ever added to the assembler. Fail loudly instead.
if len(coef_array) != len(assembler_inputs):
    raise ValueError(
        f"Expected {len(assembler_inputs)} coefficients (one per assembler input), "
        f"got {len(coef_array)}"
    )

coef_rows = [
    Row(feature=name, coefficient=weight, abs_coeff=abs(weight))
    for name, weight in zip(assembler_inputs, coef_array)
]
coef_sdf = spark.createDataFrame(coef_rows)

# NOTE(review): the features are not scaled before fitting, so coefficient
# magnitudes reflect each column's units. Confirm that ranking by abs_coeff
# is the intended measure of influence.
print("\n[Top 10 Most Influential Features]")
coef_sdf.orderBy("abs_coeff", ascending=False).show(10, truncate=False)


[Top 10 Most Influential Features]
+------------------------+--------------------+-------------------+
|feature                 |coefficient         |abs_coeff          |
+------------------------+--------------------+-------------------+
|OverTime_indexed        |1.8828789117673954  |1.8828789117673954 |
|Department_indexed      |0.5684795046190597  |0.5684795046190597 |
|JobInvolvement          |-0.5473082779157762 |0.5473082779157762 |
|StockOptionLevel        |-0.47789011787045166|0.47789011787045166|
|Gender_indexed          |-0.47154221174564753|0.47154221174564753|
|EnvironmentSatisfaction |-0.43561554380107337|0.43561554380107337|
|JobLevel                |-0.41371868389716754|0.41371868389716754|
|JobSatisfaction         |-0.33322899696410957|0.33322899696410957|
|RelationshipSatisfaction|-0.2622509130405401 |0.2622509130405401 |
|NumCompaniesWorked      |0.17351227663856217 |0.17351227663856217|
+------------------------+--------------------+-------------------+
only showing

                                                                                