In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [94]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
spark

In [23]:
# Load the csv into a dataframe
churn_df = spark.read.csv("churn.csv", header=True, inferSchema=True)
churn_df

_c0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Churn
0,Cameron Williams,42.0,11066.8,0,7.22,8.0,1
1,Kevin Mueller,41.0,11916.22,0,6.5,11.0,1
2,Eric Lozano,38.0,12884.75,0,6.67,12.0,1
3,Phillip White,42.0,8010.76,0,6.71,10.0,1
4,Cynthia Norton,37.0,9191.58,0,5.56,9.0,1
5,Jessica Williams,48.0,10356.02,0,5.12,8.0,1
6,Eric Butler,44.0,11331.58,1,5.23,11.0,1
7,Zachary Walsh,32.0,9885.12,1,6.92,9.0,1
8,Ashlee Carr,43.0,14062.6,1,5.46,11.0,1
9,Jennifer Lynch,40.0,8066.94,1,7.11,11.0,1


In [24]:
df=churn_df.withColumnRenamed("_c0","Index")

In [25]:

df.printSchema()


root
 |-- Index: integer (nullable = true)
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Churn: integer (nullable = true)



In [26]:
df.limit(5)

Index,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Churn
0,Cameron Williams,42.0,11066.8,0,7.22,8.0,1
1,Kevin Mueller,41.0,11916.22,0,6.5,11.0,1
2,Eric Lozano,38.0,12884.75,0,6.67,12.0,1
3,Phillip White,42.0,8010.76,0,6.71,10.0,1
4,Cynthia Norton,37.0,9191.58,0,5.56,9.0,1


In [27]:
df.describe().show()

+-------+------------------+-------------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+
|summary|             Index|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|              Churn|
+-------+------------------+-------------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+
|  count|               900|          900|              900|              900|               900|              900|               900|                900|
|   mean|             449.5|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|0.16666666666666666|
| stddev|259.95191863111916|         null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.7648355920350969| 0.3728852122772358|
|    min|                 0|   Aaron King|             22.0|          

In [28]:
df.columns

['Index',
 'Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Churn']

In [29]:
from pyspark.ml.feature import VectorAssembler

In [30]:
assembler=VectorAssembler(inputCols=['Age','Total_Purchase','Account_Manager','Years','Num_Sites'],outputCol="features") #numarical columns combined into a column as feature

In [32]:
output=assembler.transform(df)

In [33]:
df_final=output.select("features","churn")

In [34]:
df_final.show()

+--------------------+-----+
|            features|churn|
+--------------------+-----+
|[42.0,11066.8,0.0...|    1|
|[41.0,11916.22,0....|    1|
|[38.0,12884.75,0....|    1|
|[42.0,8010.76,0.0...|    1|
|[37.0,9191.58,0.0...|    1|
|[48.0,10356.02,0....|    1|
|[44.0,11331.58,1....|    1|
|[32.0,9885.12,1.0...|    1|
|[43.0,14062.6,1.0...|    1|
|[40.0,8066.94,1.0...|    1|
|[30.0,11575.37,1....|    1|
|[45.0,8771.02,1.0...|    1|
|[45.0,8988.67,1.0...|    1|
|[40.0,8283.32,1.0...|    1|
|[41.0,6569.87,1.0...|    1|
|[38.0,10494.82,1....|    1|
|[45.0,8213.41,1.0...|    1|
|[43.0,11226.88,0....|    1|
|[53.0,5515.09,0.0...|    1|
|[46.0,8046.4,1.0,...|    1|
+--------------------+-----+
only showing top 20 rows



In [35]:
train,test=df_final.randomSplit([0.7,0.3],seed=42)

In [36]:
from pyspark.ml.classification import LogisticRegression

In [37]:
lr=LogisticRegression(labelCol="churn")

In [38]:
lrm=lr.fit(train)

In [39]:
lrm_summary=lrm.summary

In [40]:
lrm_summary.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[22.0,11254.38,1....|  0.0|[4.55979933582600...|[0.98964420615385...|       0.0|
|[25.0,9672.03,0.0...|  0.0|[4.67536684163721...|[0.99076399423917...|       0.0|
|[26.0,8939.61,0.0...|  0.0|[6.28230375013810...|[0.99813439726403...|       0.0|
|[27.0,8628.8,1.0,...|  0.0|[5.32554193456119...|[0.99515784712679...|       0.0|
|[28.0,8670.98,0.0...|  0.0|[7.59026142971801...|[0.99949490632507...|       0.0|
|[28.0,11128.95,1....|  0.0|[4.09748998252342...|[0.98365719925299...|       0.0|
|[29.0,5900.78,1.0...|  0.0|[4.06733654772172...|[0.98316532508264...|       0.0|
|[29.0,8688.17,1.0...|  1.0|[2.71962043931940...|[0.93817452170582...|       0.0|
|[29.0,9378.24,0.0...|  0.0|[4.73007501034927...|[0.99125140444539...|       0.0|
|[29.0,12711.15,

In [41]:
lrm_summary.predictions.describe().show()

+-------+------------------+-------------------+
|summary|             churn|         prediction|
+-------+------------------+-------------------+
|  count|               667|                667|
|   mean|0.1634182908545727|0.12293853073463268|
| stddev|0.3700243606477147|0.32861306618408714|
|    min|               0.0|                0.0|
|    max|               1.0|                1.0|
+-------+------------------+-------------------+



In [42]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [43]:
pred_labels=lrm.evaluate(test)

In [44]:
pred_labels.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[26.0,8787.39,1.0...|    1|[0.79106193949545...|[0.68805930409057...|       0.0|
|[28.0,9090.43,1.0...|    0|[1.61026634613434...|[0.83344836179841...|       0.0|
|[28.0,11204.23,0....|    0|[1.97148327271884...|[0.87777034205971...|       0.0|
|[28.0,11245.38,0....|    0|[3.75330942021012...|[0.97709680745324...|       0.0|
|[29.0,9617.59,0.0...|    0|[4.42202740353912...|[0.98813266624674...|       0.0|
|[29.0,10203.18,1....|    0|[3.71080374825935...|[0.97612604829734...|       0.0|
|[29.0,11274.46,1....|    0|[4.39058453619493...|[0.98775823543341...|       0.0|
|[30.0,6744.87,0.0...|    0|[3.55749176407943...|[0.97228005685650...|       0.0|
|[30.0,8403.78,1.0...|    0|[5.76304532016813...|[0.99686830825215...|       0.0|
|[30.0,8874.83,0

In [45]:
eval=BinaryClassificationEvaluator(rawPredictionCol="prediction",labelCol="churn")


In [46]:
auc=eval.evaluate(pred_labels.predictions)

In [47]:
auc

0.7456808943089431

ROC eğrisi (alıcı çalışma özelliği eğrisi), tüm sınıflandırma eşiklerinde bir sınıflandırma modelinin performansını gösteren grafiktir.
AUC, ROC Eğrisi altındaki "Alan" anlamına gelir. Yani AUC, ROC eğrisinin tamamı (integral kalkülüs) altında bulunan (0,0) ile (1,1) arasındaki iki boyutlu alanın tamamını ölçer.
AUC değeri 0 ile 1 arasında değişir. Tahminleri% 100 yanlış olan bir modelin AUC'si 0.0'dır; tahminleri% 100 doğru olan AUC'nin AUC'si 1.0'dır.
auc=area under the curve

In [60]:
from pyspark.ml.classification import GBTClassifier

In [69]:
gbt = GBTClassifier(labelCol="churn", maxIter=20)

In [70]:
gbtm=gbt.fit(train)

In [71]:
pred = gbtm.transform(test)

In [72]:
pred.show() 

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[26.0,8787.39,1.0...|    1|[-0.2682540984401...|[0.36900023787308...|       1.0|
|[28.0,9090.43,1.0...|    0|[-0.5994739924844...|[0.23166241699189...|       1.0|
|[28.0,11204.23,0....|    0|[-0.6295577638867...|[0.22112618676346...|       1.0|
|[28.0,11245.38,0....|    0|[1.24815436878047...|[0.92388264328542...|       0.0|
|[29.0,9617.59,0.0...|    0|[1.23681826060691...|[0.92227285185462...|       0.0|
|[29.0,10203.18,1....|    0|[1.37726661364395...|[0.94016885949743...|       0.0|
|[29.0,11274.46,1....|    0|[1.44774276996281...|[0.94762281932138...|       0.0|
|[30.0,6744.87,0.0...|    0|[0.75969148469085...|[0.82044760172677...|       0.0|
|[30.0,8403.78,1.0...|    0|[1.46226387468729...|[0.94904569751174...|       0.0|
|[30.0,8874.83,0

In [81]:
print('Accuracy:', eval.evaluate(pred))

Accuracy: 0.7048399390243902


In [82]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [83]:
eval_accuracy = MulticlassClassificationEvaluator(labelCol="churn", predictionCol="prediction", metricName="accuracy")

In [84]:
acc = eval_accuracy.evaluate(pred)

In [86]:
print("Prediction Accuracy: ", acc)

Prediction Accuracy:  0.8454935622317596


In [89]:
from sklearn.metrics import confusion_matrix

In [90]:
y_pred=pred.select("prediction").collect()
y_orig=pred.select("churn").collect()

In [91]:
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[177  15]
 [ 21  20]]


doğru tahminlerin sayısı:177+20=197