<a href="https://colab.research.google.com/github/NadirYasar/Customers_Churn_-dev1/blob/main/Customers_Churn_XGBOOST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pyspark



In [3]:
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.appName('CustomerChurn').getOrCreate()

In [5]:
data = spark.read.csv('/content/customer_churn.csv',inferSchema=True,header=True)
data.show()

+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|              Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|   Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|      Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|        Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|      Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|     

In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import unix_timestamp, current_date, to_date
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

# Spark oturumu başlatma
spark = SparkSession.builder.appName("XGBoostChurn").getOrCreate()

# Veriyi yükleme
# Veri setinin adını ve yolunu buraya ekleyin
# data = spark.read.csv('your_dataset.csv', header=True, inferSchema=True)

# Tarih sütununu düzenleme ve yeni bir sütun ekleme
data = data.withColumn("Onboard_date", to_date("Onboard_date", "yyyy-MM-dd HH:mm:ss"))

# Days_onboard hesaplaması
data = data.withColumn(
    "Days_onboard",
    (unix_timestamp(current_date()) - unix_timestamp("Onboard_date")) / (60 * 60 * 24)
)

# Gereksiz sütunları silme
data = data.drop("Names", "Location", "Onboard_date", "Company")

# Özellikler ve hedef sütunlar
feature_cols = ["Age", "Total_Purchase", "Years", "Num_Sites", "Days_onboard", "Account_Manager"]

# VectorAssembler ile özellikleri birleştiriyoruz
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Veriyi dönüştürme
data = assembler.transform(data)  # Feature'ları birleştiriyoruz

# Eğitim ve test verilerini oluşturma
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# XGBoost için veri hazırlığı
def spark_to_numpy(df, features_col, label_col):
    """Veriyi Spark DataFrame'den numpy array'e dönüştürür."""
    features = np.array(df.select(features_col).rdd.map(lambda x: x[0].toArray()).collect())
    labels = np.array(df.select(label_col).rdd.map(lambda x: x[0]).collect())
    return features, labels

# Eğitim verisini numpy array formatına çevirme
X_train, y_train = spark_to_numpy(train_data, "features", "Churn")

# Test verisini numpy array formatına çevirme
X_test, y_test = spark_to_numpy(test_data, "features", "Churn")

# XGBoost DMatrix oluşturma
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# XGBoost parametreleri
params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "scale_pos_weight": 10  # Veri dengesizliği için
}

# Modeli eğitme
num_round = 100
model = xgb.train(params, dtrain, num_round, [(dtrain, 'train'), (dtest, 'eval')], early_stopping_rounds=10)

# Test verisi üzerinde tahminler yapma
predictions = model.predict(dtest)

# AUC hesaplama (sklearn ile)
auc_score = roc_auc_score(y_test, predictions)
print(f"AUC (cross-validation): {auc_score}")

# PySpark DataFrame'ine dönüştürme (Predictions ve Churn sütunlarını birleştiriyoruz)
result_df = pd.DataFrame({"prediction": predictions, "Churn": y_test})
pred_df = spark.createDataFrame(result_df)

# Değerlendirme: AUC metriği
evaluator = BinaryClassificationEvaluator(labelCol="Churn", rawPredictionCol="prediction", metricName="areaUnderROC")

# AUC değeri hesaplama
auc = evaluator.evaluate(pred_df)
print(f"AUC (XGBoost): {auc}")


[0]	train-auc:0.92925	eval-auc:0.85345
[1]	train-auc:0.94761	eval-auc:0.78463
[2]	train-auc:0.97137	eval-auc:0.85460
[3]	train-auc:0.97827	eval-auc:0.83434
[4]	train-auc:0.98502	eval-auc:0.82284
[5]	train-auc:0.98928	eval-auc:0.82443
[6]	train-auc:0.98981	eval-auc:0.81925
[7]	train-auc:0.99297	eval-auc:0.83879
[8]	train-auc:0.99347	eval-auc:0.85718
[9]	train-auc:0.99332	eval-auc:0.85805
[10]	train-auc:0.99315	eval-auc:0.85546
[11]	train-auc:0.99285	eval-auc:0.86494
[12]	train-auc:0.99365	eval-auc:0.86379
[13]	train-auc:0.99356	eval-auc:0.87241
[14]	train-auc:0.99393	eval-auc:0.87759
[15]	train-auc:0.99386	eval-auc:0.88333
[16]	train-auc:0.99406	eval-auc:0.88851
[17]	train-auc:0.99405	eval-auc:0.89109
[18]	train-auc:0.99506	eval-auc:0.89109
[19]	train-auc:0.99557	eval-auc:0.88736
[20]	train-auc:0.99607	eval-auc:0.88879
[21]	train-auc:0.99594	eval-auc:0.89138
[22]	train-auc:0.99573	eval-auc:0.89799
[23]	train-auc:0.99604	eval-auc:0.90144
[24]	train-auc:0.99623	eval-auc:0.90402
[25]	train



[31]	train-auc:0.99820	eval-auc:0.90316
[32]	train-auc:0.99833	eval-auc:0.90431
[33]	train-auc:0.99858	eval-auc:0.90316
[34]	train-auc:0.99855	eval-auc:0.90431
[35]	train-auc:0.99863	eval-auc:0.90460
[36]	train-auc:0.99871	eval-auc:0.90661
[37]	train-auc:0.99895	eval-auc:0.90747
[38]	train-auc:0.99897	eval-auc:0.90632
[39]	train-auc:0.99917	eval-auc:0.90833
[40]	train-auc:0.99925	eval-auc:0.90690
[41]	train-auc:0.99917	eval-auc:0.90690
[42]	train-auc:0.99938	eval-auc:0.90690
[43]	train-auc:0.99968	eval-auc:0.90690
[44]	train-auc:0.99966	eval-auc:0.90718
[45]	train-auc:0.99963	eval-auc:0.91178
[46]	train-auc:0.99980	eval-auc:0.90948
[47]	train-auc:0.99983	eval-auc:0.91092
[48]	train-auc:0.99987	eval-auc:0.91236
[49]	train-auc:0.99982	eval-auc:0.91121
[50]	train-auc:0.99993	eval-auc:0.91264
[51]	train-auc:0.99996	eval-auc:0.91322
[52]	train-auc:0.99999	eval-auc:0.91236
[53]	train-auc:0.99999	eval-auc:0.91092
[54]	train-auc:1.00000	eval-auc:0.91034
[55]	train-auc:0.99999	eval-auc:0.91063
