In [2]:
!pip install pyspark --quiet
!pip install -U -q PyDrive --quiet 
!apt install openjdk-8-jdk-headless &> /dev/null
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

[K     |████████████████████████████████| 281.4 MB 23 kB/s 
[K     |████████████████████████████████| 198 kB 48.1 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [3]:
import pyspark
from pyspark.sql import SparkSession

sc = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [56]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [71]:
data = sc.read \
  .option('header', 'True')\
  .option('inferSchema', 'True')\
  .option('sep', ',')\
  .csv('/content/drive/MyDrive/data.csv')

# Without Preprocessing

In [72]:
data=data.where(data['Label'].isNull()== False)

In [80]:
data = data.distinct()
data.count()

7630

In [65]:
my_cols = data.select(data.columns)
data = my_cols.na.drop()

In [81]:
numeric_features = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

categorical_columns = []
for col in data.columns:
  if col not in numeric_features:
    categorical_columns.append(col)

categorical_columns

['customerID',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Label']

In [74]:
data=data.where(data['Label'].isNull()== False)

for column in categorical_columns:
  data = data.na.fill(value=" ",subset=[column])

for column in numeric_features:
  data = data.na.fill(value=0.0,subset=[column])

In [51]:
# unimportant_features = ['StreamingTV']
# data = data.drop(*tuple(unimportant_features))

In [82]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)
from pyspark.ml import Pipeline

stages = []

categorical_columns.remove('Label')

for feature in categorical_columns:
  stringIndexer = StringIndexer(inputCol=feature, outputCol=feature + 'Index')
  encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()],outputCols=[feature + 'Vec'])
  stages+=[stringIndexer, encoder]

labelIndexer = StringIndexer(inputCol='Label', outputCol='label')
stages += [labelIndexer]
input_features = [c + 'Vec' for c in categorical_columns]+ numeric_features
assembler = VectorAssembler(inputCols= input_features, outputCol='features')
stages+=[assembler]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(data)
assembler_df = pipelineModel.transform(data)

In [83]:
from pyspark.ml.classification import LogisticRegression
log_reg= LogisticRegression(featuresCol='features',labelCol='label')

train_data, test_data = assembler_df.randomSplit([0.8, 0.2])
fit_model = log_reg.fit(train_data)
results = fit_model.transform(test_data)

In [84]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='label')
results.select('label','prediction')
AUC = my_eval.evaluate(results)
print("AUC score is : ",AUC)

accuracy = results.filter(results.label == results.prediction).count() / float(results.count())
print("Accuracy : ",accuracy)

AUC score is :  0.6752841950861752
Accuracy :  0.7641509433962265
