#### practice

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('log_practice').getOrCreate()

In [19]:
my_data = spark.read.csv('titanic.csv', inferSchema = True, header = True)

In [2]:
from pyspark.ml.classification import LogisticRegression

In [20]:
my_data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [22]:
my_data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [23]:
my_cols = my_data.select(['Survived','Pclass','Sex','Age','SibSp','Parch',
                          'Ticket','Fare','Cabin','Embarked'])

In [24]:
# missing data
df = my_cols.na.drop()

In [25]:
# categorical data
from pyspark.ml.feature import StringIndexer, VectorAssembler, VectorIndexer, OneHotEncoder

In [32]:
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol = 'EmbarkIndex', outputCol = 'EmbarkVec')

In [33]:
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol = 'SexIndex', outputCol = 'SexVec')

In [34]:
assembler = VectorAssembler(inputCols = ['Pclass','SexVec','Age','SibSp','Parch',
                                        'Fare','EmbarkVec'], outputCol = 'features')

In [35]:
from pyspark.ml.pipeline import Pipeline

In [36]:
log_reg = LogisticRegression(featuresCol = 'features', labelCol='Survived')

In [37]:
pipeline = Pipeline(stages = [gender_indexer, embark_indexer, gender_encoder, embark_encoder, assembler, log_reg])

In [38]:
train, test = df.randomSplit([0.7,0.3])

In [39]:
logreg = pipeline.fit(train)

In [40]:
result = logreg.transform(test)

In [42]:
result.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- SexIndex: double (nullable = false)
 |-- EmbarkIndex: double (nullable = false)
 |-- SexVec: vector (nullable = true)
 |-- EmbarkVec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [41]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [43]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol = 'Survived')

In [44]:
AUC = my_eval.evaluate(result)

In [45]:
AUC

0.6994047619047619

#### logistic consluting project

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('log_reg').getOrCreate()

In [3]:
data = spark.read.csv('customer_churn.csv', inferSchema = True, header = True)

In [5]:
data.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [6]:
data.describe().show()

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|       Onboard_date|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|               null|                null|                null|0.16666666666666666|
| stddev| 

In [7]:
data.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [9]:
from pyspark.ml.feature import VectorAssembler

In [10]:
assembler = VectorAssembler(inputCols = ['Age','Total_Purchase','Account_Manager','Years','Num_Sites'], 
                            outputCol = 'features')

In [11]:
output = assembler.transform(data)

In [12]:
final_data = output.select('features', 'churn')

In [13]:
train, test = final_data.randomSplit([0.7, 0.3])

In [14]:
from pyspark.ml.classification import LogisticRegression

In [15]:
lr = LogisticRegression(featuresCol='features', labelCol='churn')

In [16]:
lr_model = lr.fit(train)

In [18]:
lr_summary = lr_model.summary

In [19]:
lr_summary.accuracy

0.8909090909090909

In [20]:
lr_summary.predictions.describe().show()

+-------+-------------------+------------------+
|summary|              churn|        prediction|
+-------+-------------------+------------------+
|  count|                605|               605|
|   mean|0.16363636363636364|0.1305785123966942|
| stddev|0.37025139124010303|0.3372176216806523|
|    min|                0.0|               0.0|
|    max|                1.0|               1.0|
+-------+-------------------+------------------+



In [21]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [22]:
pred_and_labels = lr_model.evaluate(test)

In [23]:
pred_and_labels.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[22.0,11254.38,1....|    0|[4.50873751712712...|[0.98910759669241...|       0.0|
|[28.0,8670.98,0.0...|    0|[7.82872891346846...|[0.99960202722735...|       0.0|
|[29.0,5900.78,1.0...|    0|[4.11795592957316...|[0.98398296796336...|       0.0|
|[29.0,9378.24,0.0...|    0|[4.84007562741067...|[0.99215556559197...|       0.0|
|[30.0,7960.64,1.0...|    1|[3.07022663998568...|[0.95564777956159...|       0.0|
|[30.0,8677.28,1.0...|    0|[4.16489850989443...|[0.98470624073455...|       0.0|
|[30.0,10183.98,1....|    0|[2.84535271468477...|[0.94507796095440...|       0.0|
|[30.0,10744.14,1....|    1|[1.73416975767731...|[0.84994500124043...|       0.0|
|[30.0,11575.37,1....|    1|[3.91824451004333...|[0.98051139841367...|       0.0|
|[30.0,13473.35,

In [25]:
evalu = BinaryClassificationEvaluator(labelCol = 'churn', rawPredictionCol='prediction')

In [26]:
auc = evalu.evaluate(pred_and_labels.predictions)

In [27]:
auc

0.7389504982320798

In [28]:
# predict on new data
final_lr = lr.fit(final_data) 

In [29]:
new_customers = spark.read.csv('new_customers.csv', inferSchema = True, header = True)

In [30]:
new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [32]:
test_new = assembler.transform(new_customers)

In [33]:
test_new.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)



In [34]:
res = final_lr.transform(test_new)

In [35]:
res.select('company','prediction').show()

+----------------+----------+
|         company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+

