In [None]:
# !pip install pyspark

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from sklearn.metrics import accuracy_score

def get_path(dataset_name,env_name='colab'):
    """Build the location of a dataset file for the given environment.

    Args:
        dataset_name: File name of the dataset (e.g. ``'data.csv'``).
        env_name: ``'colab'`` (the default) yields a raw GitHub URL; any
            other value yields a path relative to a local ``../Datasets``
            directory.

    Returns:
        The URL or relative path as a string.
    """
    # Anything other than 'colab' is treated as a local checkout.
    if env_name != 'colab':
        return f'../Datasets/{dataset_name}'

    remote_root = 'https://raw.githubusercontent.com/John-Ghaly88/Big_Data_and_NoSQL/main/Datasets/Assessment/'
    return remote_root+dataset_name

# Start (or reuse) the Spark session that drives the whole workflow.
spark = SparkSession.builder.appName('mobileprice_lr').getOrCreate()

# Load the CSV with a header row and inferred column types.
# SparkSession has no `read_csv` method; the CSV loader is reached through the
# `read` property (a DataFrameReader), i.e. `spark.read.csv(...)`.
# NOTE(review): the file name refers to a music-genre dataset, yet every column
# used below (battery_power, ram, price_range, ...) describes mobile phones --
# confirm the intended dataset file.
# NOTE(review): get_path() defaults to an https URL; verify that the reader can
# load from it in the target environment -- TODO confirm.
df = spark.read.csv(get_path('music_genre_dataset.csv'), header=True, inferSchema=True)

# Input columns that are combined into the model's feature vector.
numericCols = ['battery_power', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores','pc', 'px_height', 'px_width','ram','sc_h','sc_w','talk_time','three_g','touch_screen','wifi']

# Single-stage pipeline: assemble the columns above into one vector column
# named "feature_vector".
featurizationPipeline = Pipeline(stages = [VectorAssembler(inputCols=numericCols, outputCol="feature_vector")])

featurizationPipelineModel = featurizationPipeline.fit(df)
df = featurizationPipelineModel.transform(df)

# 80/20 train/test split; the fixed seed keeps the split repeatable.
train, test = df.randomSplit([0.8, 0.2], seed = 2018)

# Logistic regression on the assembled features, predicting 'price_range'.
lr = LogisticRegression(featuresCol = 'feature_vector', labelCol = 'price_range', maxIter=10)

lrModel = lr.fit(train)
predictions = lrModel.transform(test)

# Kept as Spark DataFrames for any later use of these names.
true_labels=predictions.select('price_range')
lr_predictions=predictions.select('prediction')

# Collect labels and predictions together in one pass: the two columns stay
# paired row-by-row and only a single Spark job is triggered, instead of
# converting two independent selections separately.
scored = predictions.select('price_range', 'prediction').toPandas()
accuracy = accuracy_score(scored['price_range'], scored['prediction'])
print("Logistic Regression Accuracy =",accuracy*100,"%")