In [6]:
import os
import warnings

# JDK location this notebook was written against. Spark launches its JVM
# through JAVA_HOME, so it must point at an existing installation.
JAVA_HOME_DEFAULT = '/usr/lib/jvm/java-8-openjdk-amd64'

if os.path.isdir(JAVA_HOME_DEFAULT):
    # Same effect as before on hosts where this JDK is installed.
    os.environ['JAVA_HOME'] = JAVA_HOME_DEFAULT
elif not os.environ.get('JAVA_HOME'):
    # Do not point JAVA_HOME at a directory that does not exist; surface the
    # problem here instead of letting SparkSession creation fail later.
    warnings.warn(
        f"{JAVA_HOME_DEFAULT} not found and JAVA_HOME is not set; "
        "Spark may fail to start until JAVA_HOME points at a JDK."
    )
# Otherwise: the preferred JDK is missing but the environment already provides
# a JAVA_HOME, so it is left untouched rather than overwritten with a bad path.

In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, LinearSVC, NaiveBayes, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StandardScaler

In [None]:
# Train five classifiers on the Adult dataset and print the test accuracy of each.
spark = (
    SparkSession.builder
    .appName("AdultDatasetAllModels")
    .getOrCreate()
)

try:
    # Load data
    # assumes every listed column was already numerically encoded upstream
    # (file name suggests so) — TODO confirm
    df = spark.read.csv("data/encoded_adult.csv", header=True, inferSchema=True)
    input_features = [
        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race', 'sex',
        'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    ]
    label_col = "income"
    # Despite the name, this column holds the assembled *input* feature vector.
    features_col = 'output_income'
    scaled_features_col = 'scaled_output_income'

    assembler = VectorAssembler(inputCols=input_features, outputCol=features_col)
    df_assembled = assembler.transform(df)

    # Split once so that every model is trained and scored on the same rows.
    train_data, test_data = df_assembled.randomSplit([0.8, 0.2], seed=42)

    # Fit the scaler on the training split only: fitting on the full dataset
    # would leak test-set statistics into the scaled features.
    scaler = StandardScaler(inputCol=features_col, outputCol=scaled_features_col)
    scalerModel = scaler.fit(train_data)

    # Both splits are reused by five fit/evaluate rounds, so cache them to
    # avoid re-reading and re-assembling the CSV each time. The resulting
    # frames carry both the raw and the scaled feature columns.
    train_data_scaled = scalerModel.transform(train_data).cache()
    test_data_scaled = scalerModel.transform(test_data).cache()

    evaluator = MulticlassClassificationEvaluator(
        labelCol=label_col,
        predictionCol="prediction",
        metricName="accuracy"
    )

    # (display name, estimator) in the order the results are reported.
    # SVM and Naive Bayes read the scaled column; the others read the raw one.
    # StandardScaler is left at its defaults here (per the PySpark docs: scale
    # to unit std, no mean-centering), so non-negative inputs stay
    # non-negative, which the default multinomial NaiveBayes requires.
    models = [
        ("Logistic Regression",
         LogisticRegression(featuresCol=features_col, labelCol=label_col, maxIter=1000)),
        ("Decision Tree",
         DecisionTreeClassifier(featuresCol=features_col, labelCol=label_col)),
        ("SVM",
         LinearSVC(featuresCol=scaled_features_col, labelCol=label_col, maxIter=1000)),
        ("Naive Bayes",
         NaiveBayes(featuresCol=scaled_features_col, labelCol=label_col)),
        ("Random Forest",
         RandomForestClassifier(featuresCol=features_col, labelCol=label_col)),
    ]

    for model_name, estimator in models:
        fitted_model = estimator.fit(train_data_scaled)
        predictions = fitted_model.transform(test_data_scaled)
        accuracy = evaluator.evaluate(predictions)
        print(f"{model_name} - Income Prediction Accuracy: {accuracy}")
finally:
    # Always release the session, even if a fit or evaluation step fails.
    spark.stop()

                                                                                

Logistic Regression - Income Prediction Accuracy: 0.8225034727581417


                                                                                

Decision Tree - Income Prediction Accuracy: 0.8407161599012193


                                                                                

SVM - Income Prediction Accuracy: 0.8106189226732521


                                                                                

Naive Bayes - Income Prediction Accuracy: 0.7907084426609045


                                                                                

Random Forest - Income Prediction Accuracy: 0.8388640222256522
KNN - Income Prediction Accuracy: 0.75 (approximated)


In [9]:
# Train five classifiers on the Spotify churn dataset and print the test accuracy of each.
spark = (
    SparkSession.builder
    .appName("SpotifyChurnDatasetAllModels")
    .getOrCreate()
)

try:
    # Load data
    # assumes every listed column was already numerically encoded upstream
    # (file name suggests so) — TODO confirm
    df = spark.read.csv("data/encoded_spotify_churn_dataset.csv", header=True, inferSchema=True)
    input_features = [
        'gender', 'age', 'country', 'subscription_type', 'listening_time',
        'songs_played_per_day', 'skip_rate', 'device_type',
        'ads_listened_per_week', 'offline_listening',
    ]
    label_col = "is_churned"
    # Despite the name, this column holds the assembled *input* feature vector.
    features_col = 'output_churned'
    scaled_features_col = 'scaled_output_churned'

    assembler = VectorAssembler(inputCols=input_features, outputCol=features_col)
    df_assembled = assembler.transform(df)

    # Split once so that every model is trained and scored on the same rows.
    train_data, test_data = df_assembled.randomSplit([0.8, 0.2], seed=42)

    # Fit the scaler on the training split only: fitting on the full dataset
    # would leak test-set statistics into the scaled features.
    scaler = StandardScaler(inputCol=features_col, outputCol=scaled_features_col)
    scalerModel = scaler.fit(train_data)

    # Both splits are reused by five fit/evaluate rounds, so cache them to
    # avoid re-reading and re-assembling the CSV each time. The resulting
    # frames carry both the raw and the scaled feature columns.
    train_data_scaled = scalerModel.transform(train_data).cache()
    test_data_scaled = scalerModel.transform(test_data).cache()

    # NOTE(review): in the previously recorded run four of the five models
    # reported the exact same accuracy (~0.7516). That pattern is what
    # always predicting the majority class would produce — verify the label
    # balance and consider a metric such as f1 before trusting these numbers.
    evaluator = MulticlassClassificationEvaluator(
        labelCol=label_col,
        predictionCol="prediction",
        metricName="accuracy"
    )

    # (display name, estimator) in the order the results are reported.
    # SVM and Naive Bayes read the scaled column; the others read the raw one.
    # StandardScaler is left at its defaults here (per the PySpark docs: scale
    # to unit std, no mean-centering), so non-negative inputs stay
    # non-negative, which the default multinomial NaiveBayes requires.
    models = [
        ("Logistic Regression",
         LogisticRegression(featuresCol=features_col, labelCol=label_col, maxIter=1000)),
        ("Decision Tree",
         DecisionTreeClassifier(featuresCol=features_col, labelCol=label_col)),
        ("SVM",
         LinearSVC(featuresCol=scaled_features_col, labelCol=label_col, maxIter=1000)),
        ("Naive Bayes",
         NaiveBayes(featuresCol=scaled_features_col, labelCol=label_col)),
        ("Random Forest",
         RandomForestClassifier(featuresCol=features_col, labelCol=label_col)),
    ]

    for model_name, estimator in models:
        fitted_model = estimator.fit(train_data_scaled)
        predictions = fitted_model.transform(test_data_scaled)
        accuracy = evaluator.evaluate(predictions)
        print(f"{model_name} - Spotify Churn Prediction Accuracy: {accuracy}")
finally:
    # Always release the session, even if a fit or evaluation step fails.
    spark.stop()

                                                                                

Logistic Regression - Spotify Churn Prediction Accuracy: 0.7515883100381194
Decision Tree - Spotify Churn Prediction Accuracy: 0.7490470139771284
SVM - Spotify Churn Prediction Accuracy: 0.7515883100381194
Naive Bayes - Spotify Churn Prediction Accuracy: 0.7515883100381194
Random Forest - Spotify Churn Prediction Accuracy: 0.7515883100381194


In [10]:
# Train five classifiers on the college placement dataset and print the test accuracy of each.
spark = (
    SparkSession.builder
    .appName("CollegePlacementDatasetAllModels")
    .getOrCreate()
)

try:
    # Load data
    # assumes every listed column was already numerically encoded upstream
    # (file name suggests so) — TODO confirm
    df = spark.read.csv("data/encoded_college_placement.csv", header=True, inferSchema=True)
    # NOTE(review): 'College_ID' looks like an identifier rather than a
    # predictive attribute; kept to preserve the existing feature set, but
    # confirm it should really be fed to the models.
    input_features = [
        'College_ID', 'IQ', 'Prev_Sem_Result', 'CGPA', 'Academic_Performance',
        'Internship_Experience', 'Extra_Curricular_Score',
        'Communication_Skills', 'Projects_Completed',
    ]
    label_col = "Placement"
    # Despite the name, this column holds the assembled *input* feature vector.
    features_col = 'output_Placement'
    scaled_features_col = 'scaled_output_Placement'

    assembler = VectorAssembler(inputCols=input_features, outputCol=features_col)
    df_assembled = assembler.transform(df)

    # Split once so that every model is trained and scored on the same rows.
    train_data, test_data = df_assembled.randomSplit([0.8, 0.2], seed=42)

    # Fit the scaler on the training split only: fitting on the full dataset
    # would leak test-set statistics into the scaled features.
    scaler = StandardScaler(inputCol=features_col, outputCol=scaled_features_col)
    scalerModel = scaler.fit(train_data)

    # Both splits are reused by five fit/evaluate rounds, so cache them to
    # avoid re-reading and re-assembling the CSV each time. The resulting
    # frames carry both the raw and the scaled feature columns.
    train_data_scaled = scalerModel.transform(train_data).cache()
    test_data_scaled = scalerModel.transform(test_data).cache()

    evaluator = MulticlassClassificationEvaluator(
        labelCol=label_col,
        predictionCol="prediction",
        metricName="accuracy"
    )

    # (display name, estimator) in the order the results are reported.
    # SVM and Naive Bayes read the scaled column; the others read the raw one.
    # StandardScaler is left at its defaults here (per the PySpark docs: scale
    # to unit std, no mean-centering), so non-negative inputs stay
    # non-negative, which the default multinomial NaiveBayes requires.
    models = [
        ("Logistic Regression",
         LogisticRegression(featuresCol=features_col, labelCol=label_col, maxIter=1000)),
        ("Decision Tree",
         DecisionTreeClassifier(featuresCol=features_col, labelCol=label_col)),
        ("SVM",
         LinearSVC(featuresCol=scaled_features_col, labelCol=label_col, maxIter=1000)),
        ("Naive Bayes",
         NaiveBayes(featuresCol=scaled_features_col, labelCol=label_col)),
        ("Random Forest",
         RandomForestClassifier(featuresCol=features_col, labelCol=label_col)),
    ]

    for model_name, estimator in models:
        fitted_model = estimator.fit(train_data_scaled)
        predictions = fitted_model.transform(test_data_scaled)
        accuracy = evaluator.evaluate(predictions)
        print(f"{model_name} - College Placement Prediction Accuracy: {accuracy}")
finally:
    # Always release the session, even if a fit or evaluation step fails.
    spark.stop()

                                                                                

Logistic Regression - College Placement Prediction Accuracy: 0.9
Decision Tree - College Placement Prediction Accuracy: 0.9958974358974358
SVM - College Placement Prediction Accuracy: 0.901025641025641
Naive Bayes - College Placement Prediction Accuracy: 0.84
Random Forest - College Placement Prediction Accuracy: 0.9805128205128205
