In [1]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName('Trees').getOrCreate()

In [2]:
# Load the U.S. News and World Report’s College Dataset
data = spark.read.csv(r"C:\Users\lucky\Downloads\College.csv",inferSchema=True,header=True)

# Print the schema of the dataset
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [3]:
# Display the first few rows of the dataset
data.head()

Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)

In [4]:
# Prepare features using VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [5]:
feature_cols = ['Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F_Undergrad', 'P_Undergrad',
                'Outstate', 'Room_Board', 'Books', 'Personal', 'PhD', 'Terminal', 'S_F_Ratio', 'perc_alumni', 'Expend', 'Grad_Rate']

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
output = assembler.transform(data)

In [6]:
# Convert the target label "Private" to numeric index
from pyspark.ml.feature import StringIndexer

In [7]:
indexer = StringIndexer(inputCol="Private", outputCol="PrivateIndex")
output_fixed = indexer.fit(output).transform(output)

In [8]:
# Select the final data with features and target label
final_data = output_fixed.select("features", "PrivateIndex")

In [9]:
# Split the data into training and testing sets
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [10]:
# Train different classifiers (DecisionTreeClassifier and RandomForestClassifier)
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier

In [11]:
dt_classifier = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features')
rf_classifier = RandomForestClassifier(labelCol='PrivateIndex', featuresCol='features')

In [12]:
# Fit the models
dt_model = dt_classifier.fit(train_data)
rf_model = rf_classifier.fit(train_data)

In [13]:
# Make predictions on the test data
dt_predictions = dt_model.transform(test_data)
rf_predictions = rf_model.transform(test_data)

In [14]:
# Evaluate the models' performance using accuracy
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [15]:
evaluator = MulticlassClassificationEvaluator(labelCol="PrivateIndex", predictionCol="prediction", metricName="accuracy")
dt_accuracy = evaluator.evaluate(dt_predictions)
rf_accuracy = evaluator.evaluate(rf_predictions)

In [16]:
# Display the results
print("Here are the results!")
print('-'*80)
print('A single Decision Tree had an accuracy of: {0:2.2f}%'.format(dt_accuracy*100))
print('-'*80)
print('A Random Forest ensemble had an accuracy of: {0:2.2f}%'.format(rf_accuracy*100))

Here are the results!
--------------------------------------------------------------------------------
A single Decision Tree had an accuracy of: 92.31%
--------------------------------------------------------------------------------
A Random Forest ensemble had an accuracy of: 95.48%
