In [1]:
# --- STEP 1: Setup Java + PySpark ---
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!pip install -q pyspark


In [2]:

# --- STEP 2: Import dependencies ---
from pyspark import SparkConf, SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from numpy import array
from google.colab import files

In [3]:


# --- STEP 3: Upload CSV file (PastHires.csv) ---
print("📁 Please upload your 'PastHires.csv' file")
uploaded = files.upload()  # Choose the CSV file from your system

# The load step reads the fixed path "PastHires.csv". Warn early when the
# upload was stored under a different name (wrong file picked, or a re-upload
# that was renamed), because the load step would then read a stale or
# missing file.
# NOTE(review): assumes files.upload() returns a dict keyed by stored filename.
if "PastHires.csv" not in uploaded:
    print("⚠️ Expected 'PastHires.csv' but the upload was stored as:", list(uploaded))

# --- STEP 4: Initialize Spark Context ---
conf = SparkConf().setMaster("local[*]").setAppName("SparkDecisionTree")
# getOrCreate reuses a SparkContext that is still alive (e.g. an earlier run
# failed before reaching sc.stop()) instead of raising because only one
# context may be active per process. This keeps the cell safe to re-run.
sc = SparkContext.getOrCreate(conf=conf)

# --- STEP 5: Helper functions ---
def binary(YN):
    """Encode a yes/no flag as an integer.

    Returns 1 only when ``YN`` equals the exact string 'Y'; every other
    value (including lowercase 'y' or an empty string) maps to 0.
    """
    if YN == 'Y':
        return 1
    return 0

def mapEducation(degree):
    """Translate a degree label into its integer education code.

    'BS' -> 1, 'MS' -> 2, 'PhD' -> 3; any other value yields 0.
    The match is exact (case-sensitive).
    """
    degreeCodes = {'BS': 1, 'MS': 2, 'PhD': 3}
    return degreeCodes.get(degree, 0)

def createLabeledPoints(fields):
    """Build an MLlib LabeledPoint from one split CSV row.

    ``fields`` is indexed positionally:
      0 years of experience (int), 1 currently employed (Y/N),
      2 number of previous employers (int), 3 education level (degree label),
      4 top-tier school (Y/N), 5 interned (Y/N), 6 hired (Y/N, the label).

    Y/N columns go through ``binary`` and the degree through ``mapEducation``;
    the six feature values keep the column order above.
    """
    featureVector = array([
        int(fields[0]),           # years of experience
        binary(fields[1]),        # currently employed
        int(fields[2]),           # previous employers
        mapEducation(fields[3]),  # education level code
        binary(fields[4]),        # top-tier school
        binary(fields[5]),        # interned
    ])
    hiredLabel = binary(fields[6])
    return LabeledPoint(hiredLabel, featureVector)

# --- STEP 6: Load and prepare data ---
rawData = sc.textFile("PastHires.csv")
header = rawData.first()
# Drop the header row and any blank lines; an empty line would otherwise be
# split into [''] and crash int('') inside createLabeledPoints.
rawData = rawData.filter(lambda line: line != header and line.strip() != "")
# Strip each field so padding around commas (e.g. "10, Y") cannot make the
# exact-match helpers (binary / mapEducation) silently fall back to 0.
csvData = rawData.map(lambda line: [field.strip() for field in line.split(",")])

# Convert to labeled points
trainingData = csvData.map(createLabeledPoints)

# --- STEP 7: Create a test sample ---
# Feature order follows createLabeledPoints:
# [yearsExperience, employed, previousEmployers, educationLevel, topTier, interned]
# i.e. 10 years, employed, 3 previous employers, BS, not top-tier, no internship.
testCandidates = [
    array([10, 1, 3, 1, 0, 0]),
]
testData = sc.parallelize(testCandidates)

# --- STEP 8: Train Decision Tree model ---
# Map of feature index -> number of categories. Indices 1, 4 and 5 are the
# 0/1 flags produced by binary(); index 3 is the education code from
# mapEducation() (values 0-3). Indices 0 and 2 are not listed here.
categoricalFeatures = {1: 2, 3: 4, 4: 2, 5: 2}

model = DecisionTree.trainClassifier(
    trainingData,
    numClasses=2,
    categoricalFeaturesInfo=categoricalFeatures,
    impurity='gini',
    maxDepth=5,
    maxBins=32,
)

# --- STEP 9: Make predictions ---
predictions = model.predict(testData)
print("\n🔮 Hire prediction:")
# collect() brings the predicted labels back to the driver; a label of 1.0
# means "hired", anything else is reported as a rejection.
for outcome in predictions.collect():
    if outcome == 1.0:
        verdict = "Hire"
    else:
        verdict = "Don’t hire"
    print("Result:", verdict)

# --- STEP 10: Show the tree structure ---
print("\n🌳 Learned Decision Tree Model:")
treeDescription = model.toDebugString()  # textual dump of the learned splits
print(treeDescription)

# --- STEP 11: Stop SparkContext (important in Colab) ---
# Release the context so a later run of this cell can create a fresh one.
sc.stop()


📁 Please upload your 'PastHires.csv' file


Saving PastHires.csv to PastHires.csv

🔮 Hire prediction:
Result: Hire

🌳 Learned Decision Tree Model:
DecisionTreeModel classifier of depth 4 with 9 nodes
  If (feature 1 in {0.0})
   If (feature 5 in {0.0})
    If (feature 0 <= 0.5)
     If (feature 3 in {1.0})
      Predict: 0.0
     Else (feature 3 not in {1.0})
      Predict: 1.0
    Else (feature 0 > 0.5)
     Predict: 0.0
   Else (feature 5 not in {0.0})
    Predict: 1.0
  Else (feature 1 not in {0.0})
   Predict: 1.0

