In [8]:
!pip install pyspark
!pip install koalas


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting koalas
  Downloading koalas-0.32.0-py3-none-any.whl (593 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m593.2/593.2 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: koalas
Successfully installed koalas-0.32.0


In [10]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the CSV loaded later is read
# from underneath this mount point.
drive.mount(
    '/content/drive'
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


In [12]:
# Reuse the active Spark session if one exists, otherwise start a new one
# registered under this application name.
spark = (
    SparkSession.builder
    .appName("Automatic Essay Scoring")
    .getOrCreate()
)


In [13]:
# Load the essay CSV from the mounted Drive: the first row supplies the column
# names and column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/drive/My Drive/training_data_essay.csv')
)


In [14]:
# Sanity-check the load: print the inferred column types, then preview the
# first 20 rows with long cell values truncated (the defaults of show()).
# NOTE(review): the preview includes the nama_peserta column - check for
# personal data before sharing the rendered output.
df.printSchema()
df.show(n=20, truncate=True)


root
 |-- npm: integer (nullable = true)
 |-- nama_peserta: string (nullable = true)
 |-- jawaban: string (nullable = true)
 |-- soal: integer (nullable = true)
 |-- skor_per_soal: double (nullable = true)

+----------+------------+--------------------+----+-------------+
|       npm|nama_peserta|             jawaban|soal|skor_per_soal|
+----------+------------+--------------------+----+-------------+
|         0|       Admin|Tidak, Hanya memb...|   1|        100.0|
|         0|       Admin|Biaya dihitung be...|   2|        100.0|
|         0|       Admin|Hak cipta adalah ...|   3|        100.0|
|         0|       Admin|Dijelaskan kepada...|   4|        100.0|
|         0|       Admin|1. Melindungi dan...|   5|        100.0|
|         0|       Admin|Ruang Komputer, P...|   6|        100.0|
|         0|       Admin|Aturlah posisi pe...|   7|        100.0|
|         0|       Admin|Posisi Kepala dan...|   8|        100.0|
|         0|       Admin|1. Kecocokan soft...|   9|        100.0|
|

In [16]:
df = df.drop('npm', 'skor_per_soal')


In [20]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol='soal', outputCol='skor_per_soal')
df = indexer.fit(df).transform(df)


In [21]:
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import Tokenizer, StopWordsRemover

# Text preparation: tokenize the 'jawaban' answers into 'words', then filter
# stop words into 'filtered_words'.
# NOTE(review): StopWordsRemover is built without a stopWords list, so Spark's
# default list applies (English, per the Spark docs), while the answers previewed
# by df.show() look Indonesian - confirm the filter removes anything useful.
tokenizer = Tokenizer(inputCol='jawaban', outputCol='words')
stop_words = StopWordsRemover(inputCol='words', outputCol='filtered_words')
df = stop_words.transform(tokenizer.transform(df))

# TF-IDF: hash the filtered tokens into a 10000-slot term-frequency vector
# ('raw_features'), then rescale it by inverse document frequency ('features').
# NOTE(review): the IDF weights are fitted on the full dataset, before the
# train/test split in the next cell, so test rows influence the weights - confirm
# this is acceptable.
hashingTF = HashingTF(inputCol='filtered_words', outputCol='raw_features', numFeatures=10000)
idf = IDF(inputCol='raw_features', outputCol='features')
featurized_data = hashingTF.transform(df)
idf_model = idf.fit(featurized_data)
df = idf_model.transform(featurized_data)


In [22]:
# Random split with 0.8 / 0.2 weights for training and testing; the seed is
# fixed at 42.
trainingData, testData = df.randomSplit(weights=[0.8, 0.2], seed=42)


In [26]:
from pyspark.ml.classification import RandomForestClassifier

# Train a 10-tree random forest on the 'features' vectors against the
# 'skor_per_soal' label column. The seed is fixed (the same value the
# train/test split uses) because the forest's randomness otherwise changes the
# fitted model, and the reported accuracy, from one run to the next.
rf = RandomForestClassifier(
    labelCol='skor_per_soal',
    featuresCol='features',
    numTrees=10,
    seed=42,
)
model = rf.fit(trainingData)


In [27]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy evaluator comparing the model's 'prediction' column with the
# 'skor_per_soal' label column.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='skor_per_soal',
    predictionCol='prediction',
)

# Score the held-out rows and report the resulting accuracy.
predictions = model.transform(testData)
accuracy = evaluator.evaluate(predictions)
print('Accuracy:', accuracy)


Accuracy: 0.9545454545454546
