In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("Amazon Reviews Sentiment Analysis") \
    .getOrCreate()

# Load Data
data = spark.read.csv('test.csv', inferSchema=True, header=True)

# Preprocessing Steps
# Tokenize words
regexTokenizer = RegexTokenizer(inputCol="content", outputCol="words", pattern="\\W")

# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# Convert words to feature vectors
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Convert label to numeric
label_stringIdx = StringIndexer(inputCol="label", outputCol="labelIndex")

# Pipeline
pipeline = Pipeline(stages=[regexTokenizer, remover, hashingTF, idf, label_stringIdx])

# Apply transformations
pipelineModel = pipeline.fit(data)
dataset = pipelineModel.transform(data)

dataset.show(5)


23/12/11 22:05:54 WARN StopWordsRemover: Default locale set was [en_SA]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.
                                                                                

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|               title|             content|               words|            filtered|         rawFeatures|            features|labelIndex|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    1|            Great CD|"My lovely Pat ha...|[my, lovely, pat,...|[lovely, pat, one...|(262144,[3370,218...|(262144,[3370,218...|       1.0|
|    1|One of the best g...|Despite the fact ...|[despite, the, fa...|[despite, fact, p...|(262144,[6946,844...|(262144,[6946,844...|       1.0|
|    0|Batteries died wi...|I bought this cha...|[i, bought, this,...|[bought, charger,...|(262144,[1578,576...|(262144,[1578,576...|       0.0|
|    1|works fine, but M...|Check out Maha En...|[check, out, maha...|[check, maha, ene...|(262144,[82005,10...|(262144,[82005,10.

23/12/11 22:06:03 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB


In [7]:
train_data, test_data = dataset.randomSplit([0.7, 0.3])


In [8]:
# Define Logistic Regression model
lr = LogisticRegression(featuresCol='features', labelCol='labelIndex')

# Train the model
lrModel = lr.fit(train_data)

# Make predictions
predictions = lrModel.transform(test_data)

# Evaluate the model
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="labelIndex", rawPredictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Accuracy: ", accuracy)


23/12/11 22:06:26 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:35 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:35 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/12/11 22:06:35 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:43 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:43 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:43 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:43 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:43 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:44 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:44 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:44 WARN DAGSchedul

Accuracy:  0.7625791932943319


                                                                                

## Random Forest Classifier

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

# Initialize Spark Session
spark = SparkSession.builder.appName("StudentPerformanceRegression").getOrCreate()

# Load data
data = spark.read.csv('students.csv', inferSchema=True, header=True)

# Preprocess data
# Encode categorical variable
indexer = StringIndexer(inputCol="Extracurricular Activities", outputCol="ActivitiesIndex")
data = indexer.fit(data).transform(data)

# Assemble features
assembler = VectorAssembler(inputCols=["Hours Studied", "Previous Scores", "ActivitiesIndex", "Sleep Hours", "Sample Question Papers Practiced"], outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

# Split data
train_data, test_data = data.randomSplit([0.7, 0.3])

# Model 1: Linear Regression
lr = LinearRegression(featuresCol="scaledFeatures", labelCol="Performance Index")

# Model 2: Decision Tree Regression
dt = DecisionTreeRegressor(featuresCol="scaledFeatures", labelCol="Performance Index")

# Pipeline for Linear Regression
pipeline_lr = Pipeline(stages=[assembler, scaler, lr])
model_lr = pipeline_lr.fit(train_data)
predictions_lr = model_lr.transform(test_data)

# Pipeline for Decision Tree Regression
pipeline_dt = Pipeline(stages=[assembler, scaler, dt])
model_dt = pipeline_dt.fit(train_data)
predictions_dt = model_dt.transform(test_data)

# Evaluate models
evaluator = RegressionEvaluator(labelCol="Performance Index", metricName="rmse")
rmse_lr = evaluator.evaluate(predictions_lr)
rmse_dt = evaluator.evaluate(predictions_dt)

print(f"Linear Regression RMSE: {rmse_lr}")
print(f"Decision Tree Regression RMSE: {rmse_dt}")


23/12/12 00:10:09 WARN Instrumentation: [7fa8a9a0] regParam is zero, which might cause numerical instability and overfitting.
23/12/12 00:10:09 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/12/12 00:10:09 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


Linear Regression RMSE: 2.041183036059606
Decision Tree Regression RMSE: 3.6895852597330028
