<a href="https://colab.research.google.com/github/Kpranav123/BDA-Assignment2/blob/main/BDA_Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Build a Classification Model with Spark with a dataset of your choice
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

# Step 1: Start Spark session (getOrCreate reuses an already-active session)
spark = SparkSession.builder.appName("BankMarketingClassification").getOrCreate()

# Step 2: Load the dataset (semicolon-delimited CSV with a header row)
data = spark.read.csv("/content/bank.csv", header=True, inferSchema=True, sep=';')

# Step 3: Data preprocessing
# Convert the string label column "y" into a numeric "label" column
label_indexer = StringIndexer(inputCol="y", outputCol="label")

# Index every string-typed feature column.
# handleInvalid="keep" sends categories that were not seen during fit (e.g. a
# rare value that only landed in the test split) to an extra index instead of
# raising an error when the fitted pipeline transforms the test data.
categorical_cols = [field for (field, dtype) in data.dtypes if dtype == "string" and field != "y"]
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_indexed", handleInvalid="keep")
    for name in categorical_cols
]

# Assemble features: non-string columns plus the indexed categorical columns.
# The label column is excluded explicitly so it can never leak into the
# feature vector, even if "y" were stored as a numeric type.
numeric_cols = [field for (field, dtype) in data.dtypes if dtype != "string" and field != "y"]
indexed_categorical_cols = [name + "_indexed" for name in categorical_cols]
assembler = VectorAssembler(inputCols=numeric_cols + indexed_categorical_cols, outputCol="features")

# Step 4: Define classifier
classifier = LogisticRegression(featuresCol="features", labelCol="label")

# Step 5: Build pipeline (indexers must run before the assembler consumes their outputs)
pipeline = Pipeline(stages=indexers + [label_indexer, assembler, classifier])

# Step 6: Split data (seeded so the split is repeatable)
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

# Step 7: Train model on the training split only
model = pipeline.fit(train_data)

# Step 8: Evaluate model on the held-out split
predictions = model.transform(test_data)
# Spell out the metric and columns instead of relying on evaluator defaults:
# the reported number is the area under the ROC curve, not classification accuracy.
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
auc = evaluator.evaluate(predictions)
accuracy = auc  # legacy alias: the value is AUC; kept in case other cells read `accuracy`

print(f"Model Accuracy (AUC): {auc}")


Model Accuracy (AUC): 0.8813358538122231


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator

# Step 1: Initialize Spark session (getOrCreate reuses an already-active session)
spark = SparkSession.builder \
    .appName("ClusteringModel") \
    .getOrCreate()

# Step 2: Load dataset
df = spark.read.csv("/content/Clustering.csv", header=True, inferSchema=True)

# Optional: Preview data
df.show(5)

# Step 3: Assemble features into a single vector
# The preview shows a column named "_c0" holding a running row number. It is an
# identifier, not a measurement; left in the feature vector its unscaled range
# would dominate the distances KMeans computes, so it is excluded here.
# If the column is absent the filter is a no-op.
index_columns = {"_c0"}
feature_columns = [name for name in df.columns if name not in index_columns]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled_data = assembler.transform(df)

# Step 4: Build the KMeans model (fixed seed for repeatable initialisation;
# cluster ids are written to the "cluster" column)
kmeans = KMeans(k=3, seed=1, featuresCol="features", predictionCol="cluster")
model = kmeans.fit(assembled_data)

# Step 5: Make predictions
predictions = model.transform(assembled_data)

# Step 6: Evaluate the model
# The evaluator must read the same columns KMeans used and produced.
evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="cluster")
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette Score: {silhouette}")

# Optional: Show cluster assignments next to every original column
# (the row-number column is still displayed, it is just not used as a feature)
predictions.select(df.columns + ['cluster']).show()

+---+----------------+----------------+
|_c0|               x|               y|
+---+----------------+----------------+
|  1|3.36759599170382|3.53669397567831|
|  2| 2.6678697659321|4.47991877277642|
|  3|1.34417120978313|3.28259118956231|
|  4|1.38941378084597|4.68322664948847|
|  5| 1.6446438390549|4.32082237219094|
+---+----------------+----------------+
only showing top 5 rows

Silhouette Score: 0.7444141380227035
+---+-------------------+----------------+-------+
|_c0|                  x|               y|cluster|
+---+-------------------+----------------+-------+
|  1|   3.36759599170382|3.53669397567831|      1|
|  2|    2.6678697659321|4.47991877277642|      1|
|  3|   1.34417120978313|3.28259118956231|      1|
|  4|   1.38941378084597|4.68322664948847|      1|
|  5|    1.6446438390549|4.32082237219094|      1|
|  6|  0.776027424326743|2.65366676282803|      1|
|  7|   3.26410141180453| 3.6927619550371|      1|
|  8| -0.182646568697024|2.59282570702876|      1|
|  9|   1.1382150

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Step 1: Start Spark session (getOrCreate reuses an already-active session)
spark = SparkSession.builder.appName("MovieRecommendation").getOrCreate()

# Step 2: Load the ratings dataset
ratings = spark.read.csv("/content/ratings.csv", header=True, inferSchema=True)
ratings.printSchema()

# Step 3: Preprocess the data
# Keep only the columns ALS consumes
ratings = ratings.select("userId", "movieId", "rating")

# Step 4: Train-test split (seeded so the split is repeatable)
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

# Step 5: Build the ALS model
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",  # Drop rows whose prediction would be NaN (user/movie unseen in training)
    nonnegative=True,
    implicitPrefs=False,
    maxIter=10,
    rank=10,
    regParam=0.1,
    # Fix the seed for the factor initialisation as well: the split above is
    # seeded, but without this the default ALS seed is not guaranteed to be the
    # same across kernel restarts, so RMSE and recommendations could drift.
    seed=42
)

model = als.fit(training)

# Step 6: Evaluate the model on the held-out ratings
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse:.2f}")

# Step 7: Generate top 5 movie recommendations for each user
userRecs = model.recommendForAllUsers(5)
userRecs.show(5, truncate=False)

# Optional: Generate top 5 user recommendations for each movie
# movieRecs = model.recommendForAllItems(5)
# movieRecs.show(5, truncate=False)

# Step 8: Stop Spark session (nothing below this line can use `spark`)
spark.stop()


root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

Root-mean-square error = 0.88
+------+---------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                    |
+------+---------------------------------------------------------------------------------------------------+
|1     |[{177593, 6.0177426}, {33649, 5.862716}, {3925, 5.835306}, {96004, 5.6480436}, {3379, 5.6480436}]  |
|2     |[{25771, 4.886084}, {131724, 4.8327866}, {33649, 4.77594}, {171495, 4.6861277}, {184245, 4.672516}]|
|3     |[{6835, 4.908595}, {5746, 4.908595}, {5181, 4.820066}, {4518, 4.7179623}, {7899, 4.4177356}]       |
|4     |[{1241, 4.9914727}, {3851, 4.922374}, {123, 4.9020886}, {31878, 4.8874288}, {1733, 4.858461}]      |
|5     |[{89904, 4.9000654}, {1212, 4.7926