<a href="https://colab.research.google.com/github/Krupa049/Machine-Learning-Model/blob/main/ML_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# End-to-end PySpark ML script: load the user-behavior CSV, build a
# preprocessing + random-forest pipeline, train on 80% of the rows and report
# accuracy on the remaining 20%.

# One seed shared by the split and the classifier so reruns are reproducible.
RANDOM_SEED = 1234

# Initialize Spark session
spark = SparkSession.builder \
    .appName("UserBehaviorPipelineWithOptimization") \
    .getOrCreate()

# Load the dataset (first row is the header; column types are inferred)
file_path = "/content/user_behavior_dataset.csv"  # Update this path if necessary
data = spark.read.csv(file_path, header=True, inferSchema=True)

# Rename the label column for easier access
# NOTE(review): the class values are passed to the classifier as-is; if they
# do not start at 0, Spark sizes the class set from the largest label --
# confirm this is intended.
data = data.withColumnRenamed("User Behavior Class", "label")

# Split data for training and testing
train_data, test_data = data.randomSplit([0.8, 0.2], seed=RANDOM_SEED)

# StringIndexers for each categorical column.
# handleInvalid="keep" sends categories (or nulls) that were not present in
# the training split to a reserved extra index instead of raising when the
# fitted pipeline transforms the test split.
device_model_indexer = StringIndexer(
    inputCol="Device Model",
    outputCol="DeviceModelIndex",
    handleInvalid="keep",
)
os_indexer = StringIndexer(
    inputCol="Operating System",
    outputCol="OSIndex",
    handleInvalid="keep",
)
gender_indexer = StringIndexer(
    inputCol="Gender",
    outputCol="GenderIndex",
    handleInvalid="keep",
)

# One-Hot Encoding for indexed columns
device_model_encoder = OneHotEncoder(inputCol="DeviceModelIndex", outputCol="DeviceModelVec")
os_encoder = OneHotEncoder(inputCol="OSIndex", outputCol="OSVec")
gender_encoder = OneHotEncoder(inputCol="GenderIndex", outputCol="GenderVec")

# Define the numeric columns
numeric_features = [
    "App Usage Time (min/day)",
    "Screen On Time (hours/day)",
    "Battery Drain (mAh/day)",
    "Number of Apps Installed",
    "Data Usage (MB/day)",
    "Age",
]

# Assemble all feature columns (numeric + one-hot vectors) into a single vector
vector_assembler = VectorAssembler(
    inputCols=numeric_features + ["DeviceModelVec", "OSVec", "GenderVec"],
    outputCol="features",
)

# Standardize the feature vector
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

# Initialize the classifier with the correct label column.
# The explicit seed makes tree construction repeatable across runs, matching
# the seeded train/test split above.
rf_classifier = RandomForestClassifier(
    labelCol="label",
    featuresCol="scaledFeatures",
    seed=RANDOM_SEED,
)

# Create the full pipeline; stage order matters: index -> encode -> assemble
# -> scale -> classify, each stage consuming the previous stage's output column.
full_pipeline = Pipeline(stages=[
    device_model_indexer, os_indexer, gender_indexer,
    device_model_encoder, os_encoder, gender_encoder,
    vector_assembler, scaler, rf_classifier,
])

# Train the final model (all stages are fit on the training split only)
final_model = full_pipeline.fit(train_data)

# Evaluate the final model on the held-out test set
predictions = final_model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")
final_accuracy = evaluator.evaluate(predictions)
print(f"Final Model Accuracy: {final_accuracy}")


Final Model Accuracy: 1.0
