In [4]:
import pandas as pd

# Read the train and test CSV files into pandas DataFrames
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Print the first 5 rows of each dataset (train first, then test)
for preview_frame in (train_df, test_df):
    print(preview_frame.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

def _select_and_encode(frame, columns):
    """Keep only `columns`, then recode 'Sex' as 1 for "male" and 0 otherwise."""
    is_male = col("Sex") == "male"
    return frame.select(*columns).withColumn("Sex", when(is_male, 1).otherwise(0))


# Start (or reuse) the Spark session for this notebook
session_builder = SparkSession.builder.appName("TitanicPreprocessing")
spark = session_builder.getOrCreate()

# Read both CSV files with a header row and inferred column types
csv_options = {"header": True, "inferSchema": True}
train_spark = spark.read.csv("train.csv", **csv_options)
test_spark = spark.read.csv("test.csv", **csv_options)

# Keep the important features and convert the categorical 'Sex' column
train_spark = _select_and_encode(train_spark, ["Survived", "Pclass", "Sex", "Age", "Fare"])
test_spark = _select_and_encode(test_spark, ["Pclass", "Sex", "Age", "Fare"])  # No "Survived" in test

# Preview the first 5 rows of each prepared DataFrame
for prepared_frame in (train_spark, test_spark):
    prepared_frame.show(5)


+--------+------+---+----+-------+
|Survived|Pclass|Sex| Age|   Fare|
+--------+------+---+----+-------+
|       0|     3|  1|22.0|   7.25|
|       1|     1|  0|38.0|71.2833|
|       1|     3|  0|26.0|  7.925|
|       1|     1|  0|35.0|   53.1|
|       0|     3|  1|35.0|   8.05|
+--------+------+---+----+-------+
only showing top 5 rows

+------+---+----+-------+
|Pclass|Sex| Age|   Fare|
+------+---+----+-------+
|     3|  1|34.5| 7.8292|
|     3|  0|47.0|    7.0|
|     2|  1|62.0| 9.6875|
|     3|  1|27.0| 8.6625|
|     3|  0|22.0|12.2875|
+------+---+----+-------+
only showing top 5 rows



In [7]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Columns the model is trained on, and the column it predicts
FEATURE_COLS = ["Pclass", "Sex", "Age", "Fare"]
TARGET_COL = "Survived"

# Convert categorical column 'Sex' to numerical.
# The fitted encoder is kept in a variable (instead of being discarded) so its
# classes_ mapping can be inspected after the cell has run.
sex_encoder = LabelEncoder()
train_df["Sex"] = sex_encoder.fit_transform(train_df["Sex"])

# Drop rows with missing values ONLY in the columns the model uses.
# A bare dropna() also discards every row that has a NaN in a column that is
# not a feature (e.g. Cabin, which contains NaN values), needlessly shrinking
# the training set.
train_df = train_df.dropna(subset=FEATURE_COLS + [TARGET_COL])

# Define features & target
X = train_df[FEATURE_COLS]
y = train_df[TARGET_COL]

# Split data: 20% held out for evaluation, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Setup MLflow
mlflow.set_experiment("Titanic-Experiments")

# Hyperparameters are defined once so the values used to build the model are
# exactly the values recorded with the run.
rf_params = {"n_estimators": 100, "random_state": 42}

with mlflow.start_run():
    mlflow.log_params(rf_params)

    model = RandomForestClassifier(**rf_params)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Accuracy on the held-out split
    acc = accuracy_score(y_test, preds)
    mlflow.log_metric("accuracy", acc)

    # Store the fitted model as a run artifact
    mlflow.sklearn.log_model(model, "model")


2025/02/16 16:20:44 INFO mlflow.tracking.fluent: Experiment with name 'Titanic-Experiments' does not exist. Creating a new experiment.
