In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset
def load_data(file_path):
    """Read a CSV file into a DataFrame and print a quick overview of it.

    The overview consists of the first and last five rows, the ``describe()``
    statistics, the column index and the per-column count of missing values,
    printed in that order.

    Args:
        file_path: Path (or any other source ``pd.read_csv`` accepts) of the CSV.

    Returns:
        The loaded DataFrame, unmodified.
    """
    frame = pd.read_csv(file_path)

    # Each section is built lazily, right before it is printed.
    sections = (
        ("Top five rows:\n", frame.head),
        ("Last five rows:\n", frame.tail),
        ("Dataset Summary:\n", frame.describe),
        ("Column Names:\n", lambda: frame.columns),
        ("Missing Values:\n", lambda: frame.isnull().sum()),
    )
    for heading, build in sections:
        print(heading, build())

    return frame

# Preprocess data
def preprocess_data(df):
    """Clean, encode and scale the passenger frame so it can be fed to a model.

    Steps, in order:
      1. Drop the identifier / free-text columns.
      2. Fill missing ``Age`` with the median and missing ``Fare`` with the mean.
      3. Fill missing ``Embarked`` with its most frequent value.
      4. Integer-encode ``Sex`` and ``Embarked``.
      5. Standardise ``Age`` and ``Fare``.

    Side effects:
        Sets the module-level ``label_encoder`` (fitted on ``Embarked``) and
        ``scaler`` (fitted on ``Age``/``Fare``) so that ``predict_survival``
        can apply the same transformations to new rows.

    Args:
        df: DataFrame containing at least the columns ``PassengerId``,
            ``Name``, ``Ticket``, ``Cabin``, ``Age``, ``Fare``, ``Sex`` and
            ``Embarked``.

    Returns:
        A new, preprocessed DataFrame. The input frame is left untouched, so
        the function can safely be re-run on the same raw frame.

    Raises:
        KeyError: If one of the columns to drop is missing from ``df``.
    """
    global label_encoder, scaler  # Make encoder & scaler accessible for prediction

    # Drop irrelevant columns. `drop` without `inplace` returns a copy, so the
    # caller's frame is not mutated and a second call does not hit a KeyError.
    df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

    # Handle missing numeric values. np.ravel flattens the imputer's
    # single-column 2-D result into the 1-D shape a column assignment expects.
    imputer_age = SimpleImputer(strategy="median")
    df["Age"] = np.ravel(imputer_age.fit_transform(df[["Age"]]))

    imputer_fare = SimpleImputer(strategy="mean")
    df["Fare"] = np.ravel(imputer_fare.fit_transform(df[["Fare"]]))

    # Handle missing ports: without this a NaN would reach the label encoder
    # mixed in with the string categories.
    embarked_mode = df["Embarked"].mode(dropna=True)
    if not embarked_mode.empty:
        df["Embarked"] = df["Embarked"].fillna(embarked_mode.iloc[0])

    # Encode categorical variables.
    # Sex gets its own throw-away encoder (Male=1, Female=0) so that the
    # global `label_encoder` is unambiguously the one fitted on Embarked.
    df["Sex"] = LabelEncoder().fit_transform(df["Sex"])
    label_encoder = LabelEncoder()
    df["Embarked"] = label_encoder.fit_transform(df["Embarked"])

    # Normalize numerical data.
    # NOTE(review): imputers and scaler are fitted on the whole frame passed
    # in; if the caller splits into train/test afterwards, test rows have
    # already influenced these statistics.
    scaler = StandardScaler()
    df[["Age", "Fare"]] = scaler.fit_transform(df[["Age", "Fare"]])

    return df

# Train model
def train_model(X_train, y_train, n_estimators=100, random_state=42):
    """Fit a random-forest classifier on the training split.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        n_estimators: Number of trees in the forest. Defaults to 100.
        random_state: Seed passed to the classifier so runs are repeatable.
            Defaults to 42.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    return model

# Evaluate model
def evaluate_model(model, X_test, y_test):
    """Print accuracy, precision, recall and F1 of ``model`` on a held-out split.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix of the evaluation split.
        y_test: True target values of the evaluation split.

    Returns:
        None. The four scores are written to stdout with four decimals.
    """
    predicted = model.predict(X_test)

    # All four scores are computed up front, in the order they are reported.
    accuracy, precision, recall, f1 = (
        metric(y_test, predicted)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    report = [
        "\nModel Performance:",
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
        f"F1 Score: {f1:.4f}",
    ]
    for line in report:
        print(line)

# Predict survival for a new passenger
def predict_survival(model, input_data):
    """Predict whether a single new passenger survives.

    Relies on the module-level ``label_encoder`` and ``scaler`` that
    ``preprocess_data`` sets, so it must be called after preprocessing.

    Args:
        model: Fitted classifier exposing ``predict``. If it also exposes
            ``feature_names_in_``, the input columns are reordered to match.
        input_data: Mapping with the raw passenger fields ``Pclass``, ``Sex``,
            ``Age``, ``SibSp``, ``Parch``, ``Fare`` and ``Embarked``, in any
            key order.

    Returns:
        ``"Survived"`` if the predicted class is 1, otherwise
        ``"Did Not Survive"``.

    Raises:
        KeyError: If a required field is missing from ``input_data``.
    """
    input_df = pd.DataFrame([input_data])  # Convert dictionary to DataFrame

    # Encode categorical variables (male -> 1, anything else -> 0).
    input_df["Sex"] = input_df["Sex"].apply(lambda x: 1 if x.lower() == "male" else 0)
    # Hand the encoder the 1-D column itself; wrapping the Series in a list
    # made the input 2-D and triggered a column_or_1d conversion warning.
    input_df["Embarked"] = label_encoder.transform(input_df["Embarked"])

    # Normalize numerical data with the statistics learned during training.
    input_df[["Age", "Fare"]] = scaler.transform(input_df[["Age", "Fare"]])

    # Align columns with the order the model was trained on, so the result
    # does not depend on the key order of `input_data`.
    feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is not None:
        input_df = input_df[list(feature_names)]

    # Predict survival
    prediction = model.predict(input_df)
    return "Survived" if prediction[0] == 1 else "Did Not Survive"


# Main execution
def main(file_path="tested.csv"):
    """Run the whole pipeline: load, preprocess, split, train, evaluate, predict.

    Args:
        file_path: CSV file to train and evaluate on. Defaults to
            ``"tested.csv"``, the file this code was originally hard-wired to.

    Returns:
        None. Metrics and the example prediction are printed to stdout.
    """
    # Load and preprocess data
    df = load_data(file_path)
    df = preprocess_data(df)

    # Split data into features and target variable
    X = df.drop(columns=["Survived"])
    y = df["Survived"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train and evaluate model
    model = train_model(X_train, y_train)
    evaluate_model(model, X_test, y_test)

    # Example prediction
    new_passenger = {
        "Pclass": 3,        # Ticket class (1st, 2nd, 3rd)
        "Sex": "female",    # Gender
        "Age": 28,          # Age
        "SibSp": 0,         # Number of siblings/spouses aboard
        "Parch": 0,         # Number of parents/children aboard
        "Fare": 7.25,       # Ticket fare
        "Embarked": "S"     # Port of Embarkation (C, Q, S)
    }

    # Predict survival for the new passenger
    result = predict_survival(model, new_passenger)
    print("\nPrediction for New Passenger:", result)

# Entry point: run the whole pipeline when this code is executed directly.
# In a notebook cell __name__ is "__main__" too, so executing the cell runs main().
if __name__ == "__main__":
    main()


Top five rows:
    PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S  
Last five rows:
      PassengerId  Survived  Pclass                   

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
