In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('titanic.csv')

In [5]:
print("First 5 rows of the dataset:")
print(df.head())

print("\nMissing values before cleaning:")
print(df.isnull().sum())

First 5 rows of the dataset:
   Passengerid   Age     Fare  Sex  sibsp  zero  zero.1  zero.2  zero.3  \
0            1  22.0   7.2500    0      1     0       0       0       0   
1            2  38.0  71.2833    1      1     0       0       0       0   
2            3  26.0   7.9250    1      0     0       0       0       0   
3            4  35.0  53.1000    1      1     0       0       0       0   
4            5  35.0   8.0500    0      0     0       0       0       0   

   zero.4  ...  zero.12  zero.13  zero.14  Pclass  zero.15  zero.16  Embarked  \
0       0  ...        0        0        0       3        0        0       2.0   
1       0  ...        0        0        0       1        0        0       0.0   
2       0  ...        0        0        0       3        0        0       2.0   
3       0  ...        0        0        0       1        0        0       2.0   
4       0  ...        0        0        0       3        0        0       2.0   

   zero.17  zero.18  2urvived  
0

In [6]:
if 'Age' in df.columns:
    df['Age'] = df['Age'].fillna(df['Age'].median())

In [7]:
if 'Embarked' in df.columns:
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [8]:
if 'Fare' in df.columns:
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

In [9]:
if 'Pclass' in df.columns:
    df['Pclass'] = df['Pclass'].fillna(df['Pclass'].mode()[0])

In [10]:
if 'SibSp' in df.columns:
    df['SibSp'] = df['SibSp'].fillna(0)

In [11]:
if 'Parch' in df.columns:
    df['Parch'] = df['Parch'].fillna(0)

In [12]:
if 'Sex' in df.columns and df['Sex'].dtype == 'int64':
    df['Sex'] = df['Sex'].map({0: 'male', 1: 'female'})

In [13]:
if 'Cabin' in df.columns:
    df['Cabin'] = df['Cabin'].fillna('Unknown')

In [14]:
print("\nMissing values after cleaning:")
print(df.isnull().sum())


Missing values after cleaning:
Passengerid    0
Age            0
Fare           0
Sex            0
sibsp          0
zero           0
zero.1         0
zero.2         0
zero.3         0
zero.4         0
zero.5         0
zero.6         0
Parch          0
zero.7         0
zero.8         0
zero.9         0
zero.10        0
zero.11        0
zero.12        0
zero.13        0
zero.14        0
Pclass         0
zero.15        0
zero.16        0
Embarked       0
zero.17        0
zero.18        0
2urvived       0
dtype: int64


In [15]:
df.to_csv('titanic_cleaned.csv', index=False)
print("\nCleaned dataset saved as 'titanic_cleaned.csv'")


Cleaned dataset saved as 'titanic_cleaned.csv'


In [16]:
print("\nFirst 5 rows of the cleaned dataset:")
print(df.head())


First 5 rows of the cleaned dataset:
   Passengerid   Age     Fare     Sex  sibsp  zero  zero.1  zero.2  zero.3  \
0            1  22.0   7.2500    male      1     0       0       0       0   
1            2  38.0  71.2833  female      1     0       0       0       0   
2            3  26.0   7.9250  female      0     0       0       0       0   
3            4  35.0  53.1000  female      1     0       0       0       0   
4            5  35.0   8.0500    male      0     0       0       0       0   

   zero.4  ...  zero.12  zero.13  zero.14  Pclass  zero.15  zero.16  Embarked  \
0       0  ...        0        0        0       3        0        0       2.0   
1       0  ...        0        0        0       1        0        0       0.0   
2       0  ...        0        0        0       3        0        0       2.0   
3       0  ...        0        0        0       1        0        0       2.0   
4       0  ...        0        0        0       3        0        0       2.0   

   zer

1) LOGISTIC REGRESSION

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [18]:
df = pd.read_csv('titanic_cleaned.csv')
print("First 5 rows of the cleaned dataset:")
print(df.head())

First 5 rows of the cleaned dataset:
   Passengerid   Age     Fare     Sex  sibsp  zero  zero.1  zero.2  zero.3  \
0            1  22.0   7.2500    male      1     0       0       0       0   
1            2  38.0  71.2833  female      1     0       0       0       0   
2            3  26.0   7.9250  female      0     0       0       0       0   
3            4  35.0  53.1000  female      1     0       0       0       0   
4            5  35.0   8.0500    male      0     0       0       0       0   

   zero.4  ...  zero.12  zero.13  zero.14  Pclass  zero.15  zero.16  Embarked  \
0       0  ...        0        0        0       3        0        0       2.0   
1       0  ...        0        0        0       1        0        0       0.0   
2       0  ...        0        0        0       3        0        0       2.0   
3       0  ...        0        0        0       1        0        0       2.0   
4       0  ...        0        0        0       3        0        0       2.0   

   zero

In [19]:
# Convert 'Sex' column to numeric
if 'Sex' in df.columns:
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Convert 'Embarked' column to numeric
if 'Embarked' in df.columns:
    df['Embarked'] = df['Embarked'].astype('category').cat.codes

In [20]:
X = df[['Age', 'Fare', 'Sex', 'sibsp', 'Parch', 'Pclass', 'Embarked']]
y = df['2urvived']  # Fixing 'Survived' column name

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [23]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [24]:
y_pred = model.predict(X_test)

# Print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.77

Confusion Matrix:
[[174  15]
 [ 46  27]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.92      0.85       189
           1       0.64      0.37      0.47        73

    accuracy                           0.77       262
   macro avg       0.72      0.65      0.66       262
weighted avg       0.75      0.77      0.74       262



In [25]:
# Example input: Age=30, Fare=50, Sex=Male (0), SibSp=1, Parch=0, Pclass=3, Embarked=2
new_passenger = np.array([[30, 50, 0, 1, 0, 3, 2]])

# Scale the input
new_passenger = scaler.transform(new_passenger)

# Predict survival
prediction = model.predict(new_passenger)
print("\nPrediction (1=Survived, 0=Not Survived):", prediction[0])


Prediction (1=Survived, 0=Not Survived): 0




2) RANDOM FOREST

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [27]:
# Load the cleaned dataset (Ensure 'titanic_cleaned.csv' is in your working directory)
df = pd.read_csv('titanic_cleaned.csv')

In [28]:
# Rename '2urvived' to 'Survived' if necessary
df.rename(columns={'2urvived': 'Survived'}, inplace=True)

In [29]:
# Select features and target variable
X = df[['Age', 'Fare', 'Sex', 'sibsp', 'Parch', 'Pclass', 'Embarked']]
y = df['Survived']

In [30]:
# Make a copy to avoid modifying a slice of the original DataFrame
X = X.copy()

# Convert 'Sex' column to numerical (male = 0, female = 1)
X.loc[:, 'Sex'] = X['Sex'].map({'male': 0, 'female': 1})

# Fill missing values in 'Embarked' with the mode (most frequent value)
X.loc[:, 'Embarked'] = X['Embarked'].fillna(X['Embarked'].mode()[0])

# Convert 'Embarked' to integer type
X.loc[:, 'Embarked'] = X['Embarked'].astype(int)

In [31]:
# Split into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [33]:
# Predict on the test set
y_pred = model.predict(X_test)

# Calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Model Accuracy: {accuracy:.4f}')

Random Forest Model Accuracy: 0.7099


In [34]:
# Example input: Age=25, Fare=$50, Female, 1 sibling/spouse, 0 parents/children, Pclass=3, Embarked=2
sample_input = np.array([[25, 50, 1, 1, 0, 3, 2]])
sample_prediction = model.predict(sample_input)

# Display result
print(f'Prediction for sample input: {"Survived" if sample_prediction[0] == 1 else "Not Survived"}')

Prediction for sample input: Not Survived




3) K-Nearest Neighbors (KNN)

In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer  # For handling missing values

# Load the Titanic dataset
df = pd.read_csv('titanic_cleaned.csv')

# Step 1: Rename '2urvived' to 'Survived'
df.rename(columns={'2urvived': 'Survived'}, inplace=True)

# Step 2: Drop all 'zero' columns
columns_to_drop = [col for col in df.columns if 'zero' in col]
df.drop(columns=columns_to_drop, inplace=True)

# Step 3: Strip any whitespace characters from column names
df.columns = df.columns.str.strip()

# Step 4: Handle missing values
# Fill missing values for 'Sex' column with its mode (most frequent value)
df['Sex'] = df['Sex'].fillna(df['Sex'].mode()[0])

# Fill missing values for 'Embarked' column with its mode (most frequent value)
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Fill missing values for 'Age' column with its median
df['Age'] = df['Age'].fillna(df['Age'].median())

# Fill missing values for 'Fare' column with its median
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# Step 5: Convert categorical columns to numeric (if not done already)
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Step 6: Select features and target variable
X = df[['Age', 'Fare', 'Sex', 'sibsp', 'Pclass', 'Embarked']]  # Independent variables
y = df['Survived']  # Target variable

# Step 7: Check for columns with all missing values
columns_with_all_missing = X.columns[X.isnull().all()].tolist()
print("Columns with all missing values:", columns_with_all_missing)

# Step 8: Handle columns with all missing values
# If a column has all missing values, fill it with a constant value (e.g., 0)
for col in columns_with_all_missing:
    X[col] = X[col].fillna(0)

# Step 10: Impute missing values in X (if any)
imputer = SimpleImputer(strategy='median')  # Use median imputation for numeric columns
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Step 12: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 13: Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 14: Initialize and train the KNN model
knn = KNeighborsClassifier(n_neighbors=5)  # You can experiment with different values for 'n_neighbors'
knn.fit(X_train_scaled, y_train)

# Step 15: Make predictions on the test set
y_pred = knn.predict(X_test_scaled)

# Step 16: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Columns with all missing values: ['Embarked']

Accuracy: 0.7557251908396947

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.84       189
           1       0.57      0.49      0.53        73

    accuracy                           0.76       262
   macro avg       0.69      0.68      0.68       262
weighted avg       0.75      0.76      0.75       262


Confusion Matrix:
[[162  27]
 [ 37  36]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna(0)


In [36]:
# Example input: Age=25, Fare=$50, Female, 1 sibling/spouse, 0 parents/children, Pclass=3, Embarked=2
sample_input = np.array([[25, 50, 1, 1, 0, 3, 2]])
sample_prediction = model.predict(sample_input)

# Display result
print(f'Prediction for sample input: {"Survived" if sample_prediction[0] == 1 else "Not Survived"}')

Prediction for sample input: Not Survived




In [37]:
import pandas as pd

# Accuracy results from the three models
accuracy_results = {
    'Algorithm': ['Logistic Regression', 'Random Forest', 'K-Nearest Neighbors (KNN)'],
    'Accuracy': [0.77, 0.7099, 0.7557]  # Replace with the actual accuracy values from your models
}

# Create a DataFrame to display the results
results_df = pd.DataFrame(accuracy_results)

# Print the table
print("Accuracy Results of Different Algorithms:")
print(results_df)

# Find the algorithm with the maximum accuracy
max_accuracy = results_df['Accuracy'].max()
best_algorithm = results_df.loc[results_df['Accuracy'] == max_accuracy, 'Algorithm'].values[0]

# Print the conclusion
print(f"\nConclusion: The algorithm with the highest accuracy is {best_algorithm} with an accuracy of {max_accuracy:.4f}.")

Accuracy Results of Different Algorithms:
                   Algorithm  Accuracy
0        Logistic Regression    0.7700
1              Random Forest    0.7099
2  K-Nearest Neighbors (KNN)    0.7557

Conclusion: The algorithm with the highest accuracy is Logistic Regression with an accuracy of 0.7700.
