In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix

# Step 2: Load Dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Step 3: Data Cleaning
df.drop(['Cabin', 'Ticket', 'Name', 'PassengerId'], axis=1, inplace=True)
df['Age'].fillna(df['Age'].median(), inplace=True)
df.dropna(subset=['Embarked'], inplace=True)

# Step 4: Feature Engineering
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
le_sex = LabelEncoder()
le_embarked = LabelEncoder()
df['Sex'] = le_sex.fit_transform(df['Sex'])       # male=1, female=0
df['Embarked'] = le_embarked.fit_transform(df['Embarked'])  # S=2, C=0, Q=1

# Step 5: Define Features and Target
X = df.drop('Survived', axis=1)
y = df['Survived']

# Step 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
y_pred_lr_binary = [1 if prob >= 0.5 else 0 for prob in y_pred_lr]

print("Linear Regression Evaluation")
print("Mean Squared Error:", mean_squared_error(y_test, y_pred_lr))
print("R² Score:", r2_score(y_test, y_pred_lr))
print("Accuracy:", accuracy_score(y_test, y_pred_lr_binary))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr_binary))

# Step 8: Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

print("\n Decision Tree Evaluation")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))


Linear Regression Evaluation
Mean Squared Error: 0.1461031869648124
R² Score: 0.3845056008784582
Accuracy: 0.7808988764044944
Confusion Matrix:
 [[87 22]
 [17 52]]

 Decision Tree Evaluation
Accuracy: 0.7640449438202247
Confusion Matrix:
 [[83 26]
 [16 53]]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
