In [193]:
import pandas as pd

data = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
test_ids = test["PassengerId"]

def clean(data):
    data = data.drop(["Ticket", "Cabin", "Name", "PassengerId"], axis=1)

    cols = ["Fare", "Parch", "SibSp", "Age", "Pclass"]

    for col in cols:
        data[col] = data[col].fillna(data[col].median())  

    data["Embarked"] = data["Embarked"].fillna("U")      

    return data

data = clean(data)
test = clean(test)


In [194]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [195]:
from sklearn import preprocessing

lable = preprocessing.LabelEncoder()

cols = ["Sex" , "Embarked"]

for col in cols:
    data[col] = lable.fit_transform(data[col])
    test[col] = lable.transform(test[col])
    print(lable.classes_) 

['female' 'male']
['C' 'Q' 'S' 'U']


In [196]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = data.drop("Survived" , axis=1)
y = data["Survived"]

X_train, X_val , y_train , y_val = train_test_split(X , y , test_size=0.2 , random_state=46)

In [197]:
clf = LogisticRegression(random_state=0 , max_iter=1000).fit(X_train , y_train)
rf = RandomForestClassifier(n_estimators=100, random_state=46)


In [198]:
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)

In [199]:
prediction = clf.predict(X_val)
from sklearn.metrics import accuracy_score

accuracy_score(y_val , prediction)

0.8435754189944135

In [200]:
print("Random Forest Accuracy:", accuracy_score(y_val, rf_pred))


Random Forest Accuracy: 0.8268156424581006


In [201]:
submission_pred = clf.predict(test)
submission_pred2 = rf.predict(test)

In [202]:
df = pd.DataFrame({"PassengerId" : test_ids.values,
                   "Survived" : submission_pred})
df.to_csv("LogisticRegressionSubmission.csv" , index=False)

In [203]:
df = pd.DataFrame({"PassengerId": test_ids.values,
                   "Survived": submission_pred})
df.to_csv("RandomForestSubmission.csv", index=False)

In [204]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(
    max_depth=3,        # Control tree depth to prevent overfitting
    min_samples_split=5, # Minimum samples required to split a node
    min_samples_leaf=2,  # Minimum samples required at each leaf node
    random_state=46
)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_val)
print("Decision Tree Accuracy:", accuracy_score(y_val, dt_pred))
print("\nFeature Importances:")
for feature, importance in zip(X.columns, dt.feature_importances_):
    print(f"{feature}: {importance:.4f}")

# Make submission
submission_pred = dt.predict(test)

df = pd.DataFrame({"PassengerId": test_ids.values,
                   "Survived": submission_pred})
df.to_csv("DecisionTreeSubmission.csv", index=False)

Decision Tree Accuracy: 0.8491620111731844

Feature Importances:
Pclass: 0.1544
Sex: 0.6037
Age: 0.0605
SibSp: 0.0760
Parch: 0.0000
Fare: 0.1053
Embarked: 0.0000


In [205]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

data['FamilySize'] = data['SibSp'] + data['Parch']
test['FamilySize'] = test['SibSp'] + test['Parch']
X = data.drop("Survived", axis=1)
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test)

y = data["Survived"]

# Split data
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=46
)

# KNN Classifier
knn = KNeighborsClassifier(
    n_neighbors=5,      # Number of neighbors to consider
    weights='uniform',  # 'uniform' or 'distance'
    p=2                 # 2 for Euclidean distance, 1 for Manhattan
)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_val)
print("KNN Accuracy:", accuracy_score(y_val, knn_pred))

# Find optimal k (optional)
print("\nFinding optimal k:")
for k in range(1, 15):
    knn_temp = KNeighborsClassifier(n_neighbors=k)
    knn_temp.fit(X_train, y_train)
    acc = accuracy_score(y_val, knn_temp.predict(X_val))
    print(f"k={k}: Accuracy={acc:.4f}")

# Make submission with best k
best_k = 7  # Change this based on the above results
final_knn = KNeighborsClassifier(n_neighbors=best_k)
final_knn.fit(X_scaled, y)  # Train on full dataset
submission_pred = final_knn.predict(test_scaled)

df = pd.DataFrame({"PassengerId": test_ids.values,
                   "Survived": submission_pred})
df.to_csv("KNN_Submission.csv", index=False)

KNN Accuracy: 0.8547486033519553

Finding optimal k:
k=1: Accuracy=0.7877
k=2: Accuracy=0.7821
k=3: Accuracy=0.8324
k=4: Accuracy=0.8212
k=5: Accuracy=0.8547
k=6: Accuracy=0.8324
k=7: Accuracy=0.8603
k=8: Accuracy=0.8603
k=9: Accuracy=0.8547
k=10: Accuracy=0.8492
k=11: Accuracy=0.8436
k=12: Accuracy=0.8492
k=13: Accuracy=0.8380
k=14: Accuracy=0.8492
