In [119]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [120]:
# Load the train and test CSV splits (in that order) into DataFrames
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [121]:
# Report the dimensions of the training frame, then preview its leading rows
print("Shape of train_df:", train_df.shape)
print("First rows of train_df:")
train_df.head(n=5)

Shape of train_df: (891, 12)
First rows of train_df:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [122]:
# Report the dimensions of the test frame, then preview its leading rows
print("shape:", test_df.shape)
print("\nFirst rows of test_df:")
test_df.head(n=5)

shape: (418, 11)

First rows of test_df:


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [123]:
# Stack the training rows on top of the test rows and renumber the index 0..n-1,
# so both splits go through the same preprocessing steps
combine_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [124]:
# Tally the missing entries in every column of the combined frame
null_values = combine_df.isna().sum()

# Report only the columns that actually contain missing entries
print("Columns with null values:")
print(null_values.loc[null_values.gt(0)])

Columns with null values:
Survived     418
Age          263
Fare           1
Cabin       1014
Embarked       2
dtype: int64


In [125]:
# Drop 'Cabin' column (1014 of the 1309 combined rows have no value for it).
# Reassigning with errors="ignore" instead of dropping in place keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
combine_df = combine_df.drop(columns=["Cabin"], errors="ignore")

# Derive the fill values from the training rows only (the first len(train_df)
# rows of the combined frame) so that no statistic computed on the held-out
# test rows leaks into the features the models are trained on.
train_rows = combine_df.iloc[: len(train_df)]
age_fill = train_rows["Age"].median()
fare_fill = train_rows["Fare"].median()
embarked_fill = train_rows["Embarked"].mode()[0]  # most frequent port

# Fill missing values in other columns ('Survived' is left untouched: it is
# missing for the test rows by construction)
combine_df["Age"] = combine_df["Age"].fillna(age_fill)
combine_df["Fare"] = combine_df["Fare"].fillna(fare_fill)
combine_df["Embarked"] = combine_df["Embarked"].fillna(embarked_fill)

# Check for null values again to confirm
null_values = combine_df.isnull().sum()
print("Columns with null values after processing:")
print(null_values[null_values > 0])

Columns with null values after processing:
Survived    418
dtype: int64


In [126]:
# Drop 'Ticket', 'Name', and 'PassengerId' columns, which are not used as
# model features. Reassigning with errors="ignore" (instead of inplace=True)
# keeps the cell idempotent: re-running it after the columns are already gone
# is a no-op rather than a KeyError.
combine_df = combine_df.drop(
    columns=["Ticket", "Name", "PassengerId"], errors="ignore"
)

In [127]:
# Replace the string categories in 'Sex' and 'Embarked' with integer codes.
# A single LabelEncoder instance is re-fitted for each column in turn, so after
# the loop it only holds the mapping of the last column ('Embarked').
label_encoder = LabelEncoder()

for categorical_col in ("Sex", "Embarked"):
    combine_df[categorical_col] = label_encoder.fit_transform(
        combine_df[categorical_col]
    )

In [128]:
# Preview the combined frame now that 'Sex' and 'Embarked' hold integer codes
print("\nEncoded DataFrame:")
combine_df.head(n=5)


Encoded DataFrame:


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0.0,3,1,22.0,1,0,7.25,2
1,1.0,1,0,38.0,1,0,71.2833,0
2,1.0,3,0,26.0,0,0,7.925,2
3,1.0,1,0,35.0,1,0,53.1,2
4,0.0,3,1,35.0,0,0,8.05,2


In [129]:
# Undo the earlier concatenation: the first len(train_df) rows are the
# training rows, everything after them is the test set
n_train_rows = len(train_df)
train_data = combine_df.iloc[:n_train_rows]
test_data = combine_df.iloc[n_train_rows:]

# Target (y) is the 'Survived' column; features (X) are all remaining columns
y = train_data["Survived"]
X = train_data.drop(columns=["Survived"])

In [130]:
# Hold out 25% of the labelled rows; the fixed seed makes the split repeatable.
# Note: X_test / y_test are not referenced again in the cells shown below.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [131]:
# Define classifiers
# Every estimator below that exposes a `random_state` for a stochastic step
# (data shuffling, bootstrap sampling, random feature selection) is given a
# fixed seed, so the cross-validation scores and the resulting model choice
# are reproducible across kernel restarts. The value matches the seed already
# used for the train/validation split. KNeighborsClassifier and GaussianNB take
# no seed; SVC only uses one for probability estimates, which are not enabled.
RANDOM_STATE = 42

classifiers = {
    "Logistic Regression": LogisticRegression(C=1, max_iter=100, penalty='l2', solver='liblinear', random_state=RANDOM_STATE),
    "k-Nearest Neighbors": KNeighborsClassifier(algorithm='ball_tree', n_neighbors=5, weights='distance'),
    "Support Vector Machine": SVC(C=1, gamma="scale", kernel="linear"),
    "Decision Tree": DecisionTreeClassifier(max_depth=20, max_features='sqrt', min_samples_leaf=1, min_samples_split=10, random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=5, random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=150, learning_rate=0.1, max_depth=3, random_state=RANDOM_STATE),
    "Naive Bayes": GaussianNB(),
    "Perceptron": Perceptron(alpha=0.0001, random_state=RANDOM_STATE)
}

In [132]:
# Score every candidate with 5-fold cross-validated accuracy on the training
# split and remember each model's mean score under its display name
classifier_scores = {}
for clf_name in classifiers:
    scores = cross_val_score(
        estimator=classifiers[clf_name],
        X=X_train,
        y=y_train,
        scoring="accuracy",
        cv=5,
    )
    print(f"{clf_name} Cross-Validation Accuracy: {scores.mean()}")
    classifier_scores[clf_name] = scores.mean()

Logistic Regression Cross-Validation Accuracy: 0.8008641005498822
k-Nearest Neighbors Cross-Validation Accuracy: 0.7261025698574796
Support Vector Machine Cross-Validation Accuracy: 0.7859499495006171
Decision Tree Cross-Validation Accuracy: 0.7918078779037144
Random Forest Cross-Validation Accuracy: 0.824812030075188
Gradient Boosting Cross-Validation Accuracy: 0.8203680843900797
Naive Bayes Cross-Validation Accuracy: 0.7978341375827629
Perceptron Cross-Validation Accuracy: 0.6992032319604983


In [133]:
# Choose the best-performing classifier.
# Reuse the mean accuracies already stored in `classifier_scores` by the
# evaluation cell instead of re-running 5-fold cross-validation for every
# model: this avoids repeating the most expensive step of the notebook and
# guarantees the selected model is the one with the highest *reported* score.
# On ties, the first name in insertion order wins.
best_classifier_name = max(classifier_scores, key=classifier_scores.get)
best_classifier = classifiers[best_classifier_name]

In [97]:
# Refit the selected model on all labelled rows (X, y), not just X_train,
# before predicting on the unlabelled test rows
best_classifier.fit(X=X, y=y)

In [108]:
# Preview the leading rows of the preprocessed test features
test_data.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
891,3,1,34.5,0,0,7.8292,1
892,3,0,47.0,1,0,7.0,2
893,2,1,62.0,0,0,9.6875,1
894,3,1,27.0,0,0,8.6625,2
895,3,0,22.0,1,1,12.2875,2


In [None]:
# Prepare test data for prediction: remove the (all-missing) 'Survived' target
# column so only feature columns are passed to the model.
# errors="ignore" makes this self-reassigning cell safe to execute more than
# once; without it a second run raises KeyError because the column is already
# gone.
test_data = test_data.drop(columns=["Survived"], errors="ignore")

In [115]:
# Run the fitted model over the test features; the labels come back as floats
# (0. / 1.) because the training target was stored as a float column
pred = best_classifier.predict(X=test_data)
pred

array([0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.,
       0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 1., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

In [111]:
# Load the submission template; the preview below shows it carries the
# 'PassengerId' and 'Survived' columns
sub = pd.read_csv(filepath_or_buffer="gender_submission.csv")

In [112]:
# Update 'Survived' column with predictions.
# The assignment below is purely positional, so first verify that the template
# has one row per prediction and that its PassengerId order is the same as in
# test_df (the frame the predictions were generated from). Otherwise the labels
# would silently be attached to the wrong passengers.
if len(sub) != len(pred):
    raise ValueError(
        f"Submission template has {len(sub)} rows but there are {len(pred)} predictions"
    )
if not (sub["PassengerId"].to_numpy() == test_df["PassengerId"].to_numpy()).all():
    raise ValueError(
        "PassengerId order in gender_submission.csv does not match test.csv"
    )

# Cast the float labels (0. / 1.) to the integer labels shown in the template
sub["Survived"] = pred.astype(int)

In [113]:
# Preview the submission frame after the predictions were written into it
sub.head(n=5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [114]:
# Write the submission frame to disk, leaving out the DataFrame index column
sub.to_csv(path_or_buf="submission.csv", index=False)

In [135]:
# Read the saved submission file back from disk for inspection
subs = pd.read_csv(filepath_or_buffer="submission.csv")

In [137]:
# Preview the leading rows of the re-loaded submission file
subs.head(n=5)

Unnamed: 0,PassengerId,Survived,Survived-org
0,892,0,0
1,893,0,1
2,894,0,0
3,895,0,0
4,896,0,1


In [138]:
# Calculate accuracy: the share of rows whose predicted 'Survived' label equals
# the reference label stored in 'Survived-org'.
# NOTE(review): no cell in this notebook writes a 'Survived-org' column; the
# submission saved above only contains 'PassengerId' and 'Survived'. The column
# must therefore have been added to submission.csv outside the notebook, and a
# fresh Restart & Run All will not have it. The five previewed 'Survived-org'
# values also coincide with the passengers' sex (female -> 1, male -> 0), which
# suggests they may be the gender_submission.csv baseline rather than ground
# truth -- confirm the provenance before reporting this number as accuracy.
if "Survived-org" not in subs.columns:
    raise KeyError(
        "'Survived-org' column not found in submission.csv; add the reference "
        "labels to the file before computing accuracy"
    )

accuracy = (subs["Survived"] == subs["Survived-org"]).mean()
print("Accuracy:", accuracy)

Accuracy: 0.8708133971291866
