In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training data; the path is resolved relative to the notebook's working directory.
train = pd.read_csv(filepath_or_buffer="train.csv")

### Data description

| Feature  | Description                                | Notes                                          |
|----------|--------------------------------------------|------------------------------------------------|
| Survived | Survival                                   | 0 = No, 1 = Yes                                |
| Pclass   | Ticket class                               | 1 = 1st, 2 = 2nd, 3 = 3rd                      |
| Sex      | Sex                                        |                                                |
| Age      | Age in years                               |                                                |
| SibSp    | # of siblings / spouses aboard the Titanic |                                                |
| Parch    | # of parents / children aboard the Titanic |                                                |
| Ticket   | Ticket number                              |                                                |
| Fare     | Passenger fare                             |                                                |
| Cabin    | Cabin number                               |                                                |
| Embarked | Port of embarkation                        | C = Cherbourg, Q = Queenstown, S = Southampton |


In [None]:
# Preview the first five rows of the freshly loaded data.
train.head(n=5)

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
train.info()

In [None]:
# Number of missing values in each column (isnull is an alias of isna).
train.isnull().sum()

In [None]:
# Remove the columns that are not used as model features.
# Reassigning instead of inplace=True, together with errors="ignore", keeps this
# cell idempotent: re-running it after the columns are already gone no longer
# raises a KeyError.
train = train.drop(
    columns=["PassengerId", "Cabin", "Name", "Ticket"],
    errors="ignore",
)

In [None]:
# Check the remaining columns after the drop.
train.head(n=5)

In [None]:
# Frequency of each value in the 'Sex' column.
train.loc[:, 'Sex'].value_counts()

In [None]:
# Frequency of each value in the 'Embarked' column (missing values are not counted).
train.loc[:, 'Embarked'].value_counts()

In [None]:
# Impute missing ages with the column mean.
# NOTE(review): the mean is computed on the whole frame, i.e. before the
# train/test split further down, so held-out rows contribute to the fill value.
age_mean = train['Age'].mean()
train['Age'] = train['Age'].fillna(age_mean)

In [None]:
# Descriptive statistics for the numeric columns.
train.describe()

In [None]:
# Class balance of the target column.
train.loc[:, "Survived"].value_counts()

In [None]:
# Re-check dtypes and non-null counts after dropping columns and imputing 'Age'.
train.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Use the white-grid seaborn theme for the plots drawn from here on.
sns.set_style("whitegrid")

# Stacked histogram of Age, split by survival outcome, with KDE curves overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=train,
    x='Age',
    hue='Survived',
    multiple='stack',
    bins=20,
    kde=True,
    ax=ax,
)
ax.set(title='Survived vs Age', xlabel='Age', ylabel='Count')
plt.show()

In [None]:
# Stacked histogram of Fare, split by survival outcome, with KDE curves overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=train,
    x='Fare',
    hue='Survived',
    multiple='stack',
    bins=20,
    kde=True,
    ax=ax,
)
ax.set(title='Survived vs Fare', xlabel='Fare', ylabel='Count')
plt.show()

In [None]:
# Features treated as categorical when comparing against survival.
categorical_vars = ['Sex', 'Pclass', 'SibSp', 'Parch', 'Embarked']

# One grouped count plot per feature, with bars split by the Survived flag.
for var in categorical_vars:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=train, x=var, hue='Survived', ax=ax)
    ax.set(title=f'Survived vs {var}', xlabel=var, ylabel='Count')
    plt.show()

In [None]:
# Bar plot of Fare for each port of embarkation.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=train, x='Embarked', y='Fare', ax=ax)
ax.set(title='Fare vs Embarked', xlabel='Embarked', ylabel='Fare')
plt.show()

In [None]:
# Look at the frame once more before encoding the categorical columns.
train.head(n=5)

In [None]:
# Category frequencies of the two text columns before they are label-encoded.
for column in ('Sex', 'Embarked'):
    print(train[column].value_counts())

In [None]:
from sklearn.preprocessing import LabelEncoder

# Only 'Age' is imputed earlier in the notebook. If 'Embarked' still has missing
# values, LabelEncoder either raises on the mixed str/NaN column or silently
# turns NaN into its own class, depending on the scikit-learn version.
# Fill them with the most frequent port first.
# NOTE(review): assumes 'Embarked' has at most a few missing rows - confirm
# against the train.isna().sum() output.
if train['Embarked'].isna().any():
    train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode().iloc[0])

# Fit one encoder per column and keep each of them, so the category -> integer
# mapping (encoder.classes_) stays available for inverse transforms and for
# encoding new data. A single shared encoder would only remember the last column.
label_encoders = {}
for column in ('Sex', 'Embarked'):
    label_encoder = LabelEncoder()
    train[column] = label_encoder.fit_transform(train[column])
    label_encoders[column] = label_encoder

In [None]:
print(train['Sex'].value_counts())
print(train['Embarked'].value_counts())

In [None]:
# Confirm that 'Sex' and 'Embarked' now hold numeric codes.
train.head(n=5)

In [None]:
# Target vector: the survival flag.
y = train['Survived']
# Feature matrix: every remaining column except the target.
X = train.drop(columns=["Survived"])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for the final evaluation.
# random_state pins the shuffle so a Restart & Run All reproduces the same split
# (and therefore the same scores); stratify=y keeps the class proportions of the
# target the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Candidate estimators and the hyper-parameter grid to search for each one.
# Each entry maps a display name to:
#   'model'  - the unfitted estimator used as a template
#   'params' - the grid of values to try
model_params = {
    'logistic_regression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'C': [0.1, 0.5, 1, 5, 10]
        }
    },
    'decision_tree': {
        # Fixed seed: scikit-learn's tree builder permutes the features at each
        # split, so an unseeded tree can differ between runs even on identical data.
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [None, 5, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    }
}


In [None]:
from sklearn.model_selection import GridSearchCV

# Run a 5-fold grid search for every candidate and record the best
# cross-validated score together with the parameters that achieved it.
scores = []
for model_name, mp in model_params.items():
    clf = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        cv=5,
        return_train_score=False,
    )
    clf.fit(X_train, y_train)
    scores.append(dict(
        model=model_name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    ))

# One row per candidate model; displayed as the cell output.
best_model = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
best_model

In [None]:
from sklearn.base import clone
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score, classification_report

# Pick the grid-search winner: the row with the highest best_score.
best_model_row = best_model.loc[best_model['best_score'].idxmax()]
best_model_name = best_model_row['model']
best_model_params = best_model_row['best_params']

# Build the final estimator from a clone of the template rather than the template
# itself, so that set_params()/fit() do not mutate the object stored in
# model_params (which the grid-search cell reuses when it is re-run).
best_model_selected = clone(model_params[best_model_name]['model'])
best_model_selected.set_params(**best_model_params)

# Refit on the whole training split with the winning hyper-parameters.
best_model_selected.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred_test = best_model_selected.predict(X_test)

# Evaluate performance on the test set.
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)
report_test = classification_report(y_test, y_pred_test)
conf_matrix = confusion_matrix(y_test, y_pred_test)

# Print the scalar metrics, then the per-class report and the confusion matrix.
print("Test Set Accuracy:", accuracy_test)
print("Test Set Precision:", precision_test)
print("Test Set Recall:", recall_test)
print("Test Set F1-Score:", f1_test)
print("Classification Report for Test Set:")
print(report_test)
print("Confusion Matrix for Test Set:")
print(conf_matrix)

In [None]:
import pickle

# Persist the fitted best model (logistic regression or decision tree, whichever
# won the grid search) to 'model.pkl' in the working directory.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(best_model_selected, model_file)