# TITANIC SURVIVAL PREDICTION PROJECT

In [None]:
!conda install conda-forge::streamlit -y
conda install anaconda::joblib -y

## Import libraries

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import gaussian_kde
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib
import streamlit as st


## 1. Data Loading & Exploration

In [None]:
# Kaggle-schema copy of the Titanic passenger list. The cleaning and
# preprocessing cells below read 'Embarked', 'Cabin', 'SibSp' and 'Parch',
# so the source file has to provide those columns.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Fail here, with a readable message, rather than with a bare KeyError
# several cells later if the downloaded file has a different schema.
required_columns = {'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(f"Dataset at {url} is missing expected columns: {sorted(missing_columns)}")

print("=== Initial Exploration ===")
print("Shape:", df.shape)
print("\nFirst 5 rows:")
display(df.head())
print("\nInfo:")
df.info()
print("\nSummary stats:")
display(df.describe(include='all'))


## 2. Univariate & Multivariate Analysis

In [None]:
# Build a 2x2 figure: survival counts, age distribution, fare spread, correlations
panel_titles = ('Survival Distribution', 'Age Distribution', 'Fare Distribution', 'Correlation Matrix')
fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles)

# Top-left: number of passengers per outcome (0 = did not survive, 1 = survived)
survived_counts = df['Survived'].value_counts()
outcome_labels = ['Did not survive', 'Survived']
outcome_totals = [survived_counts.get(outcome_code, 0) for outcome_code in (0, 1)]
fig.add_trace(go.Bar(x=outcome_labels, y=outcome_totals, name='Survived'), row=1, col=1)

# Top-right: density-normalised age histogram with a Gaussian KDE curve on top.
# Missing ages are dropped first because the KDE is fitted on the same values.
age = df['Age'].dropna()
kde = gaussian_kde(age)
x = np.linspace(age.min(), age.max(), 100)
kde_values = kde(x)
age_histogram = go.Histogram(x=age, nbinsx=30, histnorm='probability density', name='Age')
age_density_curve = go.Scatter(x=x, y=kde_values, mode='lines', name='KDE')
for age_trace in (age_histogram, age_density_curve):
    fig.add_trace(age_trace, row=1, col=2)

# Bottom-left: box plot of fares (values passed on the x axis)
fig.add_trace(go.Box(x=df['Fare'], name='Fare'), row=2, col=1)

# Bottom-right: pairwise correlations between the numeric columns only
corr_matrix = df.select_dtypes(include=[np.number]).corr()
corr_labels = corr_matrix.columns
fig.add_trace(
    go.Heatmap(z=corr_matrix.values, x=corr_labels, y=corr_labels, colorscale='RdBu'),
    row=2, col=2
)

# Size the figure, add the overall title, and render it
fig.update_layout(height=800, width=800, title_text="Univariate and Multivariate Analysis")
fig.show()

## 3. Data Assessment

In [None]:
# Summarise data-quality problems: per-column missing counts (only columns
# that have any) and the number of fully duplicated rows.
missing_per_column = df.isnull().sum()
duplicate_row_count = df.duplicated().sum()

print("\n=== Data Quality Issues ===")
print("Missing values:")
print(missing_per_column[missing_per_column > 0])
print("\nDuplicate rows:", duplicate_row_count)

## 4. Data Cleaning

In [None]:
# Handle missing values.
# The filled columns are assigned back explicitly: calling fillna(inplace=True)
# on a column selection is chained assignment, which newer pandas versions
# warn about and which does not update `df` when Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df = df.drop(columns='Cabin')

# Handle outliers: keep only fares inside the 1.5 * IQR fences (bounds inclusive)
Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)
IQR = Q3 - Q1
fare_within_fences = df['Fare'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
# .copy() makes the filtered frame independent, so later column assignments
# do not operate on a slice of the unfiltered data.
df = df[fare_within_fences].copy()

# Drop duplicates
df = df.drop_duplicates()

print("\nAfter cleaning:")
print("Missing values remaining:", df.isnull().sum().sum())
print("New shape:", df.shape)

## 5. Post-Cleaning Visualization

In [None]:
# Two side-by-side panels: correlations after cleaning, and outcome counts per class
fig2 = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Updated Correlation Matrix', 'Survival by Passenger Class')
)

# Left panel: correlation heatmap over the numeric columns of the cleaned frame
corr_matrix_clean = df.select_dtypes(include=[np.number]).corr()
axis_labels = corr_matrix_clean.columns
fig2.add_trace(
    go.Heatmap(z=corr_matrix_clean.values, x=axis_labels, y=axis_labels, colorscale='viridis'),
    row=1, col=1
)

# Right panel: one bar series per outcome, indexed by passenger class.
# unstack() turns the outcome codes into columns; absent combinations become 0.
survival_by_pclass = df.groupby('Pclass')['Survived'].value_counts().unstack().fillna(0)
for outcome_code, outcome_label in ((0, 'Did not survive'), (1, 'Survived')):
    fig2.add_trace(
        go.Bar(x=survival_by_pclass.index, y=survival_by_pclass[outcome_code], name=outcome_label),
        row=1, col=2
    )

# Place the two bar series next to each other, then render
fig2.update_layout(barmode='group', height=400, width=800, title_text="Post-Cleaning Visualizations")
fig2.show()

## 6. Data Preprocessing

In [None]:
# Feature engineering: number of relatives aboard (siblings/spouses + parents/children).
# assign() returns a new frame, so this never writes into a slice of an
# earlier DataFrame (which would raise SettingWithCopyWarning).
df = df.assign(FamilySize=df['SibSp'] + df['Parch'])

# Define features and target
X = df[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize']]
y = df['Survived']

# Preprocessing pipeline: standardise the numeric columns, one-hot encode the
# categorical ones. handle_unknown='ignore' encodes a category that was not
# present when fitting as all zeros instead of raising; the fitted
# preprocessor is saved and reused on new inputs later in the notebook.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Age', 'Fare', 'FamilySize']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Pclass', 'Sex', 'Embarked'])
    ])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 7. Model Training

In [None]:
# Fit the preprocessor on the training split only, then train the classifier
# on the transformed features.
X_train_prepared = preprocessor.fit_transform(X_train)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_prepared, y_train)

## 8. Model Evaluation

In [None]:
# Transform the held-out rows with the already-fitted preprocessor and score them
X_test_prepared = preprocessor.transform(X_test)
y_pred = model.predict(X_test_prepared)
test_accuracy = accuracy_score(y_test, y_pred)

print("\n=== Model Performance ===")
print(f"Accuracy: {test_accuracy:.2f}")

# Each report is printed as a heading line followed by its body
for report_heading, report_body in (
    ("\nConfusion Matrix:", confusion_matrix(y_test, y_pred)),
    ("\nClassification Report:", classification_report(y_test, y_pred)),
):
    print(report_heading)
    print(report_body)

## 9. Model Saving

In [None]:
# Persist the fitted classifier and the fitted preprocessor as separate files;
# the Streamlit app cell loads both by these exact names.
model_path = 'titanic_model.pkl'
preprocessor_path = 'preprocessor.pkl'
joblib.dump(model, model_path)
joblib.dump(preprocessor, preprocessor_path)

## 10. Streamlit Deployment

In [None]:
%%writefile st_app.py


import streamlit as st
import joblib
import pandas as pd

# Load artifacts
model = joblib.load('titanic_model.pkl')
preprocessor = joblib.load('preprocessor.pkl')

# App interface
st.title('Titanic Survival Predictor')

# Input widgets
col1, col2 = st.columns(2)
with col1:
    pclass = st.selectbox('Passenger Class', [1, 2, 3])
    sex = st.selectbox('Sex', ['male', 'female'])
    age = st.number_input('Age', min_value=0, max_value=100, value=30)
    
with col2:
    fare = st.number_input('Fare', min_value=0, value=50)
    embarked = st.selectbox('Embarked', ['C', 'Q', 'S'])
    family_size = st.number_input('Family Size', min_value=0, max_value=10, value=0)

# Prediction logic
if st.button('Predict Survival'):
    input_data = pd.DataFrame([[pclass, sex, age, fare, embarked, family_size]],
                            columns=['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize'])
    
    processed_data = preprocessor.transform(input_data)
    prediction = model.predict(processed_data)[0]
    probability = model.predict_proba(processed_data)[0][1]
    
    st.subheader('Result')
    st.metric("Survival Probability", f"{probability:.1%}")
    st.write(f"Prediction: {'Survived' if prediction == 1 else 'Did not survive'}")


# print("\n=== Streamlit App Code ===")
# print("Save this as 'app.py' and run with: streamlit run app.py")
# print(streamlit_code)