In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report


In [3]:
# Load the dataset
@st.cache
def load_data():
    data = pd.read_csv("titanic.csv")
    return data

data = load_data()
st.title("Titanic Dataset Analysis")
st.subheader("Overview of the Dataset")
st.write(data.head())


InternalHashError: module '__main__' has no attribute '__file__'

While caching the body of `load_data()`, Streamlit encountered an
object of type `builtins.function`, which it does not know how to hash.

**In this specific case, it's very likely you found a Streamlit bug so please
[file a bug report here.]
(https://github.com/streamlit/streamlit/issues/new/choose)**

In the meantime, you can try bypassing this error by registering a custom
hash function via the `hash_funcs` keyword in @st.cache(). For example:

```
@st.cache(hash_funcs={builtins.function: my_hash_func})
def my_func(...):
    ...
```

If you don't know where the object of type `builtins.function` is coming
from, try looking at the hash chain below for an object that you do recognize,
then pass that to `hash_funcs` instead:

```
Object of type builtins.function: <function load_data at 0x0000022D51A18C20>
```

Please see the `hash_funcs` [documentation](https://docs.streamlit.io/library/advanced-features/caching#the-hash_funcs-parameter)
for more details.
            

In [None]:
st.subheader("Data Exploration")
st.write("### Basic Statistics")
st.write(data.describe())

st.write("### Missing Values")
missing_values = data.isnull().sum()
st.write(missing_values)

# Visualization: Heatmap of missing values
st.write("### Heatmap of Missing Values")
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(data.isnull(), cbar=False, cmap='viridis')
st.pyplot(fig)


In [None]:
st.subheader("Data Preparation")

# Fill missing age values with the median
data['Age'].fillna(data['Age'].median(), inplace=True)

# Drop rows with missing Embarked values
data.dropna(subset=['Embarked'], inplace=True)

# Encode categorical variables
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
data['Embarked'] = data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Selected features for analysis
features = ['Pclass', 'Age', 'Fare', 'Sex']
X = data[features]
st.write("### Prepared Data")
st.write(X.head())


In [None]:
st.subheader("K-means Clustering")

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
data['Cluster'] = kmeans.fit_predict(X_scaled)

# Visualization: Scatter plot of clusters
st.write("### Cluster Visualization")
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=data['Age'], y=data['Fare'], hue=data['Cluster'], palette='viridis')
plt.title("Clusters Based on Age and Fare")
st.pyplot(fig)


In [None]:
st.subheader("Linear Regression")

# Prepare data for regression
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Model accuracy
accuracy = model.score(X_test, y_test)
st.write(f"### Model Accuracy: {accuracy:.2f}")

# Predictions and classification report
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, output_dict=True)

# Display classification report
st.write("### Classification Report")
st.write(pd.DataFrame(report).transpose())

# Visualization: Feature importance
coefficients = pd.DataFrame({'Feature': features, 'Coefficient': model.coef_[0]})
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Coefficient', y='Feature', data=coefficients, orient='h')
plt.title("Feature Importance")
st.pyplot(fig)


In [None]:
st.subheader("Interactive Visualization")

# Age filter
min_age, max_age = st.slider("Select Age Range", int(data['Age'].min()), int(data['Age'].max()), (20, 50))
filtered_data = data[(data['Age'] >= min_age) & (data['Age'] <= max_age)]

# Visualization: Filtered data
st.write(f"### Passengers Aged Between {min_age} and {max_age}")
st.write(filtered_data[['Age', 'Fare', 'Pclass', 'Survived']].head())

# Scatter plot for filtered data
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=filtered_data['Age'], y=filtered_data['Fare'], hue=filtered_data['Survived'], palette='coolwarm')
plt.title(f"Passengers Aged Between {min_age} and {max_age}")
st.pyplot(fig)


In [None]:
st.subheader("Conclusions and Recommendations")
st.write("""
- **Cluster Insights**: Passengers were grouped into clusters based on age, fare, and class. The analysis showed clear groupings based on socio-economic status.
- **Survival Factors**: Logistic regression revealed that women and passengers in higher classes had significantly better chances of survival.
- **Actionable Insights**:
  - Future evacuation protocols could prioritize high-risk groups such as older passengers and those in lower classes.
  - Highlight the importance of socio-economic disparities in survival rates.
""")


In [None]:
streamlit run app.py
