<a href="https://colab.research.google.com/github/Mukeshreddy3699/INFO5810/blob/main/extracredit2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the Titanic passenger spreadsheet into a DataFrame.
df = pd.read_excel("/content/TitanicPassengers.xlsx")

# Relative frequency of each value in the Survived column (fractions, not counts).
survival_share = df["Survived"].value_counts(normalize=True)
print("Proportion survived:\n", survival_share)

# Smallest and largest values recorded in the Age column.
ages = df["Age"]
youngest, oldest = ages.min(), ages.max()
print("Age range:", youngest, "-", oldest)

Proportion survived:
 Survived
0    0.618029
1    0.381971
Name: proportion, dtype: float64
Age range: 0.1667 - 80.0


In [3]:
# Clean the column names first: strip leading/trailing spaces so the lookup
# below cannot fail with a KeyError on a header that only differs by whitespace.
df.columns = df.columns.str.strip()

# Check all column names
print(df.columns)

# mode() returns every value tied for most frequent; take the first one.
# If the column holds no values at all the result is empty, so report None
# instead of raising on the missing first element.
home_modes = df['Home / Destination'].mode()
most_common_home = home_modes.iloc[0] if not home_modes.empty else None
print("Most common home destination:", most_common_home)



Index(['Passenger Class', 'Survived', 'Name', 'Sex', 'Age',
       'Siblings and Spouses', 'Parents and Children', 'Ticket #', 'Fare',
       'Cabin', 'Port', 'Lifeboat', 'Body', 'Home / Destination',
       'Midpoint age'],
      dtype='object')
Most common home destination: New York, NY
Most common home destination: New York, NY


In [9]:
# Work on a copy so the loaded frame `df` keeps its original values.
df_encoded = df.copy()

# Encode categorical variables so every feature column is numeric.
# Category codes follow the sorted order of the labels, so a two-label column
# such as female/male becomes 0/1 -- the same coding used for the hand-built
# passengers below.  A missing label is coded as -1.
df_encoded['Sex'] = df_encoded['Sex'].astype('category').cat.codes
df_encoded['Passenger Class'] = df_encoded['Passenger Class'].astype(int)

# Features for modeling
X = df_encoded[['Age', 'Passenger Class', 'Sex']]

# Example passengers, laid out with the same columns (and Sex coding) as X
female_baby = pd.DataFrame({'Age':[5], 'Passenger Class':[1], 'Sex':[0]})
male_40 = pd.DataFrame({'Age':[40], 'Passenger Class':[3], 'Sex':[1]})


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Keep only the rows where every modelling column is present.  This is done
# BEFORE any encoding so that missing values never reach the label encoder or
# the integer cast (which raises on NaN).  `.copy()` leaves `df` untouched.
model_columns = ['Age', 'Sex', 'Passenger Class', 'Survived']
df_encoded = df.dropna(subset=model_columns).copy()

# Encode categorical variables
# The encoder is kept by name so `sex_encoder.classes_` can be inspected to
# confirm which label received which code.
sex_encoder = LabelEncoder()
df_encoded['Sex'] = sex_encoder.fit_transform(df_encoded['Sex'])  # labels coded in sorted order; expected female=0, male=1
df_encoded['Passenger Class'] = df_encoded['Passenger Class'].astype(int)

# Features and target
X = df_encoded[['Age', 'Passenger Class', 'Sex']]
y = df_encoded['Survived']

# Split into train/test (20% held out; fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_test)

# Metrics on the held-out rows
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")


Accuracy: 0.74, Precision: 0.71, Recall: 0.66, F1: 0.68


In [6]:
# Two hypothetical passengers, each a one-row frame with the same three
# feature columns the model was fitted on.  Sex is given as a numeric code.
female_baby = pd.DataFrame({'Age': [5], 'Passenger Class': [1], 'Sex': [0]})   # young girl, first class
male_40 = pd.DataFrame({'Age': [40], 'Passenger Class': [3], 'Sex': [1]})      # 40-year-old man, third class

# predict_proba gives one row per passenger; the second column is read as
# the probability of survival.
baby_probabilities = log_model.predict_proba(female_baby)
man_probabilities = log_model.predict_proba(male_40)
chance_female_baby = baby_probabilities[0][1]
chance_male_40 = man_probabilities[0][1]

print("Chance female baby first-class survived:", round(chance_female_baby, 2))
print("Chance 40-year-old male third-class survived:", round(chance_male_40, 2))


Chance female baby first-class survived: 0.97
Chance 40-year-old male third-class survived: 0.06


In [7]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Fit a decision tree (default settings, fixed seed) on the same training
# split, then predict the held-out rows.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_tree = dt_model.predict(X_test)

# Score the held-out predictions with each metric in turn.
accuracy_tree, precision_tree, recall_tree, f1_tree = (
    scorer(y_test, y_pred_tree)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Decision Tree - Accuracy: {accuracy_tree:.2f}, Precision: {precision_tree:.2f}, Recall: {recall_tree:.2f}, F1: {f1_tree:.2f}")

# Importance of each feature in the fitted tree, largest first.
importance = pd.Series(
    dt_model.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)
print("Feature importance:\n", importance)



Decision Tree - Accuracy: 0.71, Precision: 0.69, Recall: 0.59, F1: 0.63
Feature importance:
 Sex                0.481878
Age                0.353632
Passenger Class    0.164489
dtype: float64


In [8]:
# Rank the fitted tree's feature importances from largest to smallest,
# labelled with the feature column names.
importance = pd.Series(
    dt_model.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)
print("Feature importance:\n", importance)

# Name of the feature with the highest importance score.
most_important_var = importance.idxmax()
print("The variable the most predicting is:", most_important_var)


Feature importance:
 Sex                0.481878
Age                0.353632
Passenger Class    0.164489
dtype: float64
The variable the most predicting is: Sex
