In [1]:
import pandas as pd

In [2]:
# Load the Titanic training CSV and take a first look at its contents.
df = pd.read_csv("data/titanic/train.csv")

print(df.head())      # first 5 rows
print(df.shape)       # (rows, columns)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()             # column types + non-null counts
print(df.describe())  # numeric summary

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
(8

In [3]:
# Per-column count of missing entries in the freshly loaded frame.
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [4]:
# Remove the Cabin and Ticket columns, then list the column labels that remain.
df = df.drop(["Cabin", "Ticket"], axis=1)
print(df.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare', 'Embarked'],
      dtype='object')


In [5]:
# Fill missing Embarked values with the most frequent one.
# mode() returns a Series of the top value(s); its first entry is used.
embarked_mode = df["Embarked"].mode().iloc[0]
df = df.fillna({"Embarked": embarked_mode})
# Should print 0 once every gap has been filled.
print(df["Embarked"].isnull().sum())

0


In [6]:
# Fill missing ages with the median of the known ages.
age_median = df["Age"].median()
# Keep every known age and substitute the median wherever the value is missing.
df["Age"] = df["Age"].where(df["Age"].notna(), age_median)
# Should print 0 once every gap has been filled.
print(df["Age"].isnull().sum())

0


In [7]:
# Re-check the per-column missing counts after dropping and imputing.
print(df.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64


In [8]:
# Keep only rows with a strictly positive Fare and report how many rows
# were dropped by the filter.
before_rows = len(df)
# .copy() makes the filtered frame independent of the frame it was selected
# from, so the column assignments in later cells can never be flagged as
# writes to a slice (SettingWithCopyWarning), regardless of kernel state.
df = df[df["Fare"] > 0].copy()
after_rows = len(df)

print("Rows removed with non-positive Fare:", before_rows - after_rows)

Rows removed with non-positive Fare: 15


In [9]:
# Convert the three label-like columns to the pandas categorical dtype in a
# single astype call.
df = df.astype({"Sex": "category", "Embarked": "category", "Pclass": "category"})

In [10]:
# Store the Survived flag as an 8-bit integer.
df = df.astype({"Survived": "int8"})

In [11]:
# Show each column's dtype to confirm the category / int8 conversions above.
print(df.dtypes)

PassengerId       int64
Survived           int8
Pclass         category
Name             object
Sex            category
Age             float64
SibSp             int64
Parch             int64
Fare            float64
Embarked       category
dtype: object


In [12]:
# Headline figures: how many rows remain, how many of them have Survived == 1,
# and the ratio of the two.
total_survived = df["Survived"].sum()
total_passengers = df.shape[0]
survival_rate = total_survived / total_passengers

print("Total passengers:", total_passengers)
print("Survived:", total_survived)
print("Survival rate:", survival_rate)

Total passengers: 876
Survived: 341
Survival rate: 0.3892694063926941


In [13]:
# Survival rate per sex: the mean of the 0/1 Survived flag within each group.
# "Sex" is a categorical column, and grouping on a categorical without an
# explicit `observed` argument raises a pandas FutureWarning. observed=False
# states the current default (keep every category in the result) explicitly.
survival_by_sex = df.groupby("Sex", observed=False)["Survived"].mean()
print(survival_by_sex)

Sex
female    0.742038
male      0.192171
Name: Survived, dtype: float64


  survival_by_sex = df.groupby("Sex")["Survived"].mean()


In [14]:
# Survival rate per passenger class: the mean of the 0/1 Survived flag within
# each group. "Pclass" is a categorical column, and grouping on a categorical
# without an explicit `observed` argument raises a pandas FutureWarning.
# observed=False states the current default (keep every category) explicitly.
survival_by_class = df.groupby("Pclass", observed=False)["Survived"].mean()
print(survival_by_class)

Pclass
1    0.644550
2    0.488764
3    0.242300
Name: Survived, dtype: float64


  survival_by_class = df.groupby("Pclass")["Survived"].mean()


In [15]:
# Mean age within each Survived group (0 and 1).
avg_age_by_survival = df["Age"].groupby(df["Survived"]).mean()
print(avg_age_by_survival)

Survived
0    29.982243
1    28.301085
Name: Age, dtype: float64


In [16]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
df.to_csv("data/titanic/titanic_clean.csv", index=False)

In [17]:
# FamilySize is SibSp + Parch plus one (the extra one presumably counts the
# passenger themself).
df["FamilySize"] = df["SibSp"].add(df["Parch"]).add(1)
print(df.loc[:, ["SibSp", "Parch", "FamilySize"]].head())

# Survival rate for each distinct family size.
print(df["Survived"].groupby(df["FamilySize"]).mean())

   SibSp  Parch  FamilySize
0      1      0           2
1      1      0           2
2      0      0           1
3      1      0           2
4      0      0           1
FamilySize
1     0.310345
2     0.552795
3     0.578431
4     0.724138
5     0.200000
6     0.136364
7     0.333333
8     0.000000
11    0.000000
Name: Survived, dtype: float64


In [18]:
# IsAlone is 1 when FamilySize equals 1 and 0 otherwise, stored as int8.
df["IsAlone"] = df["FamilySize"].eq(1).astype("int8")
print(df.loc[:, ["FamilySize", "IsAlone"]].head())

# Survival rate for the two IsAlone groups.
print(df["Survived"].groupby(df["IsAlone"]).mean())

   FamilySize  IsAlone
0           2        0
1           2        0
2           1        1
3           2        0
4           1        1
IsAlone
0    0.505650
1    0.310345
Name: Survived, dtype: float64


In [19]:
# Bucket ages into named groups. With right=False each bin is closed on the
# left and open on the right: [0, 12), [12, 18), [18, 50), [50, 100).
bins = [0, 12, 18, 50, 100]
labels = ["Child", "Teen", "Adult", "Senior"]
df["AgeGroup"] = pd.cut(df["Age"], bins=bins, labels=labels, right=False)

print(df[["Age", "AgeGroup"]].head())
# pd.cut produces a categorical column, and grouping on a categorical without
# an explicit `observed` argument raises a pandas FutureWarning. observed=False
# states the current default (keep every category in the result) explicitly.
print(df.groupby("AgeGroup", observed=False)["Survived"].mean())

    Age AgeGroup
0  22.0    Adult
1  38.0    Adult
2  26.0    Adult
3  35.0    Adult
4  35.0    Adult
AgeGroup
Child     0.573529
Teen      0.488889
Adult     0.367199
Senior    0.364865
Name: Survived, dtype: float64


  print(df.groupby("AgeGroup")["Survived"].mean())


In [20]:
# Split Fare into four quantile-based buckets (quartiles) with readable labels.
df["FareBucket"] = pd.qcut(df["Fare"], 4, labels=["Low", "Med-Low", "Med-High", "High"])
print(df[["Fare", "FareBucket"]].head())

# pd.qcut produces a categorical column, and grouping on a categorical without
# an explicit `observed` argument raises a pandas FutureWarning. observed=False
# states the current default (keep every category in the result) explicitly.
print(df.groupby("FareBucket", observed=False)["Survived"].mean())

      Fare FareBucket
0   7.2500        Low
1  71.2833       High
2   7.9250        Low
3  53.1000       High
4   8.0500    Med-Low
FareBucket
Low         0.225664
Med-Low     0.287037
Med-High    0.452055
High        0.600000
Name: Survived, dtype: float64


  print(df.groupby("FareBucket")["Survived"].mean())


In [21]:
# Columns written to the modelling file: the target first, then the features.
ml_columns = [
    "Survived",  # target
    "Pclass", "Sex", "Age", "Fare", "Embarked",
    "FamilySize", "IsAlone",
]

# Take an independent copy of just those columns and save it without the index.
df_ml = df.loc[:, ml_columns].copy()
df_ml.to_csv("data/titanic/titanic_ml_ready.csv", index=False)

In [22]:
# Reload the modelling file written earlier. Note that this rebinds `df`, so
# from here on `df` holds only the columns stored in that CSV.
df = pd.read_csv("data/titanic/titanic_ml_ready.csv")

print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

   Survived  Pclass     Sex   Age     Fare Embarked  FamilySize  IsAlone
0         0       3    male  22.0   7.2500        S           2        0
1         1       1  female  38.0  71.2833        C           2        0
2         1       3  female  26.0   7.9250        S           1        1
3         1       1  female  35.0  53.1000        S           2        0
4         0       3    male  35.0   8.0500        S           1        1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 876 entries, 0 to 875
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    876 non-null    int64  
 1   Pclass      876 non-null    int64  
 2   Sex         876 non-null    object 
 3   Age         876 non-null    float64
 4   Fare        876 non-null    float64
 5   Embarked    876 non-null    object 
 6   FamilySize  876 non-null    int64  
 7   IsAlone     876 non-null    int64  
dtypes: float64(2), int64(4), object(2)
memory u

In [23]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [24]:
# Name of the label column and of the columns used as model inputs.
target_col = "Survived"
feature_cols = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "IsAlone"]

# X is an independent copy of the feature columns; y is the label Series.
y = df.loc[:, target_col]
X = df.loc[:, feature_cols].copy()

In [25]:
# One-hot encode the non-numeric feature columns. drop_first=True omits the
# first level of each encoded column, so k levels become k-1 indicator columns.
X_encoded = pd.get_dummies(X, drop_first=True)

# Preview, resulting column labels, and (rows, columns), one item per line.
print(X_encoded.head(), X_encoded.columns, X_encoded.shape, sep="\n")

   Pclass   Age     Fare  FamilySize  IsAlone  Sex_male  Embarked_Q  \
0       3  22.0   7.2500           2        0      True       False   
1       1  38.0  71.2833           2        0     False       False   
2       3  26.0   7.9250           1        1     False       False   
3       1  35.0  53.1000           2        0     False       False   
4       3  35.0   8.0500           1        1      True       False   

   Embarked_S  
0        True  
1       False  
2        True  
3        True  
4        True  
Index(['Pclass', 'Age', 'Fare', 'FamilySize', 'IsAlone', 'Sex_male',
       'Embarked_Q', 'Embarked_S'],
      dtype='object')
(876, 8)


In [26]:
# Hold out 20% of the rows for testing. The split is seeded for repeatability
# and stratified on y so both parts keep a similar class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, stratify=y, random_state=42
)

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)

Train shape: (700, 8) (700,)
Test shape: (176, 8) (176,)


In [27]:
# Fit a logistic-regression classifier on the training split; max_iter=1000
# sets the solver's iteration cap.
model = LogisticRegression(max_iter=1000)
# fit() is the cell's last expression, so the notebook displays the fitted
# estimator's parameter summary as the cell output.
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [28]:
# Predicted class labels for the held-out test split.
y_pred = model.predict(X_test)

In [29]:
# Keep only the second column of predict_proba, i.e. the probability of the
# second entry in model.classes_ — presumably Survived == 1 here (TODO confirm
# the class order via model.classes_).
y_proba = model.predict_proba(X_test)[:, 1]
# Preview the first ten probabilities.
print(y_proba[:10])

[0.9233824  0.10711912 0.21003768 0.81756841 0.8473832  0.18403561
 0.16476923 0.23173026 0.65654064 0.39019616]


In [30]:
# Accuracy of the predicted labels against the true test labels.
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

Accuracy: 0.8295454545454546


In [31]:
# Confusion matrix of true labels (first argument) against predicted labels
# (second argument), printed as a bare array.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[94 13]
 [17 52]]


In [32]:
# Wrap the confusion matrix in a DataFrame so its rows and columns carry
# readable "Actual_*" / "Pred_*" labels when printed.
cm_df = pd.DataFrame(data=cm, columns=["Pred_0", "Pred_1"], index=["Actual_0", "Actual_1"])
print(cm_df)

          Pred_0  Pred_1
Actual_0      94      13
Actual_1      17      52
