**1. Data Collection and Understanding**





In [1]:
import pandas as pd


In [2]:
# Load dataset
titanic = pd.read_csv('/content/Titanic-Dataset.csv')

In [3]:
# First few rows
print(titanic.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [4]:
# Dataset information
print(titanic.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [5]:
# Summary statistics
print(titanic.describe())

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


In [6]:
# Check missing values
print(titanic.isnull().sum())


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


**2. Handling Missing Data**


In [7]:
# 'Age': Fill missing values with median (safe for skewed data).

# 'Embarked': Fill missing values with mode (most frequent category).

# 'Cabin': Drop because too much missingness.

In [8]:
# Fill 'Age' missing values with median
titanic['Age'].fillna(titanic['Age'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic['Age'].fillna(titanic['Age'].median(), inplace=True)


In [9]:
# Fill 'Embarked' missing values with mode
titanic['Embarked'].fillna(titanic['Embarked'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic['Embarked'].fillna(titanic['Embarked'].mode()[0], inplace=True)


In [10]:
# Drop 'Cabin' column
titanic.drop(columns=['Cabin'], inplace=True)

**3. Removing Outliers (Optional but Recommended)**

In [11]:
# Cap extreme values in 'Fare' and 'Age' to reduce noise.

In [12]:
import numpy as np

In [13]:
# Cap Fare at 99th percentile
fare_cap = titanic['Fare'].quantile(0.99)
titanic['Fare'] = np.where(titanic['Fare'] > fare_cap, fare_cap, titanic['Fare'])

In [14]:
# Cap Age similarly (optional)
age_cap = titanic['Age'].quantile(0.99)
titanic['Age'] = np.where(titanic['Age'] > age_cap, age_cap, titanic['Age'])

**4. Encoding Categorical Data**


In [15]:
# Convert categorical features to numerical format for machine learning.

In [16]:
# 'Sex': Label encode (0/1).

# 'Embarked': One-hot encode (create dummy variables).

In [17]:
from sklearn.preprocessing import LabelEncoder

In [18]:
# Label encode 'Sex'
le_sex = LabelEncoder()
titanic['Sex'] = le_sex.fit_transform(titanic['Sex'])

In [19]:
# One-hot encode 'Embarked'
titanic = pd.get_dummies(titanic, columns=['Embarked'], drop_first=True)

**5. Feature Scaling: Normalisation and Standardisation**

In [20]:
from sklearn.preprocessing import StandardScaler


In [21]:
scaler = StandardScaler()


In [22]:
# Scale 'Age' and 'Fare'
titanic[['Age', 'Fare']] = scaler.fit_transform(titanic[['Age', 'Fare']])

**6. Feature Engineering (Optional Enhancement)**

In [23]:
# Create new features that capture useful information.

In [24]:
# Create FamilySize feature
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1

In [25]:
# Create IsAlone feature
titanic['IsAlone'] = 1  # Default to alone
titanic['IsAlone'].loc[titanic['FamilySize'] > 1] = 0


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  titanic['IsAlone'].loc[titanic['FamilySize'] > 1] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic['

In [26]:
# Drop irrelevant columns
titanic.drop(columns=['Name', 'Ticket', 'PassengerId'], inplace=True)


**Final Preprocessed Data**

In [27]:
print(titanic.head())

   Survived  Pclass  Sex       Age  SibSp  Parch      Fare  Embarked_Q  \
0         0       3    1 -0.569196      1      0 -0.564109       False   
1         1       1    0  0.677902      1      0  0.942548       False   
2         1       3    0 -0.257421      0      0 -0.548227       False   
3         1       1    0  0.444071      1      0  0.514708       False   
4         0       3    1  0.444071      0      0 -0.545285       False   

   Embarked_S  FamilySize  IsAlone  
0        True           2        0  
1       False           2        0  
2        True           1        1  
3        True           2        0  
4        True           1        1  


In [28]:
print(titanic.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Sex         891 non-null    int64  
 3   Age         891 non-null    float64
 4   SibSp       891 non-null    int64  
 5   Parch       891 non-null    int64  
 6   Fare        891 non-null    float64
 7   Embarked_Q  891 non-null    bool   
 8   Embarked_S  891 non-null    bool   
 9   FamilySize  891 non-null    int64  
 10  IsAlone     891 non-null    int64  
dtypes: bool(2), float64(2), int64(7)
memory usage: 64.5 KB
None


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [30]:
# Features and target
X = titanic.drop('Survived', axis=1)
y = titanic['Survived']

In [31]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [32]:
# Initialize and train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [33]:
# Predict
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)


In [34]:
# Evaluate
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))

Training Accuracy: 0.8033707865168539
Test Accuracy: 0.8044692737430168
[[90 15]
 [20 54]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       105
           1       0.78      0.73      0.76        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.80      0.80      0.80       179

