## Load the Dataset

In [1]:
import pandas as pd
import numpy as np

# Load training dataset
train = pd.read_csv(r"C:\Users\Gairo\Downloads\titanic\train.csv")


# Load test dataset
test = pd.read_csv(r"C:\Users\Gairo\Downloads\titanic\test.csv")


In [2]:
# First 5 rows
print(train.head())

# Shape of datasets
print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Info about columns
print(train.info())

# Summary stats (numerical features)
print(train.describe())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
Tr

In [3]:
train.head(10)   # Shows first 10 rows in table format

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
print("Train columns:", train.columns.tolist())
print("Test columns:", test.columns.tolist()) # Target label (like: Survived) can,t be present in test set


Train columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
Test columns: ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [4]:
#Shape of dataset
print(train.shape)
print(test.shape)

(891, 12)
(418, 11)


In [5]:
#Check missing values
train.isnull().sum() #column related null values 
train.isnull().sum().sum() #total values

np.int64(866)

In [7]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Data preprocessing and cleaning

In [6]:
# Combine them for cleaning. we'll add a flag to separate them later.
train['IsTrainSet'] = True;
test['IsTrainSet'] = False;

# Also, the test set doesn't have the 'Survived' column, so we add it as NaN to allow concatenation
test['Survived'] = np.nan

# Combine into one DataFrame
titanic = pd.concat([train,test],axis=0)

# Check the shape: (number_of_rows, number_of_columns)
print(titanic.shape)


(1309, 13)


In [7]:
print(titanic.isnull().sum())

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
IsTrainSet        0
dtype: int64


## Deeper Data Cleaning & Preprocessing

In [8]:
# Extract the title (e.g., 'Mr', 'Miss', 'Master') from the name
titanic['Title'] = titanic['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
# Check the counts
print(titanic['Title'].value_counts())
# Group rare titles into 'Rare'
titanic['Title'] = titanic['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
titanic['Title'] = titanic['Title'].replace('Mlle', 'Miss')
titanic['Title'] = titanic['Title'].replace('Ms', 'Miss')
titanic['Title'] = titanic['Title'].replace('Mme', 'Mrs')

#After handling rare values
print(titanic['Title'].value_counts())

Title
Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Major         2
Mlle          2
Ms            2
Mme           1
Don           1
Sir           1
Lady          1
Capt          1
Countess      1
Jonkheer      1
Dona          1
Name: count, dtype: int64
Title
Mr        757
Miss      264
Mrs       198
Master     61
Rare       29
Name: count, dtype: int64


  titanic['Title'] = titanic['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


In [9]:
# Fill missing Age with median age per Title
titanic['Age'] = titanic.groupby('Title')['Age'].transform(lambda x: x.fillna(x.median()))

In [10]:
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1 # +1 for the passenger themselves

In [11]:
#Passenger alone on the board
titanic['IsAlone'] = 1
titanic['IsAlone'].loc[titanic['FamilySize'] > 1] = 0 # If family size > 1, then not alone

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  titanic['IsAlone'].loc[titanic['FamilySize'] > 1] = 0 # If family size > 1, then not alone
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retur

In [12]:
titanic['Deck'] = titanic['Cabin'].str[0] # Get the first letter
titanic['Deck'] = titanic['Deck'].fillna('N') # Fill NaN with 'N' for None

In [13]:
#Drop unnecessary columns
columns_to_drop = ['Name', 'Ticket', 'Cabin', 'PassengerId']
titanic_clean = titanic.drop(columns=columns_to_drop)

In [14]:
# Perform one-hot encoding
Titanic = pd.get_dummies(titanic_clean , columns=['Sex', 'Embarked', 'Title', 'Deck'], prefix=['Sex', 'Emb', 'Title', 'Deck'])

# Check the new data types and columns
print(Titanic.info())

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 28 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Survived      891 non-null    float64
 1   Pclass        1309 non-null   int64  
 2   Age           1309 non-null   float64
 3   SibSp         1309 non-null   int64  
 4   Parch         1309 non-null   int64  
 5   Fare          1308 non-null   float64
 6   IsTrainSet    1309 non-null   bool   
 7   FamilySize    1309 non-null   int64  
 8   IsAlone       1309 non-null   int64  
 9   Sex_female    1309 non-null   bool   
 10  Sex_male      1309 non-null   bool   
 11  Emb_C         1309 non-null   bool   
 12  Emb_Q         1309 non-null   bool   
 13  Emb_S         1309 non-null   bool   
 14  Title_Master  1309 non-null   bool   
 15  Title_Miss    1309 non-null   bool   
 16  Title_Mr      1309 non-null   bool   
 17  Title_Mrs     1309 non-null   bool   
 18  Title_Rare    1309 non-null   bool

In [15]:
# Split the data back into train and test
train_clean = full_df_clean[full_df_clean['IsTrainSet'] == True].drop('IsTrainSet', axis=1)
test_clean = full_df_clean[full_df_clean['IsTrainSet'] == False].drop('IsTrainSet', axis=1)

# Separate the features (X) and target (y) for the training set
# The target 'Survived' was only present in the training set
X_train = train_clean.drop('Survived', axis=1)
y_train = train_clean['Survived']

# The test set for submission (X_test) is everything else
X_test = test_clean.drop('Survived', axis=1)

NameError: name 'full_df_clean' is not defined