<a href="https://www.kaggle.com/code/harshitpatelnvm/titanic-solution-2?scriptVersionId=297786844" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression



In [2]:
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")
test_df = pd.read_csv("/kaggle/input/titanic/test.csv")

In [3]:
train_df = train_df.drop(columns=['PassengerId'])


In [4]:
train_df['Age_Group'] = 'Unknown'

train_df.loc[train_df['Age'] <= 10, 'Age_Group'] = 'Kid'
train_df.loc[(train_df['Age'] > 10) & (train_df['Age'] <= 20), 'Age_Group'] = 'Teenager'
train_df.loc[(train_df['Age'] > 20) & (train_df['Age'] <= 50), 'Age_Group'] = 'Adult'
train_df.loc[train_df['Age'] > 50, 'Age_Group'] = 'Old'

test_df['Age_Group'] = 'Unknown'

test_df.loc[test_df['Age'] <= 10, 'Age_Group'] = 'Kid'
test_df.loc[(test_df['Age'] > 10) & (test_df['Age'] <= 20), 'Age_Group'] = 'Teenager'
test_df.loc[(test_df['Age'] > 20) & (test_df['Age'] <= 50), 'Age_Group'] = 'Adult'
test_df.loc[test_df['Age'] > 50, 'Age_Group'] = 'Old'


In [5]:
train_df['M/M'] = 'Unknown'

train_df.loc[train_df['Age'] <= 18, 'M/M'] = 'Minor'
train_df.loc[train_df['Age'] > 18, 'M/M'] = 'Major'


test_df['M/M'] = 'Unknown'

test_df.loc[test_df['Age'] <= 18, 'M/M'] = 'Minor'
test_df.loc[test_df['Age'] > 18, 'M/M'] = 'Major'


In [6]:
train_df["CabinLetter"] = train_df["Cabin"].str[0]
train_df["Cabin"] = train_df["Cabin"].str[1]
train_df["CabinLetter"]=train_df["CabinLetter"].fillna("U")
test_df["CabinLetter"] = test_df["Cabin"].str[0]
test_df["Cabin"] = test_df["Cabin"].str[1]
test_df["CabinLetter"]=test_df["CabinLetter"].fillna("U")



In [7]:

train_df = train_df.drop(columns=['Cabin'])


In [8]:
train_df['Class'] = 'Mid'

train_df.loc[train_df['Fare'] == 0, 'Class'] = 'Free'
train_df.loc[train_df['Fare'] > 32, 'Class'] = 'Rich'


test_df['Class'] = 'Mid'

test_df.loc[test_df['Fare'] == 0, 'Class'] = 'Free'
test_df.loc[test_df['Fare'] > 32, 'Class'] = 'Rich'


In [9]:
train_df["FamilySize"] = train_df["SibSp"] + train_df["Parch"] + 1
test_df["FamilySize"] = test_df["SibSp"] + test_df["Parch"] + 1


In [10]:
def family_group(size):
    if size == 1:
        return "Alone"
    elif size <= 4:
        return "Small"
    else:
        return "Large"
train_df["FamilySizeGrouped"] = train_df["FamilySize"].apply(family_group)
test_df["FamilySizeGrouped"] = test_df["FamilySize"].apply(family_group)


In [11]:
train_df['Title'] = (
    train_df['Name']
    .str.split(",", expand=True)[1]
    .str.split(".", expand=True)[0]
    .str.strip()
)
test_df['Title'] = (
    test_df['Name']
    .str.split(",", expand=True)[1]
    .str.split(".", expand=True)[0]
    .str.strip()
)
train_df['Title'] = train_df['Title'].replace({
    'Capt': 'Military',
    'Col': 'Military',
    'Major': 'Military',
    'Jonkheer': 'Noble',
    'the Countess': 'Noble',
    'Don': 'Noble',
    'Lady': 'Noble',
    'Sir': 'Noble',
    'Mlle': 'Noble',
    'Ms': 'Noble',
    'Mme': 'Noble'    
})

test_df['Title'] = test_df['Title'].replace({
    'Capt': 'Military',
    'Col': 'Military',
    'Major': 'Military',
    'Jonkheer': 'Noble',
    'the Countess': 'Noble',
    'Don': 'Noble',
    'Lady': 'Noble',
    'Sir': 'Noble',
    'Mlle': 'Noble',
    'Ms': 'Noble',
    'Mme': 'Noble'    
})

In [12]:
train_df['Name_Length'] = train_df['Name'].apply(lambda x: len(x))
test_df['Name_Length'] = test_df['Name'].apply(lambda x: len(x))

In [13]:

train_df = train_df.drop(columns=['Name'])


In [14]:
import re

def get_ticket_prefix(ticket):
    prefix = re.findall(r'^[A-Za-z./]+', ticket)
    if prefix:
        return prefix[0].replace('.', '').replace('/', '')
    else:
        return "NoPrefix"

train_df["TicketPrefix"] = train_df["Ticket"].apply(get_ticket_prefix)
test_df["TicketPrefix"] = test_df["Ticket"].apply(get_ticket_prefix)



In [15]:
train_df["TicketNumLen"] = train_df["Ticket"].apply(
    lambda x: len(''.join([ch for ch in x if ch.isdigit()]))
)
test_df["TicketNumLen"] = test_df["Ticket"].apply(
    lambda x: len(''.join([ch for ch in x if ch.isdigit()]))
)


In [16]:
ticket_counts = train_df["Ticket"].value_counts()
train_df["TicketSharedCount"] = train_df["Ticket"].map(ticket_counts)
test_df["TicketSharedCount"] = test_df["Ticket"].map(ticket_counts).fillna(0)


In [17]:
train_df["TicketNumber"] = train_df["Ticket"].str.extract(r'(\d+)').astype(float)
test_df["TicketNumber"] = test_df["Ticket"].str.extract(r'(\d+)').astype(float)


In [18]:

train_df = train_df.drop(columns=['Ticket'])


In [19]:
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_Group,M/M,CabinLetter,Class,FamilySize,FamilySizeGrouped,Title,Name_Length,TicketPrefix,TicketNumLen,TicketSharedCount,TicketNumber
0,0,3,male,22.0,1,0,7.2500,S,Adult,Major,U,Mid,2,Small,Mr,23,A,6,1,5.0
1,1,1,female,38.0,1,0,71.2833,C,Adult,Major,C,Rich,2,Small,Mrs,51,PC,5,1,17599.0
2,1,3,female,26.0,0,0,7.9250,S,Adult,Major,U,Mid,1,Alone,Miss,22,STONO,8,1,2.0
3,1,1,female,35.0,1,0,53.1000,S,Adult,Major,C,Rich,2,Small,Mrs,44,NoPrefix,6,2,113803.0
4,0,3,male,35.0,0,0,8.0500,S,Adult,Major,U,Mid,1,Alone,Mr,24,NoPrefix,6,1,373450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Adult,Major,U,Mid,1,Alone,Rev,21,NoPrefix,6,1,211536.0
887,1,1,female,19.0,0,0,30.0000,S,Teenager,Major,B,Mid,1,Alone,Miss,28,NoPrefix,6,1,112053.0
888,0,3,female,,1,2,23.4500,S,Unknown,Unknown,U,Mid,4,Small,Miss,40,WC,4,2,6607.0
889,1,1,male,26.0,0,0,30.0000,C,Adult,Major,C,Mid,1,Alone,Mr,21,NoPrefix,6,1,111369.0


In [20]:
train_df["Age"] = train_df["Age"].fillna(0)
train_df["TicketNumber"] = train_df["TicketNumber"].fillna(0)

In [21]:
test_df["Cabin"] = test_df["Cabin"].fillna(0)
test_df["Age"] = test_df["Age"].fillna(0)
test_df["Fare"] = test_df["Fare"].fillna(0)


In [22]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   PassengerId        418 non-null    int64  
 1   Pclass             418 non-null    int64  
 2   Name               418 non-null    object 
 3   Sex                418 non-null    object 
 4   Age                418 non-null    float64
 5   SibSp              418 non-null    int64  
 6   Parch              418 non-null    int64  
 7   Ticket             418 non-null    object 
 8   Fare               418 non-null    float64
 9   Cabin              418 non-null    object 
 10  Embarked           418 non-null    object 
 11  Age_Group          418 non-null    object 
 12  M/M                418 non-null    object 
 13  CabinLetter        418 non-null    object 
 14  Class              418 non-null    object 
 15  FamilySize         418 non-null    int64  
 16  FamilySizeGrouped  418 non

In [23]:
X = train_df.drop("Survived",axis=1)
y = train_df["Survived"]

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,stratify=y,random_state=21)

In [25]:
ord_cols = ["Pclass","Age_Group","Class"]
num_cols = ["Age","Fare"]
ohe_cols = ["Sex","M/M","Embarked","CabinLetter","Title","TicketPrefix","FamilySizeGrouped"]



In [26]:
ordinal_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    
    ('ord', OrdinalEncoder(
        categories=[
            [3, 2, 1],                         
            ['Kid', 'Teenager', 'Adult', 'Old','Unknown'],   
            ['Rich','Mid','Free'] 
        ],
        handle_unknown='use_encoded_value',
        unknown_value=-1
    ))
])

In [27]:
ohe_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(
        handle_unknown='ignore',
        drop='first',          
        sparse_output=False
    ))
])


In [28]:
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])


In [29]:


col_trans = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('ord', ordinal_pipeline, ord_cols),
        ('ohe', ohe_pipeline, ohe_cols)
    ],
    remainder='passthrough'   
)


In [30]:
linereqpipe = make_pipeline(col_trans, LinearRegression())
linereqpipe.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [31]:
predictions = linereqpipe.predict(X_test)
predictions = (predictions >= 0.5).astype(int)



In [32]:
linereqpipe.fit(X, y)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [33]:
X_test_kaggle = test_df.drop(['PassengerId'], axis=1)
predictions = linereqpipe.predict(X_test_kaggle)
predictions = (predictions >= 0.5).astype(int)



In [34]:
submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": predictions
})

In [35]:
submission.to_csv("submission.csv", index=False)