# Model Training

In [3]:
# Import required libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Matplotlib is building the font cache; this may take a moment.


In [14]:
# Read the Titanic passenger table from the datasciencedojo/datasets GitHub repository
titanic_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(titanic_url)

# Preview the first five rows
print(data.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [15]:
# Step 2: Check column names
# The Titanic columns already carry usable names, so nothing is renamed here.
# (A rename keyed on "Index", "Height Inches" and "Weight Pounds" would be a
# silent no-op: none of those columns exist in this dataset.)

# Print column names
print(data.columns)


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [19]:
# Drop Cabin explicitly so the notebook reproduces from a fresh kernel; the
# later cells never use it. errors="ignore" keeps this cell safe to re-run
# once the column is already gone.
data = data.drop(columns=["Cabin"], errors="ignore")
print(data.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')


In [20]:
# Replacement values: median age and the most frequent embarkation port
age_median = data["Age"].median()
embarked_mode = data["Embarked"].mode()[0]

# Fill the gaps in Age and Embarked
data["Age"] = data["Age"].fillna(age_median)
data["Embarked"] = data["Embarked"].fillna(embarked_mode)

# Rows without a Fare are discarded rather than imputed
data = data.dropna(subset=["Fare"])

# Remaining null count per column
print(data.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [21]:
# Make sure Age and Fare are numeric; anything that cannot be parsed
# becomes NaN (errors="coerce").
for numeric_col in ("Age", "Fare"):
    data[numeric_col] = pd.to_numeric(data[numeric_col], errors="coerce")

In [22]:
# Encode Sex as 1 (male) / 0 (female). The dtype guard keeps this cell safe to
# re-run: mapping an already-encoded column would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(data["Sex"]):
    data["Sex"] = data["Sex"].map({"male": 1, "female": 0})

In [24]:
# Encode the embarkation port as C=0, Q=1, S=2. The dtype guard keeps this cell
# safe to re-run: mapping an already-encoded column would turn every value
# into NaN.
if not pd.api.types.is_numeric_dtype(data["Embarked"]):
    data["Embarked"] = data["Embarked"].map({"C": 0, "Q": 1, "S": 2})


In [25]:
# Optional step: min-max scale the "Age" and "Fare" columns
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): the scaler is fit on the whole frame, before the train/test
# split done later in the notebook — consider fitting on training rows only.
scaled_cols = ["Age", "Fare"]
scaler = MinMaxScaler()
data[scaled_cols] = scaler.fit_transform(data[scaled_cols])

In [26]:
# Write the cleaned frame to disk without the index column
cleaned_csv_path = "titanic_cleaned.csv"
data.to_csv(cleaned_csv_path, index=False)
print("Cleaned dataset saved successfully!")

Cleaned dataset saved successfully!


In [27]:
# Columns used by the model: five predictors and the label to predict
target = "Survived"
features = ["Pclass", "Sex", "Age", "Fare", "Embarked"]

# X holds the independent variables, y the dependent variable
X, y = data[features], data[target]


In [28]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print("Dataset split into training and testing sets successfully!")


Dataset split into training and testing sets successfully!


In [30]:
from sklearn.impute import SimpleImputer

# Replace any remaining missing entries with the most common value,
# learning the fill values from X and then applying them to X.
imputer = SimpleImputer(strategy="most_frequent")
imputer.fit(X)
X = imputer.transform(X)

print("Missing values filled successfully!")

Missing values filled successfully!




In [33]:
# LogisticRegression is imported here because no other cell in the notebook
# brings it into scope; without this the cell fails on a fresh kernel.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# We'll use Logistic Regression as it's simple for binary classification.
# X was re-assigned by the imputation step, so the split is redone on it
# (same test_size and random_state as the first split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training portion only
model = LogisticRegression()
model.fit(X_train, y_train)

# Score on the held-out rows
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.79
