In [89]:
import pandas as pd

# Load dataset
df = pd.read_csv(r"D:\Naviotech-Internship\train.csv")

# Check the first rows and columns
print(df.head())
print(df.columns)


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
In

In [90]:
print(df.Cabin)
print(df['Embarked'].isnull().sum())

0       NaN
1       C85
2       NaN
3      C123
4       NaN
       ... 
886     NaN
887     B42
888     NaN
889    C148
890     NaN
Name: Cabin, Length: 891, dtype: object
2


In [95]:
# Drop unused columns
df = df.drop(['PassengerId','Name','Ticket','Cabin'], axis=1 , errors= 'ignore')

# Fill missing values in Age with median
df['Age'] = df['Age'].fillna(df['Age'].median())

# Fill missing values in Embarked with mode (most frequent value)
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# If Fare has missing values, also fill
df['Fare'] = df['Fare'].fillna(df['Fare'].median())




In [96]:
df.Fare

0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.0500
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: Fare, Length: 891, dtype: float64

In [97]:
# Convert categorical columns into numeric
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# One-hot encode Embarked (C, Q, S)
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)


In [98]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,False,True
1,1,1,1,38.0,1,0,71.2833,False,False
2,1,3,1,26.0,0,0,7.925,False,True
3,1,1,1,35.0,1,0,53.1,False,True
4,0,3,0,35.0,0,0,8.05,False,True


In [99]:
X = df.drop("Survived", axis=1)  # features
y = df["Survived"]               # target


In [100]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [38]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [50]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8100558659217877
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [101]:
test_df = pd.read_csv(r"D:\Naviotech-Internship\test.csv")
test_df


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [102]:
if 'Embarked' in test_df.columns:
    # Fill missing values
    test_df['Embarked'] = test_df['Embarked'].fillna('S')
    
    # One-hot encode
    test_df = pd.get_dummies(test_df, columns=['Embarked'])
else:
    # If Embarked is missing, create the expected one-hot columns with 0
    test_df['Embarked_C'] = 0
    test_df['Embarked_Q'] = 0
    test_df['Embarked_S'] = 0

print(test_df.columns)    


Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')


In [104]:
test_df = test_df.drop(['Name', 'Ticket', 'Cabin'], axis=1, errors='ignore')
# Use the same medians as training set
test_df['Age'] = test_df['Age'].fillna(df['Age'].median())  # df = your training DataFrame
test_df['Fare'] = test_df['Fare'].fillna(df['Fare'].median())

test_df['Sex'] = test_df['Sex'].map({'male': 0, 'female': 1})

# X_train = your training features after preprocessing
test_df = test_df.reindex(columns=X_train.columns, fill_value=0)
# Fill any remaining NaNs with 0 (safe for one-hot and numeric columns)
test_df = test_df.fillna(0)


predictions = model.predict(test_df)



In [105]:
test_df_original = test_df.copy()


In [106]:
test_df = test_df.drop(['Name', 'Ticket', 'Cabin'], axis=1, errors='ignore')
# Fill missing values, encode Sex, align columns, etc.


In [108]:
submission = pd.DataFrame({
    "PassengerId": test_df_original.PassengerId,
    "Survived": predictions
})

submission.to_csv("titanic_predictions.csv", index=False)


AttributeError: 'DataFrame' object has no attribute 'PassengerId'

In [77]:
submission.head()   # see first few rows
# submission['Survived'].value_counts()  # check distribution


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
