In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import matplotlib.pyplot as plt  # For making charts
import seaborn as sns 
from textblob import TextBlob, Word, Blobber
from textblob.classifiers import NaiveBayesClassifier
from textblob.taggers import NLTKTagger
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in file_names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the labelled training passengers and preview the first five rows.
train_df = pd.read_csv(
    "/kaggle/input/titanic/train.csv",
)
train_df.head(n=5)


In [None]:
# Read the passengers the final submission is predicted for and preview them.
test_df = pd.read_csv(
    "/kaggle/input/titanic/test.csv",
)
test_df.head(n=5)


In [None]:
# Presumably Kaggle's example submission, loaded to inspect the expected
# output layout; `df` is not referenced again by the cells below.
df = pd.read_csv(
    "/kaggle/input/titanic/gender_submission.csv",
)
df.head(n=5)

# Dataset Overview

In [None]:
# Report (rows, columns) for both frames with a single print call.
print(
    f"Training set shape: {train_df.shape}",
    f"Test set shape: {test_df.shape}",
    sep="\n",
)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
train_df.info()

In [None]:
# Per-column dtypes of the training frame (object columns are the ones that
# get mapped to numbers later, in prepare_data).
column_dtypes = train_df.dtypes
print(column_dtypes)

In [None]:
# Count NaNs per column and list only the columns that actually have gaps.
print("\nMissing data check:")
missing_data = train_df.isna().sum()
has_gaps = missing_data > 0
print(missing_data.loc[has_gaps])

# Charts

In [None]:
# Six-panel overview of the training data, drawn on explicit Axes objects so
# every call names the panel it targets instead of relying on pyplot's
# "current axes" state.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
(ax_sex, ax_class, ax_age), (ax_pie, ax_fare, ax_family) = axes


def _label_bars(ax, rates):
    """Write each bar's value as a percentage just above the bar."""
    for position, rate in enumerate(rates):
        ax.text(position, rate + 0.02, f'{rate:.1%}', ha='center')


# Chart 1: Survival by gender
# groupby sorts its keys, so the bars (and colours) come out as female, male.
survival_by_sex = train_df.groupby('Sex')['Survived'].mean()
survival_by_sex.plot(kind='bar', color=['lightcoral', 'skyblue'], rot=0, ax=ax_sex)
ax_sex.set_title('Survival Rate by Gender')
ax_sex.set_ylabel('Survival Rate')
_label_bars(ax_sex, survival_by_sex)

# Chart 2: Survival by class
survival_by_class = train_df.groupby('Pclass')['Survived'].mean()
survival_by_class.plot(kind='bar', color=['gold', 'silver', 'brown'], ax=ax_class)
ax_class.set_title('Survival Rate by Passenger Class')
ax_class.set_ylabel('Survival Rate')
ax_class.set_xlabel('Class (1=First, 2=Second, 3=Third)')
_label_bars(ax_class, survival_by_class)

# Chart 3: Age distribution (rows with a missing age are left out)
ax_age.hist(train_df['Age'].dropna(), bins=20, alpha=0.7, color='lightgreen')
ax_age.set_title('Age Distribution')
ax_age.set_xlabel('Age')
ax_age.set_ylabel('Number of Passengers')

# Chart 4: Simple survival pie chart
# value_counts() orders by frequency, not by value, so the counts are pinned to
# the order [0, 1]; otherwise the positional labels below would silently swap
# whenever survivors outnumber deaths.
survival_counts = train_df['Survived'].value_counts().reindex([0, 1], fill_value=0)
ax_pie.pie(survival_counts, labels=['Died', 'Survived'], autopct='%1.1f%%',
           colors=['lightcoral', 'lightblue'])
ax_pie.set_title('Overall Survival')

# Chart 5: Fare distribution
ax_fare.hist(train_df['Fare'].dropna(), bins=20, alpha=0.7, color='orange')
ax_fare.set_title('Ticket Fare Distribution')
ax_fare.set_xlabel('Fare ($)')
ax_fare.set_ylabel('Number of Passengers')

# Chart 6: Family size effect
# NOTE: this adds a FamilySize column to train_df itself (kept as in the
# original cell); prepare_data recomputes the same column on its own copy.
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
family_survival = train_df.groupby('FamilySize')['Survived'].mean()
family_survival.plot(kind='bar', color='purple', alpha=0.7, ax=ax_family)
ax_family.set_title('Survival Rate by Family Size')
ax_family.set_xlabel('Family Size')
ax_family.set_ylabel('Survival Rate')

fig.tight_layout()
plt.show()

# Key Findings

In [None]:
# Headline survival rates, computed straight from the raw training frame.
print(f"1. Women survived much more than men:")
for sex in ('female', 'male'):
    is_sex = train_df['Sex'].eq(sex)
    rate = train_df.loc[is_sex, 'Survived'].mean()
    print(f"   - {sex.title()}: {rate:.1%}")

print(f"\n2. Higher class passengers survived more:")
for pclass in (1, 2, 3):
    in_class = train_df['Pclass'].eq(pclass)
    rate = train_df.loc[in_class, 'Survived'].mean()
    print(f"   - Class {pclass}: {rate:.1%}")

# Data Preparation

In [None]:
def prepare_data(data, reference=None):
    """
    Clean and prepare data for our model.

    Works on a copy, so ``data`` itself is never modified.

    Steps:
      * fill missing ``Age`` and ``Fare`` with a column mean,
      * fill missing ``Embarked`` with ``'S'``,
      * add ``FamilySize`` = ``SibSp`` + ``Parch`` + 1,
      * replace ``Sex`` (male=0, female=1) and ``Embarked`` (S=0, C=1, Q=2)
        with numeric codes.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns ``Age``, ``Embarked``, ``Fare``,
        ``SibSp``, ``Parch`` and ``Sex``.
    reference : pandas.DataFrame, optional
        Frame whose ``Age`` and ``Fare`` means are used for imputation.
        Defaults to ``data`` itself. Pass the training frame when preparing
        test data so both sets are imputed with the same values.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy, with every original column plus ``FamilySize``.

    Notes
    -----
    ``Sex`` / ``Embarked`` values outside the mappings above become NaN,
    because ``Series.map`` yields NaN for keys it does not know.
    """
    # Source of the imputation statistics: the frame itself unless told otherwise.
    stats_source = data if reference is None else reference

    df = data.copy()

    df['Age'] = df['Age'].fillna(stats_source['Age'].mean())

    # 'S' is a hard-coded default (presumably the most common port; confirm).
    df['Embarked'] = df['Embarked'].fillna('S')

    df['Fare'] = df['Fare'].fillna(stats_source['Fare'].mean())

    # The passenger plus their siblings/spouses and parents/children aboard.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    return df

# Cleaning data
train_clean = prepare_data(train_df)

# Fill the test set's Age/Fare gaps with the TRAINING means before cleaning, so
# train and test rows are imputed with the same values (prepare_data on its own
# would use each frame's own mean). fillna returns a new frame, so test_df is
# left untouched; the remaining steps still happen inside prepare_data.
train_fill_values = {
    'Age': train_df['Age'].mean(),
    'Fare': train_df['Fare'].mean(),
}
test_clean = prepare_data(test_df.fillna(value=train_fill_values))

print("Data cleaned successfully!")

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'FamilySize']

X = train_clean[features]  # Features (input)
y = train_clean['Survived']  # Target (what we want to predict)

print(f"\nUsing {len(features)} features: {features}")
print(f"Training on {len(X)} passengers")

# Split training and testing

In [None]:
# Hold out 20% of the labelled rows to score the model; the fixed seed keeps
# the split identical across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Model Training (Random Forest, Predictive Analysis)

In [None]:
# Random Forest, chosen because it is easy to use.
# fit() returns the estimator itself, so building and training can be chained.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score the fitted model on the held-out rows.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

print(
    f"Model trained successfully!",
    f"Accuracy on test data: {accuracy:.1%}",
    sep="\n",
)

# Feature Importance

In [None]:
# Pair every feature name with its importance score, highest score first.
importance = model.feature_importances_
feature_importance = sorted(
    zip(features, importance),
    key=lambda pair: pair[1],
    reverse=True,
)

for feature, imp in feature_importance:
    print(f"  {feature}: {imp:.3f}")

# Same ranking as a bar chart.
features_sorted = [name for name, _score in feature_importance]
importance_sorted = [score for _name, score in feature_importance]

fig, ax = plt.subplots(figsize=(14, 6))
ax.bar(features_sorted, importance_sorted, color='skyblue')
ax.set_title('Which Features Are Most Important?')
ax.set_xlabel('Features')
ax.set_ylabel('Importance')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

# Make predictions on the test data

In [None]:
# Select the model's feature columns from the cleaned test frame.
X_test_final = test_clean.loc[:, features]

# Make predictions
final_predictions = model.predict(X_test_final)

# Create submission file: original passenger ids next to the predicted labels.
submission = test_df[['PassengerId']].assign(Survived=final_predictions)

predicted_survivors = final_predictions.sum()
predicted_deaths = len(final_predictions) - predicted_survivors

print(f"-----Prediction results-----")
print(f"  - Predicted survivors: {predicted_survivors}")
print(f"  - Predicted deaths: {predicted_deaths}")
print(f"  - Survival rate: {final_predictions.mean():.1%}")

In [None]:
# Write the submission and report the path that was actually used; the message
# previously named 'my_titanic_predictions.csv', a file that is never written.
SUBMISSION_PATH = 'submission.csv'
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"\nPredictions saved to '{SUBMISSION_PATH}'")

# Show first 10 predictions
print("\n👀 First 10 predictions:")
print(submission.head(10))

In [None]:
# Recap of the run; the two rates are taken from the raw training frame.
women_rate = train_df.loc[train_df['Sex'] == 'female', 'Survived'].mean()
first_class_rate = train_df.loc[train_df['Pclass'] == 1, 'Survived'].mean()

print(f"\n-------------Accomplishments----------------")
print(f"Loaded and explored {len(train_df)} passenger records")
print(f"Found that women had {women_rate:.1%} survival rate")
print(f"Found that first-class had {first_class_rate:.1%} survival rate")
print(f"Built a model with {accuracy:.1%} accuracy")
print(f"Made predictions for {len(test_df)} new passengers")