<a href="https://colab.research.google.com/github/GiX007/test_repo/blob/main/sample_nb1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Sample Titanic Data Analysis Notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the Titanic dataset
titanic = pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')

# Display the first few rows
print(titanic.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [2]:
# Basic data exploration
print("Dataset shape:", titanic.shape)
print("\nData types:")
print(titanic.dtypes)

Dataset shape: (891, 12)

Data types:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [3]:
# Group by operation - Survival rate by passenger class
survival_by_class = titanic.groupby('Pclass')['Survived'].mean().reset_index()
print("\nSurvival rate by passenger class:")
print(survival_by_class)


Survival rate by passenger class:
   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363


In [4]:
# Create a separate dataframe with age information
age_df = titanic[['PassengerId', 'Age', 'Sex']].copy()
age_df['AgeGroup'] = pd.cut(age_df['Age'], bins=[0, 12, 18, 35, 60, 100],
                           labels=['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior'])

In [5]:
# Merge operation - Join age information back with main dataset
enhanced_titanic = titanic.merge(age_df[['PassengerId', 'AgeGroup']], on='PassengerId', how='left')
print("\nEnhanced dataset with age groups:")
print(enhanced_titanic.head())


Enhanced dataset with age groups:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked     AgeGroup  
0      0         A/5 21171   7.2500   NaN        S  Young Adult  
1      0          PC 17599  71.2833   C85        C        Adult  
2      0  STON/O2. 3101282   7.9250   NaN        S  Young Adult  
3      0            1

In [None]:
# Create a table with ticket fare statistics by passenger class and embarkation port
fare_stats = enhanced_titanic.pivot_table(
    values='Fare',
    index=['Pclass', 'Embarked'],
    columns=['Sex'],
    aggfunc=['mean', 'median', 'count']
)
print("\nFare statistics by class, port, and gender:")
print(fare_stats)


Fare statistics by class, port, and gender:
                       mean              median           count     
Sex                  female       male   female     male female male
Pclass Embarked                                                     
1      C         115.640309  93.536707  83.1583  61.6792     43   42
       Q          90.000000  90.000000  90.0000  90.0000      1    1
       S          99.026910  52.949947  79.6500  35.0000     48   79
2      C          25.268457  25.421250  24.0000  25.8604      7   10
       Q          12.350000  12.350000  12.3500  12.3500      2    1
       S          21.912687  19.232474  23.0000  13.0000     67   97
3      C          14.694926   9.352237  14.4583   7.2292     23   43
       Q          10.307833  11.924251   7.7500   7.7500     33   39
       S          18.670077  13.307149  14.4500   8.0500     88  265


In [None]:
# Reset MultiIndex before melting
fare_stats_reset = fare_stats.reset_index()

# FLATTEN multiindex columns into strings like "mean_male"
fare_stats_reset.columns = [
    '_'.join(map(str, col)).strip() if isinstance(col, tuple) else col
    for col in fare_stats_reset.columns
]

In [None]:
# Unpivot operation (melt) - Transform the pivot table back to long format
melted_fares = pd.melt(
    fare_stats_reset,
    id_vars=[('Pclass', ''), ('Embarked', '')],  # These are likely MultiIndex columns
    var_name='Metrics',
    value_name='Value'
)
print("\nMelted fare statistics:")
print(melted_fares.head())


Melted fare statistics:
   (Pclass, ) (Embarked, ) Metrics       Value
0           1            C    mean  115.640309
1           1            Q    mean   90.000000
2           1            S    mean   99.026910
3           2            C    mean   25.268457
4           2            Q    mean   12.350000


In [None]:
# Additional group by with multiple aggregation functions
survival_stats = titanic.groupby(['Sex', 'Pclass']).agg({
    'Survived': ['mean', 'count'],
    'Age': ['mean', 'median'],
    'Fare': ['mean', 'median']
}).reset_index()

print("\nComprehensive survival statistics:")
print(survival_stats)


Comprehensive survival statistics:
      Sex Pclass  Survived              Age               Fare          
                      mean count       mean median        mean    median
0  female      1  0.968085    94  34.611765   35.0  106.125798  82.66455
1  female      2  0.921053    76  28.722973   28.0   21.970121  22.00000
2  female      3  0.500000   144  21.750000   21.5   16.118810  12.47500
3    male      1  0.368852   122  41.281386   40.0   67.226127  41.26250
4    male      2  0.157407   108  30.740707   30.0   19.741782  13.00000
5    male      3  0.135447   347  26.507589   25.0   12.661633   7.92500
