In [1]:
import pandas as pd
import seaborn as sns
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest
from sklearn.impute import SimpleImputer



   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [13]:
# Load the Titanic dataset
titanic_data = sns.load_dataset('titanic')

# Display the first few rows of the dataset
print(titanic_data.head())

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [14]:
# Display tail of the dataset
print("\nTail of the dataset:")
print(titanic_data.tail())


Tail of the dataset:
     survived  pclass     sex   age  sibsp  parch   fare embarked   class  \
886         0       2    male  27.0      0      0  13.00        S  Second   
887         1       1  female  19.0      0      0  30.00        S   First   
888         0       3  female   NaN      1      2  23.45        S   Third   
889         1       1    male  26.0      0      0  30.00        C   First   
890         0       3    male  32.0      0      0   7.75        Q   Third   

       who  adult_male deck  embark_town alive  alone  
886    man        True  NaN  Southampton    no   True  
887  woman       False    B  Southampton   yes   True  
888  woman       False  NaN  Southampton    no  False  
889    man        True    C    Cherbourg   yes   True  
890    man        True  NaN   Queenstown    no   True  


In [15]:
# Count of non-null values for each column
print("\nCount of Non-Null Values:")
print(titanic_data.info())


Count of Non-Null Values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
None


In [16]:
from sklearn.impute import SimpleImputer

# Handling missing numerical data (age column)
numerical_features = ['age']
numerical_imputer = SimpleImputer(strategy='mean')
titanic_data[numerical_features] = numerical_imputer.fit_transform(titanic_data[numerical_features])

# Handling missing categorical data (embarked column)
categorical_features = ['embarked']
categorical_imputer = SimpleImputer(strategy='most_frequent')
titanic_data[categorical_features] = categorical_imputer.fit_transform(titanic_data[categorical_features])

# Handling missing categorical data (deck column)
titanic_data['deck'] = titanic_data['deck'].cat.add_categories('Unknown')
titanic_data['deck'] = titanic_data['deck'].fillna('Unknown')

# Handling missing categorical data (embark_town column)
titanic_data['embark_town'] = titanic_data['embark_town'].fillna('Unknown')

# Confirming that there are no more missing values
print("\nCount of Null Values after Imputation:")
print(titanic_data.isnull().sum())


Count of Null Values after Imputation:
survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64


In [17]:
print(titanic_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     891 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         891 non-null    category
 12  embark_town  891 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
None


In [18]:
# Statistical Analysis
print("Statistical Analysis:")
print(titanic_data.describe())

Statistical Analysis:
         survived      pclass         age       sibsp       parch        fare
count  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std      0.486592    0.836071   13.002015    1.102743    0.806057   49.693429
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    2.000000   22.000000    0.000000    0.000000    7.910400
50%      0.000000    3.000000   29.699118    0.000000    0.000000   14.454200
75%      1.000000    3.000000   35.000000    1.000000    0.000000   31.000000
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200


The statistical analysis provides a summary of the key statistical measures for various numerical columns in the Titanic dataset. Here are some insights that can be drawn from the provided summary statistics:

1. **Survived:** The mean indicates that approximately 38% of the passengers survived. This suggests that the dataset is slightly imbalanced, with more non-survivors than survivors.

2. **Pclass (Passenger Class):** The mean value around 2.31 indicates that the majority of the passengers were in the second or third class. The standard deviation of 0.84 suggests a relatively wide distribution of passenger classes.

3. **Age:** The mean age of approximately 29.7 suggests that the passengers were generally young adults. The standard deviation of 13.0 indicates a moderate amount of variability in ages. The minimum age of 0.42 suggests the presence of infants among the passengers.

4. **Sibsp (Number of Siblings/Spouses Aboard):** The mean value of approximately 0.52 suggests that most passengers did not have siblings or spouses aboard. The maximum value of 8 indicates the presence of a few passengers with a large number of siblings or spouses.

5. **Parch (Number of Parents/Children Aboard):** The mean value of 0.38 suggests that most passengers did not have parents or children aboard. The maximum value of 6 indicates the presence of a few passengers with a relatively large number of parents or children.

6. **Fare:** The mean fare of approximately 32.20 suggests a moderately high fare on average. The wide standard deviation of 49.69 indicates a large variation in ticket prices, likely due to different passenger classes and other factors.


In [19]:
# T-test
print("\nT-test for Fare between Survived and Not Survived:")
survived_fare = titanic_data[titanic_data['survived'] == 1]['fare']
not_survived_fare = titanic_data[titanic_data['survived'] == 0]['fare']
t_stat, p_val = stats.ttest_ind(survived_fare, not_survived_fare, nan_policy='omit')
print("T-statistic:", t_stat)
print("P-value:", p_val)


T-test for Fare between Survived and Not Survived:
T-statistic: 7.939191660871055
P-value: 6.120189341924198e-15


- T-statistic: The T-statistic value of 7.94 indicates the magnitude of the difference between the means of the fare prices paid by the two groups. A higher absolute T-statistic value suggests a greater difference between the means.

- P-value: The very low P-value of 6.12e-15 suggests strong evidence against the null hypothesis, indicating that the difference in the means of the fare prices between the two groups is unlikely to have occurred due to random chance. In this case, the low P-value suggests that the difference is statistically significant.

Overall, the results of the t-test indicate that there is a significant association between the fare prices and the survival status of the passengers. This suggests that the fare prices may have played a role in determining the survival outcome of the passengers aboard the Titanic.

In [20]:
# Chi-squared test
print("\nChi-squared test for Survival and Class:")
contingency_table = pd.crosstab(titanic_data['survived'], titanic_data['class'])
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
print("Chi-squared statistic:", chi2)
print("P-value:", p)


Chi-squared test for Survival and Class:
Chi-squared statistic: 102.88898875696056
P-value: 4.549251711298793e-23


- Chi-squared statistic: The chi-squared statistic value of 102.89 represents the magnitude of the discrepancy between the expected and observed frequencies of survival and passenger class. A higher chi-squared value indicates a more substantial difference between the observed and expected frequencies.

- P-value: The very low P-value of 4.55e-23 suggests strong evidence against the null hypothesis, indicating that the observed association between survival and class is unlikely to have occurred due to chance. The low P-value suggests that there is a significant relationship between survival and class.

Based on these results, we can conclude that there is a strong association between the survival status of the passengers and the class they were traveling in. This suggests that the passenger class is a significant factor in determining the survival outcome during the Titanic disaster

In [21]:
# Proportions test for gender and survival
survived_sex = titanic_data.groupby('sex')['survived'].sum()
total_sex = titanic_data['sex'].value_counts()
stat, p_val = proportions_ztest(survived_sex, total_sex)
print("\nProportions Test for Gender and Survival:")
print("Z-statistic:", stat)
print("P-value:", p_val)


Proportions Test for Gender and Survival:
Z-statistic: 1.6619388877318888
P-value: 0.09652502845347238


- Z-statistic: The Z-statistic value of 1.66 measures the number of standard deviations that the proportion of survivors for males or females is away from the mean proportion. Since the Z-statistic is relatively close to 0, it suggests that the difference between the proportions of male and female survivors is not substantial.

- P-value: The P-value of 0.0965 suggests that the observed difference in the proportions of male and female survivors is not statistically significant at a conventional significance level (such as 0.05). Since the P-value is greater than 0.05, we fail to reject the null hypothesis that there is no difference in the proportions of male and female survivors.

In summary, the results indicate that there is no strong evidence to suggest a significant difference in the survival proportions between males and females. This implies that gender may not be a major determining factor for survival in the context of the Titanic dataset.

In [22]:
# Correlation test for age and fare
from scipy import stats 
print("\nCorrelation Test for Age and Fare:")
age_fare_corr, p_val_corr = stats.pearsonr(titanic_data['age'], titanic_data['fare'])
print("Correlation Coefficient:", age_fare_corr)
print("P-value:", p_val_corr)


Correlation Test for Age and Fare:
Correlation Coefficient: 0.09156609328505762
P-value: 0.006235676128297275


- Correlation Coefficient: The correlation coefficient of 0.0916 suggests a very weak positive linear relationship between the age and fare variables. This indicates that as one variable increases, the other variable also tends to increase slightly, but the relationship is not strong.

- P-value: The P-value of 0.0062 suggests that the observed correlation between age and fare is statistically significant. Since the P-value is less than the commonly used significance level of 0.05, we can reject the null hypothesis and conclude that there is a significant correlation between age and fare.

In summary, while there is a statistically significant correlation between age and fare, the correlation coefficient indicates that the relationship is very weak. This suggests that there is only a minimal tendency for the age and fare to increase together within the context of the Titanic dataset.

In [23]:
# One-way ANOVA test for age and class
print("\nOne-way ANOVA Test for Age and Class:")
class_groups = [titanic_data[titanic_data['class'] == i]['age'].dropna() for i in titanic_data['class'].unique()]
f_stat, p_val_anova = stats.f_oneway(*class_groups)
print("F-statistic:", f_stat)
print("P-value:", p_val_anova)



One-way ANOVA Test for Age and Class:
F-statistic: 56.57438528337169
P-value: 7.481182472787865e-24


- F-statistic: The F-statistic value of 56.57 represents the ratio of the variance between the group means to the variance within the groups. A higher F-statistic value suggests a greater difference between the group means relative to the variation within each group.

- P-value: The very low P-value of 7.48e-24 suggests strong evidence against the null hypothesis, indicating that the differences in the means of the age variable between the different passenger classes are unlikely to have occurred due to chance. The low P-value indicates that there is a significant difference in the mean ages across the passenger classes.

In summary, the results suggest that there is a statistically significant difference in the mean ages of passengers among different passenger classes. This implies that the passenger class has an influence on the ages of the passengers aboard the Titanic.

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [25]:
# Preprocessing
label_encoder = LabelEncoder()
titanic_data['sex'] = label_encoder.fit_transform(titanic_data['sex'])
titanic_data['embarked'] = titanic_data['embarked'].fillna(titanic_data['embarked'].mode()[0])
titanic_data['embarked'] = label_encoder.fit_transform(titanic_data['embarked'])
X = titanic_data[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']]
y = titanic_data['survived']
X = SimpleImputer(strategy='mean').fit_transform(X)

In [26]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Building and training the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

In [27]:
# Predicting the test set
y_pred = model.predict(X_test)

# Evaluating the model
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179

Confusion Matrix:
[[90 15]
 [19 55]]


**Classification Report:**

- Precision: The precision values for the '0' and '1' classes indicate the proportion of true positive predictions for each class. In this case, the precision values are 0.83 and 0.79, respectively, suggesting that the model is relatively precise in its predictions for both classes.

- Recall: The recall values for the '0' and '1' classes represent the proportion of actual positives that were correctly identified by the model. The recall values of 0.86 and 0.74 suggest that the model performs well in capturing both classes, with a slightly higher recall for class '0' compared to class '1'.

- F1-score: The F1-score combines precision and recall into a single metric, providing a balance between the two. The F1-scores of 0.84 and 0.76 suggest that the model has a good balance between precision and recall for both classes.
Accuracy: The overall accuracy of 0.81 indicates the proportion of correctly classified instances out of the total instances. An accuracy of 81% suggests that the model performs reasonably well in predicting the outcomes for the given dataset.

- Confusion Matrix:
The confusion matrix provides a more detailed breakdown of the model's performance, showing the counts of true positive, false positive, true negative, and false negative predictions. In this case, the confusion matrix shows that the model correctly predicted 90 instances of class '0' (survived) and 55 instances of class '1' (not survived). It also misclassified 15 instances of class '0' as class '1' and 19 instances of class '1' as class '0'.

Overall, the classification report and confusion matrix suggest that the model performs reasonably well in predicting the survival status of the passengers, with a relatively balanced performance in terms of precision, recall, and accuracy.