# **Exploratory Data Analysis (EDA) Without Visualization**

## **Usual Data Handling**

In [None]:
## **1️⃣ Importing Necessary Libraries**
import pandas as pd  
import numpy as np  
import seaborn as sns  
from scipy.stats import zscore  

## **2️⃣ Loading the Titanic Dataset**
# The Titanic data is fetched through seaborn's example-dataset loader
df = sns.load_dataset('titanic')

## **3️⃣ Basic Dataset Information**
# Preview both ends of the frame (5 rows each is the head()/tail() default)
head_rows = df.head()
tail_rows = df.tail()
print("First 5 Rows:\n", head_rows)
print("Last 5 Rows:\n", tail_rows)

# (rows, columns)
dataset_shape = df.shape
print("Dataset Shape:", dataset_shape)

# info() writes its report straight to stdout, so only the heading is printed here
print("Column Information:\n")
df.info()

## **4️⃣ Handling Missing Values**
# Per-column count of missing entries, taken before any imputation
print("\nMissing Values Per Column:\n", df.isnull().sum())

# Numerical column: impute with the median.
# The result is assigned back instead of calling fillna(inplace=True) on
# df['age']: an inplace call on a selected column is chained assignment, which
# warns on recent pandas and does not update `df` under Copy-on-Write.
df['age'] = df['age'].fillna(df['age'].median())

# Categorical column: impute with the most frequent value (first mode)
df['embark_town'] = df['embark_town'].fillna(df['embark_town'].mode()[0])
# NOTE(review): `embarked` is not imputed here, so any missing entries in it
# remain - consider filling it alongside `embark_town`.

# Dropping columns with too many missing values
df = df.drop(columns=['deck'])

## **5️⃣ Handling Duplicates**
# Count rows that exactly repeat an earlier row across all columns
duplicate_count = df.duplicated().sum()
print("\nTotal Duplicate Rows:", duplicate_count)

# Keep the first occurrence of each row; the original index labels are retained
df = df.drop_duplicates()

## **6️⃣ Summary Statistics**

In [2]:
# describe() with no arguments summarises the numeric columns
numeric_summary = df.describe()
print("\nSummary Statistics (Numerical):\n", numeric_summary)

# include=['O'] restricts the summary to object-dtype columns
# (count / unique / top / freq)
object_summary = df.describe(include=['O'])
print("\nSummary Statistics (Categorical):\n", object_summary)



Summary Statistics (Numerical):
          survived      pclass         age       sibsp       parch        fare
count  775.000000  775.000000  775.000000  775.000000  775.000000  775.000000
mean     0.412903    2.246452   29.581187    0.529032    0.420645   34.878403
std      0.492674    0.853574   13.766359    0.990326    0.840565   52.408474
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    1.000000   21.000000    0.000000    0.000000    8.050000
50%      0.000000    3.000000   28.000000    0.000000    0.000000   15.900000
75%      1.000000    3.000000   36.000000    1.000000    1.000000   34.197900
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200

Summary Statistics (Categorical):
          sex embarked  who  embark_town alive
count    775      773  775          775   775
unique     2        3    3            3     2
top     male        S  man  Southampton    no
freq     483      560  443          562   455

## **7️⃣ Feature Analysis**

In [3]:
# Frequency of every level for each categorical feature
categorical_features = ['sex', 'embark_town', 'class', 'who', 'alive']
for feature in categorical_features:
    level_counts = df[feature].value_counts()
    print(f"\nUnique Values in {feature}:\n", level_counts)

# Tails (1% / 99%) and quartiles for each numerical feature
numerical_features = ['age', 'fare', 'sibsp', 'parch']
percentile_points = [0.01, 0.25, 0.5, 0.75, 0.99]
for feature in numerical_features:
    feature_percentiles = df[feature].quantile(percentile_points)
    print(f"\n{feature} Percentiles:\n", feature_percentiles)


Unique Values in sex:
 sex
male      483
female    292
Name: count, dtype: int64

Unique Values in embark_town:
 embark_town
Southampton    562
Cherbourg      155
Queenstown      58
Name: count, dtype: int64

Unique Values in class:
 class
Third     401
First     210
Second    164
Name: count, dtype: int64

Unique Values in who:
 who
man      443
woman    250
child     82
Name: count, dtype: int64

Unique Values in alive:
 alive
no     455
yes    320
Name: count, dtype: int64

age Percentiles:
 0.01     1.00
0.25    21.00
0.50    28.00
0.75    36.00
0.99    65.26
Name: age, dtype: float64

fare Percentiles:
 0.01      0.0000
0.25      8.0500
0.50     15.9000
0.75     34.1979
0.99    262.3750
Name: fare, dtype: float64

sibsp Percentiles:
 0.01    0.0
0.25    0.0
0.50    0.0
0.75    1.0
0.99    4.0
Name: sibsp, dtype: float64

parch Percentiles:
 0.01    0.0
0.25    0.0
0.50    0.0
0.75    1.0
0.99    4.0
Name: parch, dtype: float64


## **8️⃣ Feature Correlation Analysis**

In [10]:
import pandas as pd
import numpy as np

# Work on a copy so the one-hot encoding never touches the working dataframe `df`
df_encoded = df.copy()

# Identify categorical columns
categorical_columns = df_encoded.select_dtypes(include=['object', 'category']).columns

# Cast categorical columns to plain strings before encoding.
# Missing values become the literal string 'nan' at this point, so they end up
# with a dummy column of their own (e.g. `embarked_nan`).
df_encoded[categorical_columns] = df_encoded[categorical_columns].astype(str)

# Apply One-Hot Encoding; drop_first=True omits the first level of each feature
df_encoded = pd.get_dummies(df_encoded, columns=categorical_columns, drop_first=True)

# Cast every column (boolean flags and dummies included) to float for corr()
df_encoded = df_encoded.astype(float)

# Compute the correlation matrix once and reuse it below
signed_correlation = df_encoded.corr()
print("\nCorrelation Matrix:\n", signed_correlation)

# Finding highly correlated features (absolute correlation > 0.5).
# Boolean-masking the matrix directly would return the whole matrix padded with
# NaN, including the trivial 1.0 diagonal and every pair twice. Instead, keep
# only the strict upper triangle and list each qualifying pair once, strongest
# first.
correlation_matrix = signed_correlation.abs()
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
pair_strengths = correlation_matrix.where(upper_triangle).stack()
high_corr = (
    pair_strengths[pair_strengths > 0.5]  # NaN cells compare False and drop out
    .sort_values(ascending=False)
    .rename_axis(['feature_1', 'feature_2'])
    .reset_index(name='abs_corr')
)
print("\nHighly Correlated Features (Correlation > 0.5):\n", high_corr)



Correlation Matrix:
                          survived    pclass       age     sibsp     parch  \
survived                 1.000000 -0.331388 -0.078114 -0.037841  0.069864   
pclass                  -0.331388  1.000000 -0.342745  0.087050  0.038998   
age                     -0.078114 -0.342745  1.000000 -0.279316 -0.182697   
sibsp                   -0.037841  0.087050 -0.279316  1.000000  0.379535   
parch                    0.069864  0.038998 -0.182697  0.379535  1.000000   
fare                     0.247159 -0.554649  0.092503  0.133807  0.190823   
adult_male              -0.529158  0.069762  0.274454 -0.272326 -0.345630   
alone                   -0.176714  0.113778  0.190270 -0.607809 -0.569387   
sex_male                -0.516121  0.118507  0.093574 -0.095574 -0.235116   
embarked_Q              -0.039325  0.211009 -0.031798 -0.003388 -0.066534   
embarked_S              -0.135950  0.108069 -0.021726  0.063312  0.063260   
embarked_nan             0.060654 -0.074326  0.075495 

## **9️⃣ Outlier Detection**

In [11]:
# Using IQR (Interquartile Range) method
# A value is flagged when it lies more than 1.5 * IQR below Q1 or above Q3.
for col in ['fare', 'age']:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    outliers_iqr = df[(df[col] < lower_fence) | (df[col] > upper_fence)]
    print(f"\nOutliers Detected in {col} Using IQR:\n", outliers_iqr[[col]])

# Using Z-score method
# The scores are attached to a derived frame through assign() rather than being
# written into `df` and dropped again afterwards, so `df` is never modified and
# an interrupted run cannot leave a stray `fare_zscore` column behind.
fare_scored = df.assign(fare_zscore=zscore(df['fare']))
outliers_z = fare_scored[fare_scored['fare_zscore'].abs() > 3]
print("\nOutliers Detected in Fare Using Z-score:\n", outliers_z[['fare', 'fare_zscore']])


Outliers Detected in fare Using IQR:
          fare
27   263.0000
31   146.5208
34    82.1708
52    76.7292
61    80.0000
..        ...
829   80.0000
835   83.1583
849   89.1042
856  164.8667
879   83.1583

[102 rows x 1 columns]

Outliers Detected in age Using IQR:
       age
33   66.0
54   65.0
94   59.0
96   71.0
116  70.5
170  61.0
232  59.0
252  62.0
275  63.0
280  65.0
326  61.0
366  60.0
438  64.0
456  65.0
483  63.0
493  71.0
545  64.0
570  62.0
587  60.0
625  61.0
630  80.0
672  70.0
684  60.0
694  60.0
745  70.0
829  62.0
851  74.0

Outliers Detected in Fare Using Z-score:
          fare  fare_zscore
27   263.0000     4.355573
88   263.0000     4.355573
118  247.5208     4.060025
258  512.3292     9.116066
299  247.5208     4.060025
311  262.3750     4.343639
341  263.0000     4.355573
377  211.5000     3.372273
380  227.5250     3.678241
438  263.0000     4.355573
527  221.7792     3.568535
557  227.5250     3.678241
679  512.3292     9.116066
689  211.3375     3.369170
700

## **🔟 Data Distribution Analysis**

In [12]:
# Report the shape of each numerical feature's distribution:
# skew() for asymmetry, kurtosis() (same method as kurt()) for tail weight
for col in numerical_features:
    values = df[col]
    skewness, kurtosis_value = values.skew(), values.kurtosis()
    print(f"\nSkewness of {col}: {skewness}")
    print(f"Kurtosis of {col}: {kurtosis_value}")


Skewness of age: 0.44198678493510685
Kurtosis of age: 0.567323209153018

Skewness of fare: 4.549950352869661
Kurtosis of fare: 29.905898390901694

Skewness of sibsp: 3.036078087580425
Kurtosis of sibsp: 12.608665960050903

Skewness of parch: 2.6133474892883943
Kurtosis of parch: 8.837563410273624


## **1️⃣1️⃣ Feature Engineering**

In [14]:
# Family size = siblings/spouses + parents/children + the passenger themselves
df['family_size'] = df['sibsp'].add(df['parch']).add(1)

# Binary flag: 1 when the passenger has no family aboard (family_size == 1), else 0
# NOTE(review): the frame already carries an `alone` column; `is_alone`
# presumably mirrors it - confirm before keeping both.
df['is_alone'] = df['family_size'].eq(1).astype(int)

# Summary of the two engineered columns
new_feature_summary = df[['family_size', 'is_alone']].describe()
print("\nNew Feature Summary:\n", new_feature_summary)

## **🔚 Conclusion**
final_shape = df.shape
print("\nFinal Dataset Shape After EDA:", final_shape)



New Feature Summary:
        family_size    is_alone
count   775.000000  775.000000
mean      1.949677    0.563871
std       1.522882    0.496224
min       1.000000    0.000000
25%       1.000000    0.000000
50%       1.000000    1.000000
75%       2.000000    1.000000
max      11.000000    1.000000

Final Dataset Shape After EDA: (775, 16)
