In [None]:

import io

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pointbiserialr, chi2_contingency



In [None]:
# Read the training CSV from disk into the working DataFrame `data`.
path = '/mnt/data/train.csv'
data = pd.read_csv(filepath_or_buffer=path)



In [None]:
# Describe the dataset
# Summary statistics for every column, numeric and non-numeric alike.
desc = data.describe(include='all')

# DataFrame.info() writes its report to an output buffer and returns None, so
# assigning its return value would leave `info` set to None. Capture the
# report text in a string buffer instead, then print it so the cell still
# displays the report.
_info_buffer = io.StringIO()
data.info(buf=_info_buffer)
info = _info_buffer.getvalue()
print(info, end='')



In [None]:
# Missing-value audit: per-column null counts plus the share of affected rows.
null_mask = data.isnull()
null_values = null_mask.sum()
# Number of rows that contain at least one null cell.
num_rows_with_null = null_mask.any(axis=1).sum()
# Despite the name, this value is a percentage (scaled by 100), not a 0-1 fraction.
fraction_rows_with_null = (num_rows_with_null / len(data)) * 100



In [None]:
# Variance of 'Transported' within each level of the low-cardinality
# categorical columns.
categorical_cols = data.select_dtypes(include=['object', 'category']).columns
# Keep only the columns with at most 10 distinct (non-null) values.
distinct_counts = data[categorical_cols].nunique()
filtered_cols = distinct_counts[distinct_counts <= 10].index.tolist()
# Maps each retained column name to a Series of per-level variances.
variances = dict(
    (column, data.groupby(column)['Transported'].var())
    for column in filtered_cols
)



In [None]:
# Draw one histogram per numeric column other than 'Age', marking the mean.
# `numeric_cols` is reused by later cells, so it is built here before any
# derived integer columns are added to `data`.
numeric_cols = list(data.select_dtypes(include=['float64', 'int64']).columns)
numeric_cols.remove('Age')
for col in numeric_cols:
    values = data[col]
    fig, ax = plt.subplots()
    ax.hist(values, bins=30, edgecolor='k', alpha=0.65)
    # Dashed red vertical line at the column mean.
    ax.axvline(values.mean(), color='r', linestyle='dashed', linewidth=1)
    ax.set_title(col)
    plt.show()



In [None]:
# Filling null values in categorical variables
# Each filled result is assigned back to its column. Calling
# fillna(inplace=True) on `data[...]` operates on a selection of the frame:
# pandas 2.x warns about it, and with Copy-on-Write enabled the parent frame
# is left unchanged.
data[filtered_cols] = data[filtered_cols].fillna('Unknown')
data['Cabin'] = data['Cabin'].fillna('Unknown')
data['Name'] = data['Name'].fillna('Unknown')



In [None]:
# Deriving 'Deck', 'Number', and 'Side' from the 'Cabin' column
# NOTE(review): assumes cabin strings look like 'deck/num/side' (e.g. 'B/0/P')
# - confirm against the dataset description.
# A cabin counts as known when it is neither missing nor the 'Unknown'
# placeholder; every other row receives the lowercase 'unknown' marker in all
# three derived columns.
cabin = data['Cabin']
cabin_known = cabin.notna() & (cabin != 'Unknown')
# First character is the deck, last character is the side.
data['Deck'] = cabin.str[0].where(cabin_known, 'unknown')
# The middle slice still carries the separators (e.g. '/0/'), so strip them
# to leave only the cabin number.
data['Number'] = cabin.str[1:-1].str.strip('/').where(cabin_known, 'unknown')
data['Side'] = cabin.str[-1].where(cabin_known, 'unknown')



In [None]:
# For every column in `numeric_cols` (the numeric columns excluding 'Age'),
# add a 0/1 '<column>_used' indicator that is 1 when the value is positive.
# Missing values compare as not greater than zero and therefore become 0.
used_flags = [f'{col}_used' for col in numeric_cols]
for col, flag in zip(numeric_cols, used_flags):
    data[flag] = data[col].gt(0).astype(int)
# True when at least one of the indicators above is set for the row.
data['spent_money'] = data[used_flags].sum(axis=1).gt(0)



In [None]:
# Categorizing records based on age and filling null values in "Age" with its median
bins = [0, 26, 46, 61, np.inf]
labels = ['0-25', '26-45', '46-60', '60+']
# right=False makes each interval closed on the left and open on the right:
# [0, 26), [26, 46), [46, 61), [61, inf). The '60+' bucket therefore starts
# at 61.
# Binning happens before the median imputation below so that rows with a
# missing age land in the explicit 'Unknown' category rather than in the
# median's bucket.
age_group = pd.cut(data['Age'], bins=bins, labels=labels, right=False)
# Results are assigned back rather than using fillna(inplace=True) on
# `data[...]`, which warns in pandas 2.x and leaves `data` unchanged under
# Copy-on-Write.
data['Age_group'] = age_group.cat.add_categories(['Unknown']).fillna('Unknown')
data['Age'] = data['Age'].fillna(data['Age'].median())



In [None]:
# Build a 'Route' label by joining the home planet and the destination
# with a hyphen.
origin = data['HomePlanet']
destination = data['Destination']
data['Route'] = origin + "-" + destination



In [None]:
# Remove the listed columns from `data` in place.
columns_to_drop = ['Name', 'PassengerId', 'HomePlanet', 'Cabin', 'Destination', 'Number']
data.drop(columns_to_drop, axis='columns', inplace=True)



In [None]:
# Replace every column in `numeric_cols` (numeric columns excluding 'Age')
# with log(1 + x), applied to all of them in a single vectorized call.
data[numeric_cols] = np.log1p(data[numeric_cols])



In [None]:
# One-hot encode every object, bool and category column; drop_first=False
# keeps an indicator column for every level.
columns_to_encode = data.select_dtypes(include=['object', 'bool', 'category']).columns
data_encoded = pd.get_dummies(data, columns=columns_to_encode, drop_first=False)
