<a href="https://colab.research.google.com/github/JaynthReddy91/MachineLearning/blob/main/titanic_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load dataset
df = pd.read_csv("titanic_train.csv")

# 1. Constant Features
constant_features = [col for col in df.columns if df[col].nunique() == 1]
print("Constant Features:", constant_features)

# 2. Quasi-constant Features (threshold = 99%)
quasi_constant_features = []
for col in df.columns:
    top_freq = df[col].value_counts(normalize=True, dropna=False).values[0]
    if top_freq > 0.99:
        quasi_constant_features.append(col)
print("Quasi-constant Features:", quasi_constant_features)

# 3. Duplicate Features
duplicate_features = []
for i in range(len(df.columns)):
    for j in range(i+1, len(df.columns)):
        if df.iloc[:, i].equals(df.iloc[:, j]):
            duplicate_features.append(df.columns[j])
print("Duplicate Features:", duplicate_features)


Constant Features: []
Quasi-constant Features: []
Duplicate Features: []


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import chi2, f_classif, f_regression, SelectKBest
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("titanic_train.csv")

# Example target (Survived = classification, Age = regression)
y_class = df["Survived"]
y_reg = df["Age"].fillna(df["Age"].median())  # fill NA for regression
X = df.drop(columns=["Survived", "Age"])

# Convert categorical to numeric for statistical tests
X = pd.get_dummies(X, drop_first=True)

# ------------------------
# 1. Correlation
# ------------------------
corr_matrix = X.corr()
print("Correlation Matrix:\n", corr_matrix)

# ------------------------
# 2. Mutual Information
# ------------------------
mi_class = mutual_info_classif(X.fillna(0), y_class)
mi_reg = mutual_info_regression(X.fillna(0), y_reg)

print("Mutual Information (Classification):", dict(zip(X.columns, mi_class)))
print("Mutual Information (Regression):", dict(zip(X.columns, mi_reg)))

# ------------------------
# 3. Chi-Square (Classification, categorical data only)
# ------------------------
X_chi2 = X.fillna(0)
chi_scores, p_values = chi2(X_chi2, y_class)
print("Chi-Square Scores:", dict(zip(X.columns, chi_scores)))
print("Chi-Square p-values:", dict(zip(X.columns, p_values)))

# ------------------------
# 4. ANOVA / Univariate Feature Selection
# ------------------------
anova_class = SelectKBest(score_func=f_classif, k=5).fit(X.fillna(0), y_class)
anova_reg = SelectKBest(score_func=f_regression, k=5).fit(X.fillna(0), y_reg)

print("ANOVA (Classification) Scores:", dict(zip(X.columns, anova_class.scores_)))
print("ANOVA (Regression) Scores:", dict(zip(X.columns, anova_reg.scores_)))

# ------------------------
# 5. Univariate ROC-AUC (Classification) & MSE (Regression)
# ------------------------
X_train, X_test, y_train, y_test = train_test_split(X.fillna(0), y_class, test_size=0.3, random_state=42)

roc_auc_scores = {}
for col in X.columns:
    try:
        score = roc_auc_score(y_test, X_test[col])
        roc_auc_scores[col] = score
    except:
        pass
print("Univariate ROC-AUC (Classification):", roc_auc_scores)

# Regression MSE
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X.fillna(0), y_reg, test_size=0.3, random_state=42)
mse_scores = {}
for col in X.columns:
    preds = X_test_r[col]
    score = mean_squared_error(y_test_r, preds)
    mse_scores[col] = score
print("Univariate MSE (Regression):", mse_scores)


Correlation Matrix:
              PassengerId    Pclass     SibSp     Parch      Fare  \
PassengerId     1.000000 -0.035144 -0.057527 -0.001652  0.012658   
Pclass         -0.035144  1.000000  0.083081  0.018443 -0.549500   
SibSp          -0.057527  0.083081  1.000000  0.414838  0.159651   
Parch          -0.001652  0.018443  0.414838  1.000000  0.216225   
Fare            0.012658 -0.549500  0.159651  0.216225  1.000000   
...                  ...       ...       ...       ...       ...   
Cabin_F4       -0.008206 -0.017519  0.063564  0.036410  0.006490   
Cabin_G6       -0.060049  0.055561 -0.001402  0.072388 -0.025180   
Cabin_T        -0.013814 -0.052496 -0.015907 -0.015878  0.002224   
Embarked_Q     -0.033606  0.221009 -0.026354 -0.081228 -0.117216   
Embarked_S      0.022148  0.081720  0.070941  0.063036 -0.166603   

             Name_Abbott, Mr. Rossmore Edward  \
PassengerId                          0.039227   
Pclass                               0.027734   
SibSp          

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFE
from itertools import combinations
import numpy as np

# Load dataset
df = pd.read_csv("titanic_train.csv")

# Define features (X) and target (y)
X = pd.get_dummies(df.drop(columns=["Survived"]), drop_first=True).fillna(0)
y = df["Survived"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = LogisticRegression(max_iter=500)

# -----------------------------------
# 1. Forward Selection
# -----------------------------------
sfs_forward = SFS(model,
                  k_features="best",
                  forward=True,
                  floating=False,
                  scoring="accuracy",
                  cv=5)
sfs_forward = sfs_forward.fit(X_train, y_train)
print("Forward Selection Features:", sfs_forward.k_feature_names_)

# -----------------------------------
# 2. Backward Selection
# -----------------------------------
sfs_backward = SFS(model,
                   k_features="best",
                   forward=False,
                   floating=False,
                   scoring="accuracy",
                   cv=5)
sfs_backward = sfs_backward.fit(X_train, y_train)
print("Backward Selection Features:", sfs_backward.k_feature_names_)

# -----------------------------------
# 3. Exhaustive Search (all possible combinations)
# -----------------------------------
sfs_exhaustive = SFS(model,
                     k_features=3,    # example: best 3 features
                     forward=True,
                     floating=False,
                     scoring="accuracy",
                     cv=5)
sfs_exhaustive = sfs_exhaustive.fit(X_train, y_train)
print("Exhaustive Search Best Features:", sfs_exhaustive.k_feature_names_)

# -----------------------------------
# 4. Recursive Feature Elimination (RFE)
# -----------------------------------
rfe = RFE(estimator=model, n_features_to_select=5)   # keep 5 best features
rfe = rfe.fit(X_train, y_train)

# i) Ranking Features
print("RFE Feature Ranking:", dict(zip(X.columns, rfe.ranking_)))

# ii) Selected Features
print("Selected Features by RFE:", X.columns[rfe.support_].tolist())

# iii) Stopping Criterion is defined by `n_features_to_select`


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("titanic_train.csv")

# Features and target
X = pd.get_dummies(df.drop(columns=["Survived"]), drop_first=True).fillna(0)
y = df["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# ------------------------------
# 1. L1 Regularization (LASSO)
# ------------------------------
lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)
lasso_coef = pd.Series(lasso.coef_, index=X.columns)
print("LASSO Selected Features:", list(lasso_coef[lasso_coef != 0].index))

# ------------------------------
# 2. L2 Regularization (Ridge)
# ------------------------------
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
ridge_coef = pd.Series(ridge.coef_, index=X.columns)
print("Ridge Feature Coefficients:\n", ridge_coef)

# ------------------------------
# 3. Elastic Net (L1 + L2)
# ------------------------------
elastic = ElasticNet(alpha=0.01, l1_ratio=0.5)
elastic.fit(X_train, y_train)
elastic_coef = pd.Series(elastic.coef_, index=X.columns)
print("Elastic Net Selected Features:", list(elastic_coef[elastic_coef != 0].index))

# ------------------------------
# 4. Decision Tree
# ------------------------------
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_importance = pd.Series(dt.feature_importances_, index=X.columns)
print("Decision Tree Important Features:", dt_importance.sort_values(ascending=False).head(10))

# ------------------------------
# 5. Random Forest
# ------------------------------
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_importance = pd.Series(rf.feature_importances_, index=X.columns)
print("Random Forest Important Features:", rf_importance.sort_values(ascending=False).head(10))

# ------------------------------
# 6. Support Vector Machine (SVM with L1 penalty)
# ------------------------------
svm = LogisticRegression(penalty="l1", solver="liblinear", max_iter=500)
svm.fit(X_train, y_train)
svm_coef = pd.Series(svm.coef_[0], index=X.columns)
print("SVM (L1) Selected Features:", list(svm_coef[svm_coef != 0].index))


LASSO Selected Features: ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Fare', 'Sex_male', 'Embarked_S']
Ridge Feature Coefficients:
 PassengerId    0.000024
Pclass        -0.134211
Age           -0.001835
SibSp         -0.012512
Parch         -0.008556
                 ...   
Cabin_F4       0.170569
Cabin_G6      -0.082044
Cabin_T       -0.080701
Embarked_Q    -0.042402
Embarked_S    -0.088150
Length: 1725, dtype: float64
Elastic Net Selected Features: ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_S']
Decision Tree Important Features: Sex_male             0.283660
Fare                 0.100795
Pclass               0.087120
PassengerId          0.034328
Age                  0.021430
SibSp                0.013234
Ticket_113781        0.013014
Ticket_C.A. 37671    0.010100
Cabin_E24            0.009988
Cabin_F2             0.009877
dtype: float64
Random Forest Important Features: Sex_male       0.132107
Fare           0.066437
PassengerId    0.049036
Pclass