In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Read the training and test splits (in that order) from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

# Bare last expression so the notebook renders the training frame.
train

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
import statsmodels.api as sm
from matplotlib.pyplot import subplots

In [4]:
# Show the training frame's dimensions as a (rows, columns) tuple.
train.shape

(891, 12)

In [5]:
def create_new_x(df):
    """Build the numeric design matrix for the survival regression.

    The input frame is copied, never mutated. Steps, in order:

    1. Add ``isAgeKnown`` (1 where ``Age`` is present, else 0).
    2. Fill missing ``Age`` and, when the column exists, missing ``Fare`` with
       the median of *this* frame.
    3. Bucket ``Parch``, ``SibSp`` and ``Age`` into labelled groups.
    4. Add ``TotSibParch`` (``SibSp + Parch``) and its square ``TotSibParch2``.
    5. Drop the raw identifier/text columns and the raw ``Age``, ``Parch`` and
       ``SibSp`` columns, dummy-encode what is left with ``drop_first=True``
       and prepend an intercept column named ``const``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PassengerId``, ``Name``, ``Ticket``, ``Cabin``, ``Age``,
        ``Parch`` and ``SibSp``; any other column is treated as a predictor.

    Returns
    -------
    pandas.DataFrame
        All-float design matrix whose first column is ``const`` (always 1.0).

    Raises
    ------
    KeyError
        If one of the required columns listed above is missing.

    Notes
    -----
    * Dummy columns exist only for categories observed in ``df``, so a matrix
      built from another frame must be re-aligned to the training columns.
    * ``TotSibParch`` is encoded as a categorical while ``TotSibParch2`` is a
      deterministic function of it, so ``TotSibParch2`` is perfectly collinear
      with those dummies. It is kept so the returned columns stay unchanged.
    """
    new_df = df.copy()

    # Record whether the age was observed before imputing it away.
    new_df["isAgeKnown"] = new_df["Age"].notnull().astype(int)

    # Fill missing ages with the median of this frame.
    new_df["Age"] = new_df["Age"].fillna(new_df["Age"].median())

    # A missing Fare would otherwise flow through as NaN and produce a NaN
    # prediction downstream; impute it the same way as Age.
    if "Fare" in new_df.columns:
        new_df["Fare"] = new_df["Fare"].fillna(new_df["Fare"].median())

    def count_group(x, prefix):
        """Bucket a relatives count into 0 / 1-2 / 3-6 / 7+ with a label prefix."""
        if x == 0:
            return f"{prefix}_0"
        elif x < 3:
            return f"{prefix}_1to2"
        elif x <= 6:
            return f"{prefix}_3to6"
        else:
            return f"{prefix}_7plus"

    def age_group(x):
        """Bucket an age (in years) into a life-stage label."""
        if x < 12:
            return "Child"
        elif x < 20:
            return "Teen"
        elif x < 35:
            return "Adult"
        elif x < 50:
            return "MiddleAged"
        else:
            return "Senior"

    new_df["ParchGroup"] = new_df["Parch"].apply(count_group, args=("Parch",))
    new_df["SibspGroup"] = new_df["SibSp"].apply(count_group, args=("Sibsp",))
    new_df["AgeGroup"] = new_df["Age"].apply(age_group)

    # Total family size feature
    new_df["TotSibParch"] = new_df["SibSp"] + new_df["Parch"]
    new_df["TotSibParch2"] = new_df["TotSibParch"] ** 2

    # Drop identifier/text columns and the raw columns replaced by groups.
    predictors = new_df.columns.drop(
        ["PassengerId", "Name", "Ticket", "Cabin", "Age", "Parch", "SibSp"]
    )

    # Treat the family-size count as a categorical, not a linear term.
    categorical_ints = ["TotSibParch"]
    for col in categorical_ints:
        new_df[col] = new_df[col].astype("category")

    # Dummy encode, then force one float dtype: get_dummies can return bool
    # dummies next to int/float columns, which yields a mixed-dtype matrix.
    X = pd.get_dummies(new_df[predictors], drop_first=True).astype(float)

    # Always prepend the intercept. A "skip if a constant column already
    # exists" policy would omit it whenever some predictor happens to be
    # constant in this frame (e.g. isAgeKnown when every age is present).
    X.insert(0, "const", 1.0)

    return X



In [6]:
# Train set: build the float design matrix and the float response vector.
y = train["Survived"].astype(float)
X = create_new_x(train.drop(columns="Survived")).astype(float)

# Ordinary least squares of Survived on the engineered predictors.
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               Survived   R-squared:                       0.429
Model:                            OLS   Adj. R-squared:                  0.415
Method:                 Least Squares   F-statistic:                     29.67
Date:                Sat, 30 Aug 2025   Prob (F-statistic):           3.35e-90
Time:                        05:09:55   Log-Likelihood:                -372.17
No. Observations:                 891   AIC:                             790.3
Df Residuals:                     868   BIC:                             900.6
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      1

In [7]:
# Build the test design matrix and align it to the training columns:
# dummy levels absent from the test set become all-zero columns, and any
# level unseen in training is dropped.
test_X = create_new_x(test)
# Cast to float to mirror the training matrix, which was cast before fitting.
test_X = test_X.reindex(columns=X.columns, fill_value=0).astype(float)
# Impute any remaining missing predictor values with the training medians;
# otherwise the prediction is NaN and `NaN > 0.5` silently becomes 0.
test_X = test_X.fillna(X.median())
test_predictions = results.predict(test_X)

# Threshold the fitted values at 0.5 to obtain the 0/1 class label.
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": (test_predictions > 0.5).astype(int)
})
submission.to_csv("submission4.csv", index=False)

In [8]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor for every column of the training design matrix
# (intercept included), listed from most to least collinear.
design = X.values
vif = pd.DataFrame({
    "VIF": [variance_inflation_factor(design, col_idx) for col_idx in range(X.shape[1])],
    "Feature": X.columns,
})
print(vif.sort_values("VIF", ascending=False))


          VIF                 Feature
12        inf  SibspGroup_Sibsp_7plus
17        inf           TotSibParch_1
23        inf           TotSibParch_7
22        inf           TotSibParch_6
21        inf           TotSibParch_5
20        inf           TotSibParch_4
19        inf           TotSibParch_3
18        inf           TotSibParch_2
24        inf          TotSibParch_10
4         inf            TotSibParch2
0   30.920937                   const
11  10.071319   SibspGroup_Sibsp_3to6
8    7.251330   ParchGroup_Parch_1to2
10   6.666780   SibspGroup_Sibsp_1to2
9    4.994214   ParchGroup_Parch_3to6
1    1.852900                  Pclass
13   1.802195          AgeGroup_Child
2    1.734043                    Fare
6    1.609998              Embarked_Q
7    1.514337              Embarked_S
3    1.476160              isAgeKnown
14   1.342198     AgeGroup_MiddleAged
15   1.257823         AgeGroup_Senior
16   1.197898           AgeGroup_Teen
5    1.193584                Sex_male


  vif = 1. / (1. - r_squared_i)
