In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

In [2]:
# 1. Read the Stata dataset from disk and preview its first rows
dta_path = "binary.dta"
data = pd.read_stata(dta_path)
print(data.head())

   admit    gre   gpa  rank
0    0.0  380.0  3.61   3.0
1    1.0  660.0  3.67   3.0
2    1.0  800.0  4.00   1.0
3    1.0  640.0  3.19   4.0
4    0.0  520.0  2.93   4.0


In [4]:
# 2. Explore the data
print("\n--- Data Info ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() appended a stray "None" line.
data.info()


--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   admit   400 non-null    float32
 1   gre     400 non-null    float32
 2   gpa     400 non-null    float32
 3   rank    400 non-null    float32
dtypes: float32(4)
memory usage: 6.4 KB
None


In [5]:
# Tally of rows for each distinct value in the admit column
admit_counts = data['admit'].value_counts()
print("\n--- Value counts for admit ---")
print(admit_counts)



--- Value counts for admit ---
admit
0.0    273
1.0    127
Name: count, dtype: int64


In [6]:
# Average gre and gpa within each admit group
print(
    "\n--- Mean GRE & GPA by Admission ---",
    data.groupby('admit')[['gre', 'gpa']].mean(),
    sep="\n",
)


--- Mean GRE & GPA by Admission ---
              gre       gpa
admit                      
0.0    573.186829  3.343700
1.0    618.897644  3.489213


In [7]:
# Number of rows at each rank value
rank_counts = data['rank'].value_counts()
print("\n--- Rank Frequency ---", rank_counts, sep="\n")


--- Rank Frequency ---
rank
2.0    151
3.0    121
4.0     67
1.0     61
Name: count, dtype: int64


In [12]:
# One-hot encode `rank`, dropping the first level as the reference category.
# No visible cell performs this step, yet every later cell reads
# rank_2.0 / rank_3.0 / rank_4.0, so a fresh Restart & Run All would fail with
# a KeyError. The guard keeps the cell safe to re-run once `rank` is encoded.
if 'rank' in data.columns:
    data = pd.get_dummies(data, columns=['rank'], drop_first=True)

print(data.columns)


Index(['admit', 'gre', 'gpa', 'rank_2.0', 'rank_3.0', 'rank_4.0'], dtype='object')


In [16]:
# Design matrix: gre and gpa plus the rank indicator columns already present
# in `data`, with a constant column added to serve as the intercept
predictor_cols = ['gre', 'gpa', 'rank_2.0', 'rank_3.0', 'rank_4.0']
X = sm.add_constant(data[predictor_cols])
# Response variable
y = data['admit']

In [19]:
# Per-column count of missing values
missing_per_column = data.isna().sum()
print(missing_per_column)


admit       0
gre         0
gpa         0
rank_2.0    0
rank_3.0    0
rank_4.0    0
dtype: int64


In [20]:
# Show the dtype of every column
column_dtypes = data.dtypes
print(column_dtypes)


admit       float32
gre         float32
gpa         float32
rank_2.0       bool
rank_3.0       bool
rank_4.0       bool
dtype: object


In [21]:
# Cast the rank indicator columns from bool to int (0/1) in a single step
dummy_cols = ['rank_2.0', 'rank_3.0', 'rank_4.0']
data[dummy_cols] = data[dummy_cols].astype(int)

print(data.dtypes)  # confirm the indicators are integer-typed now


admit       float32
gre         float32
gpa         float32
rank_2.0      int32
rank_3.0      int32
rank_4.0      int32
dtype: object


In [22]:
# Rebuild the design matrix now that the rank indicators are numeric
predictor_cols = ['gre', 'gpa', 'rank_2.0', 'rank_3.0', 'rank_4.0']
X = sm.add_constant(data[predictor_cols])
y = data['admit']

# Fit the logistic regression by maximum likelihood and report the results
model = sm.Logit(y, X)
result = model.fit()
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.573147
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  admit   No. Observations:                  400
Model:                          Logit   Df Residuals:                      394
Method:                           MLE   Df Model:                            5
Date:                Sat, 13 Sep 2025   Pseudo R-squ.:                 0.08292
Time:                        21:51:43   Log-Likelihood:                -229.26
converged:                       True   LL-Null:                       -249.99
Covariance Type:            nonrobust   LLR p-value:                 7.578e-08
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.9900      1.140     -3.500      0.000      -6.224      -1.756
gre            0.0023      0.