# Guruprasad Velikadu Krishnamoorthy
# DSC530- Assignment Week 9

## Initial Setup

In [1]:
# Import basename and exists from os.path; both are used in the download function
from os.path import basename, exists

# Define download_files, which downloads scripts and data files from GitHub to the local directory
def download_files(url):
    """
    Fetch the file at `url` into the current working directory.

    The local file name is the last path component of the URL. Nothing is
    downloaded when a file with that name is already present.
    """
    target = basename(url)
    # Already on disk: nothing to do.
    if exists(target):
        return
    # Imported lazily, only when a download is actually required.
    from urllib.request import urlretrieve
    saved_path, _headers = urlretrieve(url, target)
    # Confirm what was written.
    print("Downloaded " + saved_path)

In [2]:
# Download the .py modules and data files used throughout this assignment.
# Every file lives in the same GitHub folder, so only the file names vary.
thinkstats_code_url = "https://github.com/AllenDowney/ThinkStats2/raw/master/code/"
for required_file in (
    "thinkstats2.py",
    "thinkplot.py",
    "nsfg.py",
    "first.py",
    "2002FemPreg.dct",
    "2002FemPreg.dat.gz",
    "brfss.py",
    "CDBRFS08.ASC.gz",
    "chap01soln.py",
    "2002FemResp.dct",
    "2002FemResp.dat.gz",
):
    download_files(thinkstats_code_url + required_file)

In [3]:
# Importing nsfg and the other modules from the author's code
import nsfg
import thinkstats2
import thinkplot

# importing the required libraries
import numpy as np
import sys
from collections import defaultdict
import math
import itertools
import pandas as pd
import scipy.stats as sci
import random
import matplotlib.pyplot as plt
import first
import brfss
import chap01soln
import statsmodels.formula.api as smf
import patsy


# Exercise 11.1

#### ***Question:*** Suppose one of your co-workers is expecting a baby and you are participating in an office pool to predict the date of birth. Assuming that bets are placed during the 30th week of pregnancy, what variables could you use to make the best prediction? You should limit yourself to variables that are known before the birth, and likely to be available to the people in the pool.

#### ***Approach:*** This problem can be approached in two ways. One is to perform data mining: compute the R2 value of every variable in the dataset, find which variables have the most predictive power, and choose those that are known prior to the baby's birth. The other approach is to use commonly known variables that people are usually aware of before the baby's birth, such as race, the baby's sex, whether the mother is expecting twins, etc. Both approaches are demonstrated below.

In [4]:
# Create the live-births dataframe (plus the firsts/others frames) from the pregnancy dataset
live, firsts, others = first.MakeFrames()
# Bets are placed during the 30th week of pregnancy, so keep only pregnancies longer than 30 weeks.
live_births_30_df = live[live.prglngth>30]
# Read the respondents dataframe
respondent_df=chap01soln.ReadFemResp()
# Index the respondents dataframe by caseid so it shares a common key with the pregnancy data when joining.
respondent_df.index = respondent_df.caseid


In [5]:
# Build the `join` dataframe: the 30+ week live births joined with the respondents on "caseid".
# Respondent columns whose names clash with pregnancy columns get the suffix "_r".
join=live_births_30_df.join(respondent_df,on='caseid',rsuffix='_r')


In [6]:
# Approach 1: data-mining helper that scores every candidate explanatory variable by its R2
def create_Mining(join_df,fieldname):
    """Fit a one-variable ordinary least squares model for each column of the dataframe.

    join_df: source dataframe whose columns are tried one at a time.
    fieldname: name of the dependent variable.
    returns: list of (rsquared, variable name) pairs, in column order
    """
    scored_columns = []
    for name in join_df.columns:
        try:
            # Columns with (almost) no variance cannot explain anything; skip them.
            if join_df[name].var() < 1e-7:
                continue
            # Regress the dependent variable on this single column.
            formula = fieldname + ' ~ + ' + name
            candidate = smf.ols(formula, data=join_df)
            # Skip columns where fewer than half of the rows are usable observations
            # (i.e. columns made up mostly of NaNs).
            if candidate.nobs < len(join_df)/2:
                continue
            fitted = candidate.fit()
        except (ValueError, TypeError, patsy.PatsyError):
            # Columns that cannot be modelled (e.g. variance of a non-numeric field) are skipped.
            continue
        else:
            # Only reached when the fit succeeded and the column was not skipped above.
            scored_columns.append((fitted.rsquared, name))
    return scored_columns

In [7]:
# Run the mining function on the joined dataframe with pregnancy length as the dependent variable.
all_fields_R2 = create_Mining(join,'prglngth')
# Preview the first 10 (rsquared, variable name) pairs
all_fields_R2[:10]

[(3.7341851767624945e-05, 'caseid'),
 (0.0006222414860667103, 'pregordr'),
 (0.002249389433799265, 'pregend1'),
 (0.004577565785532922, 'nbrnaliv'),
 (0.0004069995445705743, 'cmprgend'),
 (1.3844054916889448e-06, 'cmprgbeg'),
 (0.0016571319550151564, 'gestasun_m'),
 (0.0010513799087199516, 'gestasun_w'),
 (0.8062434116139242, 'wksgest'),
 (0.09562431989592668, 'mosgest')]

In [8]:
# Sort in descending order so that the columns with the highest R2 come first
all_fields_R2.sort(reverse=True)
# Print the top 50 results from all_fields_R2
for rsq,col in all_fields_R2[:50]:
    print(rsq,col)

1.0 prglngth
0.8062434116139242 wksgest
0.12445743148120214 totalwgt_lb
0.11977307804917103 birthwgt_lb
0.10372542204583346 lbw1
0.09562431989592668 mosgest
0.02205377579646839 prglngth_i
0.006050495268196232 canhaver
0.005817755299879046 datcon01_i
0.005546376136235764 con1mar1_i
0.004577565785532922 nbrnaliv
0.0031508022538595526 mar1con1_i
0.0024520248837114345 anynurse
0.002369183944666786 bfeedwks
0.002249389433799265 pregend1
0.002243627968107287 rmarout11_i
0.002243627968107287 marout11_i
0.002243627968107287 marcon11_i
0.0020431424422022726 cmlastlb_r
0.0020431424422022726 cmlastlb
0.002012483392750064 datend02_i
0.002012483392750064 datcon02_i
0.0019882867688749695 agecon02_i
0.0019681593242574236 fmarcon5_i
0.0019528072368810712 ageprg02_i
0.0018917527758620656 evuseint
0.0018768219030150801 paydu
0.0017911410401607597 marcon03_i
0.0017799984332470542 anymschp
0.0016571319550151564 gestasun_m
0.0016125210616476648 hlpmc
0.0016007072270040057 diabetes
0.00159237909653831 marou

#### The mining function above returns the top 50 variables ranked by R2. One option is to pick from this list the fields that are known before the baby's birth, such as EVUSEINT, paydu, diabetes, etc., and compute a model from them. The other option is to create a model based on commonly known attributes.

In [9]:
# Model pregnancy length from four indicator terms: nbrnaliv > 1, first birth (birthord == 1),
# the baby's sex (babysex == 1) and race (race == 1).
# For this example we assume the baby is the first for the mother, the race is Black and the baby is a boy.
# NOTE(review): nbrnaliv > 1 presumably flags a multiple birth (more than one baby born alive from
# this pregnancy) rather than babies born in previous births — confirm against the NSFG codebook.
# NOTE(review): the model is fit on `live` (all live births), not on the 30+ week subset
# `live_births_30_df` built earlier — confirm this is intended.
model_ols_prglen = smf.ols('prglngth ~  nbrnaliv>1 + birthord==1+ babysex==1 + race==1 ', data=live)
# Fit the model and call summary() to display the regression statistics
results_ols_prglen = model_ols_prglen.fit()
results_ols_prglen.summary()

0,1,2,3
Dep. Variable:,prglngth,R-squared:,0.025
Model:,OLS,Adj. R-squared:,0.025
Method:,Least Squares,F-statistic:,58.76
Date:,"Sun, 14 May 2023",Prob (F-statistic):,4.66e-49
Time:,00:07:22,Log-Likelihood:,-21958.0
No. Observations:,9148,AIC:,43930.0
Df Residuals:,9143,BIC:,43960.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,38.6606,0.052,749.822,0.000,38.560,38.762
nbrnaliv > 1[T.True],-2.8989,0.211,-13.729,0.000,-3.313,-2.485
birthord == 1[T.True],0.0298,0.056,0.533,0.594,-0.080,0.139
babysex == 1[T.True],0.0861,0.056,1.543,0.123,-0.023,0.196
race == 1[T.True],-0.3978,0.063,-6.305,0.000,-0.521,-0.274

0,1,2,3
Omnibus:,5991.682,Durbin-Watson:,1.652
Prob(Omnibus):,0.0,Jarque-Bera (JB):,133303.027
Skew:,-2.799,Prob(JB):,0.0
Kurtosis:,20.843,Cond. No.,9.74


In [10]:
# Show the fitted coefficients together with their p-values
results_ols_prglen.params,results_ols_prglen.pvalues

(Intercept                38.660596
 nbrnaliv > 1[T.True]     -2.898877
 birthord == 1[T.True]     0.029797
 babysex == 1[T.True]      0.086136
 race == 1[T.True]        -0.397822
 dtype: float64,
 Intercept                0.000000e+00
 nbrnaliv > 1[T.True]     1.799105e-42
 birthord == 1[T.True]    5.943270e-01
 babysex == 1[T.True]     1.228084e-01
 race == 1[T.True]        3.009201e-10
 dtype: float64)

#### The results indicate that a multiple birth (nbrnaliv > 1; in the NSFG codebook nbrnaliv is the number of babies born alive from this pregnancy) is associated with a significantly shorter pregnancy (about 2.9 weeks), and its very small p-value indicates that the variable is statistically significant. First babies (birthord == 1) have a slightly longer pregnancy than others, though the p-value (0.594) indicates that the difference is not significant. Boys (babysex == 1) have a slightly longer pregnancy than girls, but with a p-value of 0.123 this is not statistically significant either. Black mothers (race == 1) have shorter pregnancies and, as the p-value is very small, this is a statistically significant variable.

# Exercise 11.3

#### ***Question:***  If the quantity you want to predict is a count, you can use Poisson regression, which is implemented in StatsModels with a function called poisson. It works the same way as ols and logit. As an exercise, let’s use it to predict how many children a woman has born; in the NSFG dataset, this variable is called numbabes. 

#### Suppose you meet a woman who is 35 years old, black, and a college graduate whose annual household income exceeds $75,000. How many children would you predict she has born?

#### ***Solution:*** The approach to this problem is similar to the one used for the question above. There are two ways to answer it. One is to create a list of all variables with their R2 values and build a model from the columns with the highest R2. The other option is to create a model with the known variables such as age, race, education, salary range, etc. Both approaches are discussed here.

In [11]:
# Check the largest value of numbabes to spot outliers
join['numbabes'].max()

22

In [12]:
# Replace the outlier value 22 in numbabes with NaN so those rows are treated as missing.
# The result is assigned back to the column instead of calling replace(..., inplace=True)
# on the Series returned by `join.numbabes`: that chained in-place update does not reach
# the parent dataframe when pandas Copy-on-Write is enabled.
join['numbabes'] = join['numbabes'].replace([22], np.nan)
# Age may have a non-linear effect, so add squared and cubed age terms for use in the models
join['age2'] = join.age_r**2
join['age3']= join.age_r**3

  join['age2'] = join.age_r**2
  join['age3']= join.age_r**3


In [13]:
# Use the data-mining approach to find candidate variables for predicting numbabes
all_fields_R2_numbabes = create_Mining(join,'numbabes')
# Preview the first 10 (rsquared, variable name) pairs
all_fields_R2_numbabes[:10]

[(0.002094049363023265, 'caseid'),
 (0.2126685694414604, 'pregordr'),
 (0.010301308649175445, 'pregend1'),
 (0.0485323048098919, 'nbrnaliv'),
 (0.0006264834980079792, 'cmprgend'),
 (0.010976328312260142, 'cmprgbeg'),
 (0.0041344786969761405, 'gestasun_m'),
 (0.00397001747726311, 'gestasun_w'),
 (6.191929514709482e-05, 'wksgest'),
 (1.4799411245713934e-05, 'mosgest')]

In [14]:
# Sort the results and display the top 50 variables with the highest R2, which may have a significant impact
all_fields_R2_numbabes.sort(reverse=True)
for rsq,col in all_fields_R2_numbabes[:50]:
    print(rsq,col)

1.0 parity_r
1.0 parity
1.0 numbabes
0.920721850685931 lbpregs
0.5461222994397412 compreg
0.5342918942857049 pregnum_r
0.5342918942857049 pregnum
0.5342918942857049 numpregs
0.37122629844249655 birthord
0.31366615410766474 numkdhh
0.25290204198545585 cebow
0.2476352290937649 roscnt
0.24344360236751028 numfmhh
0.2126685694414604 pregordr
0.18682409144149226 datbaby1
0.16950459759463576 rostscrn
0.168011863579271 datcon02
0.15381175360648724 datend02
0.15015017795889996 datcon01
0.14701692445380388 datend01
0.13530117422193955 datcon03
0.12150937969315934 agecon02
0.11978428528381302 cebowc
0.11536124557995309 ageprg02
0.11493541668576879 datend03
0.11241023551870588 agebaby1
0.10420369432666021 poverty_r
0.10420369432666021 poverty
0.09260567919420848 marcon06_i
0.09053640652157036 agecon03
0.08962556183024595 marout06_i
0.0874497755962077 agecon01
0.08653002411665633 marout05_i
0.08544431585860146 marcon05_i
0.0847456958667131 ageprg01
0.08184847437703191 ageprg03
0.0818191270737767 rm

In [15]:
# Create a Poisson model based on the results above, choosing columns that have higher R2 values.
# lbpregs and compreg are entered as categorical terms via C(); pregnum and birthord as numeric terms.
formula_poisson_1 = 'numbabes ~ C(lbpregs)  + C(compreg) + pregnum + birthord'
model_poisson_1 = smf.poisson(formula_poisson_1, data=join)
results_poisson_1 = model_poisson_1.fit()
# Display the summary
results_poisson_1.summary() 

Optimization terminated successfully.
         Current function value: 1.407261
         Iterations 7


0,1,2,3
Dep. Variable:,numbabes,No. Observations:,8879.0
Model:,Poisson,Df Residuals:,8853.0
Method:,MLE,Df Model:,25.0
Date:,"Sun, 14 May 2023",Pseudo R-squ.:,0.1845
Time:,00:07:36,Log-Likelihood:,-12495.0
converged:,True,LL-Null:,-15322.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0253,0.051,0.498,0.618,-0.074,0.125
C(lbpregs)[T.2.0],0.6991,0.043,16.179,0.000,0.614,0.784
C(lbpregs)[T.3.0],1.1027,0.046,24.154,0.000,1.013,1.192
C(lbpregs)[T.4.0],1.3922,0.049,28.623,0.000,1.297,1.488
C(lbpregs)[T.5.0],1.5890,0.054,29.423,0.000,1.483,1.695
C(lbpregs)[T.6.0],1.7704,0.061,29.220,0.000,1.652,1.889
C(lbpregs)[T.7.0],2.0341,0.068,29.919,0.000,1.901,2.167
C(lbpregs)[T.8.0],1.9988,0.084,23.785,0.000,1.834,2.163
C(lbpregs)[T.9.0],2.1043,0.134,15.731,0.000,1.842,2.366


#### The results of the model above indicate that the variable lbpregs (number of pregnancies ending in live birth) has a significant impact on the number of babies, and its p-values are very low. The variable compreg (number of completed pregnancies) has some impact but its p-values are much higher, so it may not be statistically significant. The number of pregnancies and the birth order show similar observations, with higher p-values.

In [16]:
# The other approach to this problem is to compute a model from the parameters known
# in the question: age, race, income and education level.
formula_poisson_2 = 'numbabes ~ age_r + age2  + C(race) + totincr + educat'
# Create the Poisson model from the variables above and display the summary
model_poisson_2 = smf.poisson(formula_poisson_2, data=join)
results_poisson_2 = model_poisson_2.fit()
results_poisson_2.summary() 

Optimization terminated successfully.
         Current function value: 1.663274
         Iterations 7


0,1,2,3
Dep. Variable:,numbabes,No. Observations:,8879.0
Model:,Poisson,Df Residuals:,8872.0
Method:,MLE,Df Model:,6.0
Date:,"Sun, 14 May 2023",Pseudo R-squ.:,0.03617
Time:,00:07:36,Log-Likelihood:,-14768.0
converged:,True,LL-Null:,-15322.0
Covariance Type:,nonrobust,LLR p-value:,3.01e-236

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.9918,0.169,-5.859,0.000,-1.324,-0.660
C(race)[T.2],-0.1492,0.015,-10.078,0.000,-0.178,-0.120
C(race)[T.3],-0.1007,0.025,-4.094,0.000,-0.149,-0.052
age_r,0.1535,0.010,14.803,0.000,0.133,0.174
age2,-0.0020,0.000,-12.953,0.000,-0.002,-0.002
totincr,-0.0171,0.002,-8.945,0.000,-0.021,-0.013
educat,-0.0478,0.003,-16.251,0.000,-0.054,-0.042


#### The results of the model indicate that the p-values of all the variables are very low, so all variables are statistically significant. White mothers and mothers of other races may have fewer babies compared to Black mothers. Older mothers are likely to have had more babies. Interestingly, more educated and financially secure mothers may have borne fewer babies compared to less educated mothers (which is sadly true).

In [17]:
# Predict for the woman in the question: 35 years old, black, and a college graduate
# whose annual household income exceeds $75,000.
model_columns = ['age_r', 'age2', 'race', 'totincr', 'educat']
# The values of race, totincr and educat are derived from the codebook links below.
# https://www.cdc.gov/nchs/data/nsfg/codebooks/NSFG_2006-10_FemaleRecodes_Codebook.pdf
# https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Dataset_Documentation/NSFG/Cycle6Codebook-Pregnancy.pdf
# Single-row frame: age_r=35, age2=35**2, race=1, totincr=14, educat=16
women_35_df = pd.DataFrame([[35, 35**2, 1, 14, 16]], columns=model_columns)

# Predicted number of children from the second Poisson model
results_poisson_2.predict(women_35_df)

0    2.517162
dtype: float64

#### The results indicate that a woman who is 35 years old, Black, and a college graduate whose annual household income exceeds 75,000 may have given birth to 3 children (rounding up 2.517).

# Exercise 11.4

####  If the quantity you want to predict is categorical, you can use multinomial logistic regression, which is implemented in StatsModels with a function called mnlogit. As an exercise, let’s use it to guess whether a woman is married, cohabitating, widowed, divorced, separated, or never married; in the NSFG dataset, marital status is encoded in a variable called rmarital.

#### Suppose you meet a woman who is 25 years old, white, and a high school graduate whose annual household income is about $45,000. What is the probability that she is married, cohabitating, etc?

#### ***Solution:*** To approach the problem, we can use the mnlogit method of StatsModels, and for the explanatory variables of the model we can use those provided in the question, such as age, race, salary and education level. The other approach is to use the mining function to identify variables with higher R2 values and build a model based on them.

In [18]:
# Create a multinomial logit model of rmarital from the known variables: age, race, income and education level
mnlogit_formula='rmarital ~ age_r + age2 + C(race) + totincr + educat'
mnlogit_model = smf.mnlogit(mnlogit_formula, data=join)
mnlogit_results = mnlogit_model.fit()
# Display the results of the model
mnlogit_results.summary() 

Optimization terminated successfully.
         Current function value: 1.084053
         Iterations 8


0,1,2,3
Dep. Variable:,rmarital,No. Observations:,8884.0
Model:,MNLogit,Df Residuals:,8849.0
Method:,MLE,Df Model:,30.0
Date:,"Sun, 14 May 2023",Pseudo R-squ.:,0.1682
Time:,00:07:38,Log-Likelihood:,-9630.7
converged:,True,LL-Null:,-11579.0
Covariance Type:,nonrobust,LLR p-value:,0.0

rmarital=2,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,9.0156,0.805,11.199,0.000,7.438,10.593
C(race)[T.2],-0.9237,0.089,-10.418,0.000,-1.097,-0.750
C(race)[T.3],-0.6179,0.136,-4.536,0.000,-0.885,-0.351
age_r,-0.3635,0.051,-7.150,0.000,-0.463,-0.264
age2,0.0048,0.001,6.103,0.000,0.003,0.006
totincr,-0.1310,0.012,-11.337,0.000,-0.154,-0.108
educat,-0.1953,0.019,-10.424,0.000,-0.232,-0.159
rmarital=3,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,2.9570,3.020,0.979,0.328,-2.963,8.877
C(race)[T.2],-0.4411,0.237,-1.863,0.062,-0.905,0.023


In [19]:
# The results above display coefficients for each possible value of rmarital. Most of the variables have
# negligible p-values while some are large.

# Find the unique values of the rmarital field and their counts.
join['rmarital'].value_counts()

1    5027
6    1403
2     914
4     860
5     575
3     105
Name: rmarital, dtype: int64

In [20]:
# Build the prediction input for the woman in the question: 25 years old, white,
# and a high school graduate whose annual household income is about $45,000
mnlogit_columns = ['age_r', 'age2', 'race', 'totincr', 'educat']
# The values of race, totincr and educat are derived from the codebook links below.
# https://www.cdc.gov/nchs/data/nsfg/codebooks/NSFG_2006-10_FemaleRecodes_Codebook.pdf
# https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Dataset_Documentation/NSFG/Cycle6Codebook-Pregnancy.pdf
# Single-row frame: age_r=25, age2=25**2, race=2, totincr=11, educat=12
mnlogit_df = pd.DataFrame([[25, 25**2, 2, 11, 12]], columns=mnlogit_columns)
# Predicted probability of each rmarital category
mnlogit_results.predict(mnlogit_df)

Unnamed: 0,0,1,2,3,4,5
0,0.750028,0.126397,0.001564,0.033403,0.021485,0.067122


#### Per the Description of the rmarital field , the allowed values are from 1 to 6.
#### Description : Informal marital status
####  value label 
#### 1 CURRENTLY MARRIED 
#### 2 NOT MARRIED BUT LIVING WITH OPP SEX PARTNER 
#### 3 WIDOWED 
#### 4 DIVORCED 
#### 5 SEPARATED FOR REASONS OF MARITAL DISCORD 
#### 6 NEVER BEEN MARRIED 

***Results:*** The predicted probabilities correspond to rmarital values 1 to 6, so there is a 75% probability that a 25-year-old white woman is currently married and a 13% probability that she is living with a partner of the opposite sex.

In [21]:
# Use the data-mining approach to find candidate variables for predicting rmarital. However, this may not
# help us answer the question about the 25-year-old woman's status.
all_fields_R2_rmarital = create_Mining(join,'rmarital')
all_fields_R2_rmarital

  return 1 - self.ssr/self.centered_tss


[(5.8741534255668526e-05, 'caseid'),
 (0.0012479356045387302, 'pregordr'),
 (0.0010180388674351226, 'pregend1'),
 (0.0005588961830141903, 'nbrnaliv'),
 (1.6337956987300117e-08, 'cmprgend'),
 (0.0008560500415972783, 'cmprgbeg'),
 (0.0029634576465952245, 'gestasun_m'),
 (0.007146247485888324, 'gestasun_w'),
 (0.00041499767929198406, 'wksgest'),
 (0.00013426289623452714, 'mosgest'),
 (0.0006402417213253742, 'bpa_bdscheck1'),
 (0.0001328756681133747, 'babysex'),
 (0.004785866553673412, 'birthwgt_lb'),
 (3.3527558144119673e-05, 'birthwgt_oz'),
 (1.6337956987300117e-08, 'cmbabdob'),
 (0.0005464550898471865, 'kidage'),
 (0.03715116338989144, 'hpagelb'),
 (0.002796442596389226, 'matchfound'),
 (0.044384350352374646, 'anynurse'),
 (3.81113609870809e-06, 'frsteatd_n'),
 (0.0002501618955862428, 'frsteatd_p'),
 (1.6395022910620227e-05, 'frsteatd'),
 (2.3450933084312453e-06, 'cmlastlb'),
 (5.109615939247192e-07, 'cmfstprg'),
 (6.831200397350301e-06, 'cmlstprg'),
 (4.194265631796146e-05, 'cmintstr')

In [22]:
# Sort the results and display the top 50 variables with the highest R2, which may have a significant impact
all_fields_R2_rmarital.sort(reverse=True)
for rsq,col in all_fields_R2_rmarital[:50]:
    print(rsq,col)

1.0 rmarital_r
1.0 rmarital
1.0 marstat
0.7444237805189576 fmarital_r
0.7444237805189576 fmarital
0.7444237805189576 fmarit
0.5056399923026791 manrel
0.41442292738387143 evrmarry
0.3697876840455987 currprts
0.29565106268586705 marout03
0.28450080910768893 rmarout03
0.277450645837343 fmarout5
0.2714215380870233 rmarout6
0.270410382120151 marcon03
0.24517774138050052 marout02
0.24111018495795977 pmarpreg
0.2326542069704104 rmarout02
0.23042366136633796 cohout
0.23013578223760134 marcon02
0.22678057673290863 fmarcon5
0.22554318723826883 fmarno
0.19964469618072245 totincr
0.19565642376426207 b1premar
0.1930986411423079 mar1bir1
0.18518418672413395 cebow
0.1715848030484406 evmarcoh
0.17062509517405755 marout01
0.17036179542254104 rmarout01
0.16533114273209548 pcurrntx
0.1491990781137842 liveoth
0.13489102935203112 marcon01
0.1333206760447324 prevcohb
0.13006655115570143 mar1con1
0.12520002380306272 foodstmp
0.12268865049754962 sex3mo
0.11826248991350186 nosex12
0.11144169392435599 currprtt


In [23]:
# Create a second multinomial logit model from variables that the data-mining function ranked with high R2.
# Alternative formula, kept for reference:
# mnlogit_formula_2='rmarital ~ C(fmarital) +  C(cohout) + C(fmarno) + totincr '
mnlogit_formula_2='rmarital ~  totincr + C(cebow)+ C(sex3mo)'
mnlogit_model_2 = smf.mnlogit(mnlogit_formula_2, data=join)
# NOTE(review): the recorded run of this cell reports "converged: False" — treat the coefficients with caution.
mnlogit_results_2 = mnlogit_model_2.fit()
# Display the results of the model
mnlogit_results_2.summary() 

         Current function value: 0.973725
         Iterations: 35




0,1,2,3
Dep. Variable:,rmarital,No. Observations:,8884.0
Model:,MNLogit,Df Residuals:,8824.0
Method:,MLE,Df Model:,55.0
Date:,"Sun, 14 May 2023",Pseudo R-squ.:,0.2529
Time:,00:07:55,Log-Likelihood:,-8650.6
converged:,False,LL-Null:,-11579.0
Covariance Type:,nonrobust,LLR p-value:,0.0

rmarital=2,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.3929,0.146,-16.389,0.000,-2.679,-2.107
C(cebow)[T.1.0],2.1400,0.123,17.368,0.000,1.899,2.382
C(cebow)[T.2.0],2.6253,0.134,19.579,0.000,2.362,2.888
C(cebow)[T.3.0],3.0044,0.146,20.559,0.000,2.718,3.291
C(cebow)[T.4.0],3.9003,0.194,20.131,0.000,3.521,4.280
C(cebow)[T.5.0],3.3337,0.282,11.831,0.000,2.781,3.886
C(cebow)[T.6.0],-11.6090,273.129,-0.043,0.966,-546.931,523.713
C(cebow)[T.7.0],3.5801,0.480,7.463,0.000,2.640,4.520
C(cebow)[T.8.0],55.0157,3.22e+07,1.71e-06,1.000,-6.32e+07,6.32e+07
C(cebow)[T.9.0],24.1662,1.27e+04,0.002,0.998,-2.49e+04,2.5e+04
