In [1]:
#import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

import re

from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import scipy.stats as stats

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [2]:
df_model = pd.read_csv('Life_Expectancy_clean.csv')
df_model.head()

Unnamed: 0,Life_Expectancy,State_FIPS_Code,County_FIPS_Code,Premature_death_raw_value,Poor_or_fair_health_raw_value,Poor_physical_health_days_raw_value,Poor_mental_health_days_raw_value,Low_birthweight_raw_value,Adult_smoking_raw_value,Adult_obesity_raw_value,...,Median_household_income_raw_value,Children_eligible_for_free_or_reduced_price_lunch_raw_value,Residential_segregation___Black/White_raw_value,Residential_segregation___non_white/white_raw_value,Firearm_fatalities_raw_value,Homeownership_raw_value,Severe_housing_cost_burden_raw_value,Population_raw_value,County_Ranked_(Yes=1/No=0),Drinking_water_violations_raw_value
0,72.43875,-1.694237,-0.703354,2.334237,0.585167,0.972298,0.744619,1.185344,1.262168,0.600135,...,-0.110904,0.526556,0.039599,-0.593029,-0.08109,0.093544,0.719976,-0.353999,0.149924,-0.857549
1,70.426037,-1.694237,-0.686844,2.505048,2.389854,2.887774,1.915747,1.184023,2.51057,0.88249,...,-1.719216,1.152966,0.039599,1.602039,1.899777,0.173994,-0.089176,-0.334493,0.149924,1.166115
2,75.056297,-1.694237,-0.670334,0.548239,0.430316,0.944534,0.598795,0.020882,1.22046,-0.082221,...,-0.568212,0.575578,-0.68287,-0.215187,1.632092,0.787929,-0.730263,-0.304624,0.149924,-0.857549
3,77.644415,-1.694237,-0.653824,-0.211277,-0.007011,0.261949,0.311151,-0.090746,0.551838,0.811901,...,0.847341,-0.419564,0.904034,-0.25251,-0.009707,1.220457,-0.45301,-0.053936,0.149924,-0.857549
4,74.386212,-1.694237,-0.637314,1.031308,1.16896,1.455216,1.086015,0.018399,1.406349,-0.199869,...,-0.276522,0.254347,0.039599,0.807143,0.953958,0.142721,-0.782571,-0.334972,0.149924,-0.857549


In [6]:
# df_model.info()

# Create Initial Model

In [11]:
X = df_model.drop(columns=['Life_Expectancy'])
y = df_model['Life_Expectancy']

In [31]:
regression = LinearRegression()

crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)
baseline = np.mean(cross_val_score(regression, X_scaled, y, scoring='r2', cv=crossvalidation))
print("Inital R^2", baseline)

Inital R^2 0.8987138087860259


# Interactions 

In [17]:
def feature_combinations_r_sqrd_with_Inter_df(X, y, num_feat_comb=2):
    # Requires cals: baseline & crossvalidation
    
    # Create Regression & Combinations
    from itertools import combinations
    combinations = list(combinations(list(X.columns), num_feat_comb))
    
    # Create cross-validation & output a bassline MSE score as a DataFrame
    comb_scores = []
    inter_cols = []
    inter_score = []
    data = X.copy()
    
    for comb in combinations:
        data['interaction'] = data[comb[0]] * data[comb[1]]
        score = np.mean(cross_val_score(regression, data, y, scoring='r2', cv=crossvalidation))
        if score > baseline: 
            comb_scores.append(round(score,3))
            inter_cols.append((str(comb[0]) + '_' + str(comb[1])))
            inter_score.append(data[comb[0]] * data[comb[1]])
    
    df_base = pd.DataFrame(data=[inter_cols, comb_scores])
    df_base = df_base.T  
    df_base.rename(columns={0: "Interaction", 1: "CV_score"}, inplace=True)
    df_base.sort_values(by='CV_score', inplace = True, ascending=False )
    df_base.reset_index(drop=True, inplace = True)
    
    df_interactions_scores = pd.DataFrame(data=inter_score , index=inter_cols)
    df_interactions_scores = df_interactions_scores.T
    

    return df_base , df_interactions_scores

In [19]:
df_base, df_score = feature_combinations_r_sqrd_with_Inter_df(X,y)

In [22]:
df_base.head(10)

Unnamed: 0,Interaction,CV_score
0,Premature_death_raw_value_Premature_age_adjust...,0.911
1,Population_raw_value_County_Ranked_(Yes=1/No=0),0.91
2,Injury_deaths_raw_value_Premature_age_adjusted...,0.905
3,Premature_death_raw_value_Income_inequality_ra...,0.905
4,Premature_death_raw_value_Poor_or_fair_health_...,0.905
5,Adult_smoking_raw_value_Premature_age_adjusted...,0.904
6,Premature_age_adjusted_mortality_raw_value_Fre...,0.904
7,Premature_death_raw_value_Frequent_physical_di...,0.904
8,Premature_death_raw_value_Frequent_mental_dist...,0.904
9,Premature_death_raw_value_Poor_physical_health...,0.904


In [23]:
def add_interaction_feature(data, df_inter, df_score, num_inter):
    i=0
    
    while i < num_inter:
        col = df_inter['Interaction'][i]
  
        data[col] = df_score[col]
        i+=1
    
    return data

In [26]:
data_combined = add_interaction_feature(df_model, df_base, df_score, 10)
data_combined.head()

Unnamed: 0,Life_Expectancy,State_FIPS_Code,County_FIPS_Code,Premature_death_raw_value,Poor_or_fair_health_raw_value,Poor_physical_health_days_raw_value,Poor_mental_health_days_raw_value,Low_birthweight_raw_value,Adult_smoking_raw_value,Adult_obesity_raw_value,...,Premature_death_raw_value_Premature_age_adjusted_mortality_raw_value,Population_raw_value_County_Ranked_(Yes=1/No=0),Injury_deaths_raw_value_Premature_age_adjusted_mortality_raw_value,Premature_death_raw_value_Income_inequality_raw_value,Premature_death_raw_value_Poor_or_fair_health_raw_value,Adult_smoking_raw_value_Premature_age_adjusted_mortality_raw_value,Premature_age_adjusted_mortality_raw_value_Frequent_physical_distress_raw_value,Premature_death_raw_value_Frequent_physical_distress_raw_value,Premature_death_raw_value_Frequent_mental_distress_raw_value,Premature_death_raw_value_Poor_physical_health_days_raw_value
0,72.43875,-1.694237,-0.703354,2.334237,0.585167,0.972298,0.744619,1.185344,1.262168,0.600135,...,4.090982,-0.053073,3.527093,3.868932,1.365919,2.212075,1.420833,1.892367,1.625232,2.269574
1,70.426037,-1.694237,-0.686844,2.505048,2.389854,2.887774,1.915747,1.184023,2.51057,0.88249,...,7.053895,-0.050149,7.024877,5.285655,5.9867,7.069443,8.771992,7.803714,6.07975,7.234014
2,75.056297,-1.694237,-0.670334,0.548239,0.430316,0.944534,0.598795,0.020882,1.22046,-0.082221,...,0.469177,-0.045671,0.997961,-0.121211,0.235916,1.044457,0.70803,0.453581,0.279898,0.51783
3,77.644415,-1.694237,-0.653824,-0.211277,-0.007011,0.261949,0.311151,-0.090746,0.551838,0.811901,...,0.041627,-0.008086,0.057991,0.239666,0.001481,-0.108726,0.015781,0.016923,0.035043,-0.055344
4,74.386212,-1.694237,-0.637314,1.031308,1.16896,1.455216,1.086015,0.018399,1.406349,-0.199869,...,1.100996,-0.050221,0.92163,0.413611,1.205558,1.50138,1.430514,1.381921,1.273628,1.500776


# Polynomial Parameters

In [28]:
# from sklearn.preprocessing import PolynomialFeatures

# regression = LinearRegression()
# crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)

# polynomials = []
# for col in X.columns:
#     for degree in [2, 3, 4]:
#         data = X.copy()
#         poly = PolynomialFeatures(degree, include_bias=False)
#         X_transformed = poly.fit_transform(X[[col]])
#         data = pd.concat([data.drop(col, axis=1),pd.DataFrame(X_transformed)], axis=1)
#         score = np.mean(cross_val_score(regression, data, y, scoring='r2', cv=crossvalidation))
#         if score > baseline: polynomials.append((col, degree, round(score, 3)))
# print('Top 10 polynomials: %s' %sorted(polynomials, key=lambda poly: poly[2], reverse=True)[:10])

Top 10 polynomials: [('Premature_death_raw_value', 2, 0.911), ('Premature_death_raw_value', 3, 0.911), ('Premature_death_raw_value', 4, 0.911), ('Premature_age_adjusted_mortality_raw_value', 3, 0.911), ('Premature_age_adjusted_mortality_raw_value', 4, 0.911), ('Premature_age_adjusted_mortality_raw_value', 2, 0.91), ('Mammography_screening_raw_value', 4, 0.902), ('Income_inequality_raw_value', 2, 0.902), ('Poor_or_fair_health_raw_value', 4, 0.901), ('Adult_smoking_raw_value', 2, 0.901)]


In [30]:
# polynom = pd.DataFrame(polynomials)
# polynom.groupby([0], sort=False)[2].max()

# Full model R-squared  

In [27]:
full_model = np.mean(cross_val_score(regression, df_base, y, scoring='r2', cv=crossvalidation))
print("Full model R^2:", full_model)

ValueError: Found input variables with inconsistent numbers of samples: [728, 2138]