### 1 - Module Importieren

In [1]:
# Standard Module
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Statistische Module
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import statsmodels.stats.api as sms
import statsmodels.api as sm

### 2 - Daten Importieren

In [3]:
# CSV Datei importieren
df = pd.read_csv(os.getcwd() + "/insurance.csv")
df.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


### 3 - Daten Transformieren

**Unabhängige Variablen:**
- `age`       : Alter in Jahren
- `bmi`       : Body-Mass-Index
- `children`  : Anzahl Kinder
- `sex`       : Geschlecht (1 = Männlich; 0 = Weiblich)
- `smoker`    : Raucher (1 = Ja; 0 = Nein)
- `region_nw` : Person aus Region Nordwest (1 = Ja; 0 = Nein)
- `region_se` : Person aus Region Südost (1 = Ja; 0 = Nein)
- `region_sw` : Person aus Region Südwest (1 = Ja; 0 = Nein)
- `bmi30` : BMI >= 30 (1 = Ja; 0 = Nein)

**Abhängige Variable:**
- `charges`   : Versicherungsgebühren (USD)

In [4]:
# Kategoriale Daten in Dummy-Variablen überführen
dfc = pd.get_dummies(df, drop_first = True, dtype = "int64")
dfc = dfc.rename(columns = {"sex_male":"sex",
                            "smoker_yes":"smoker",
                            "region_northwest":"region_nw",
                            "region_southeast":"region_se",
                            "region_southwest":"region_sw"})

# Numerische Daten runden
dfc["bmi"] = dfc["bmi"].round(2)
dfc["charges"] = dfc["charges"].round(2)

In [5]:
# Merkmal "bmi30" dem Datensatz hinzufügen
dfc["bmi30"] = 0
dfc.loc[df["bmi"] >= 30, "bmi30"] = 1
dfc.head(10)

Unnamed: 0,age,bmi,children,charges,sex,smoker,region_nw,region_se,region_sw,bmi30
0,19,27.9,0,16884.92,0,1,0,0,1,0
1,18,33.77,1,1725.55,1,0,0,1,0,1
2,28,33.0,3,4449.46,1,0,0,1,0,1
3,33,22.7,0,21984.47,1,0,1,0,0,0
4,32,28.88,0,3866.86,1,0,1,0,0,0
5,31,25.74,0,3756.62,0,0,0,1,0,0
6,46,33.44,1,8240.59,0,0,0,1,0,1
7,37,27.74,3,7281.51,0,0,1,0,0,0
8,37,29.83,2,6406.41,1,0,0,0,0,0
9,60,25.84,0,28923.14,0,0,1,0,0,0


In [6]:
dfc.to_csv("insurance_transformed.csv", index = False)

### 4 - Deskriptive Statistiken & Analyse

In [7]:
# Allgemeine Informationen über Datensatz
dfc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        1338 non-null   int64  
 1   bmi        1338 non-null   float64
 2   children   1338 non-null   int64  
 3   charges    1338 non-null   float64
 4   sex        1338 non-null   int64  
 5   smoker     1338 non-null   int64  
 6   region_nw  1338 non-null   int64  
 7   region_se  1338 non-null   int64  
 8   region_sw  1338 non-null   int64  
 9   bmi30      1338 non-null   int64  
dtypes: float64(2), int64(8)
memory usage: 104.7 KB


In [8]:
# Deskriptive Statistiken
dfc.describe().round(2)

Unnamed: 0,age,bmi,children,charges,sex,smoker,region_nw,region_se,region_sw,bmi30
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,39.21,30.66,1.09,13270.42,0.51,0.2,0.24,0.27,0.24,0.53
std,14.05,6.1,1.21,12110.01,0.5,0.4,0.43,0.45,0.43,0.5
min,18.0,15.96,0.0,1121.87,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,26.3,0.0,4740.29,0.0,0.0,0.0,0.0,0.0,0.0
50%,39.0,30.4,1.0,9382.03,1.0,0.0,0.0,0.0,0.0,1.0
75%,51.0,34.69,2.0,16639.92,1.0,0.0,0.0,1.0,0.0,1.0
max,64.0,53.13,5.0,63770.43,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Analyse bedingter Mittelwert
dfc_bmi30_1 = dfc[dfc["bmi30"] == 1]
dfc_bmi30_0 = dfc[dfc["bmi30"] == 0]
print(dfc_bmi30_1["bmi"].mean())
print(dfc_bmi30_0["bmi"].mean())

35.26609618104667
25.506703645007924


In [10]:
# Multikollinearität überprüfen
X_vif = add_constant(dfc)
pd.Series([variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])], 
          index = X_vif.columns)

const        62.376319
age           1.381620
bmi           2.973033
children      1.013285
charges       4.096432
sex           1.009386
smoker        3.600862
region_nw     1.519916
region_se     1.662685
region_sw     1.534074
bmi30         2.844205
dtype: float64

In [11]:
# Korrelationsmatrix
dfc.corr().round(2)

Unnamed: 0,age,bmi,children,charges,sex,smoker,region_nw,region_se,region_sw,bmi30
age,1.0,0.11,0.04,0.3,-0.02,-0.03,-0.0,-0.01,0.01,0.09
bmi,0.11,1.0,0.01,0.2,0.05,0.0,-0.14,0.27,-0.01,0.8
children,0.04,0.01,1.0,0.07,0.02,0.01,0.02,-0.02,0.02,0.01
charges,0.3,0.2,0.07,1.0,0.06,0.79,-0.04,0.07,-0.04,0.2
sex,-0.02,0.05,0.02,0.06,1.0,0.08,-0.01,0.02,-0.0,0.05
smoker,-0.03,0.0,0.01,0.79,0.08,1.0,-0.04,0.07,-0.04,0.0
region_nw,-0.0,-0.14,0.02,-0.04,-0.01,-0.04,1.0,-0.35,-0.32,-0.08
region_se,-0.01,0.27,-0.02,0.07,0.02,0.07,-0.35,1.0,-0.35,0.17
region_sw,0.01,-0.01,0.02,-0.04,-0.0,-0.04,-0.32,-0.35,1.0,0.0
bmi30,0.09,0.8,0.01,0.2,0.05,0.0,-0.08,0.17,0.0,1.0


### 5 - Lineare Regression & Diagnostik

In [12]:
# Interaktionsterm definieren
dfc["bmi30*smoker"] = dfc["bmi30"] * dfc["smoker"]
dfc.head(5)

Unnamed: 0,age,bmi,children,charges,sex,smoker,region_nw,region_se,region_sw,bmi30,bmi30*smoker
0,19,27.9,0,16884.92,0,1,0,0,1,0,0
1,18,33.77,1,1725.55,1,0,0,1,0,1,0
2,28,33.0,3,4449.46,1,0,0,1,0,1,0
3,33,22.7,0,21984.47,1,0,1,0,0,0,0
4,32,28.88,0,3866.86,1,0,1,0,0,0,0


In [13]:
# Abhängige/Unabhängige Variablen definieren
X = dfc.drop(["charges"], 1)
y = dfc["charges"]

In [14]:
# Regressionsmodell schätzen
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.864
Model:                            OLS   Adj. R-squared:                  0.863
Method:                 Least Squares   F-statistic:                     842.1
Date:                Wed, 25 May 2022   Prob (F-statistic):               0.00
Time:                        16:56:17   Log-Likelihood:                -13144.
No. Observations:                1338   AIC:                         2.631e+04
Df Residuals:                    1327   BIC:                         2.637e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const        -4744.9815    959.738     -4.944   

In [15]:
# Normalverteilung der Residuen
# Jarque-Bera Test mit H_0: Residuen sind normalverteilt
print("p-value = " + str(round(sms.jarque_bera(results.resid)[1], 2)))

p-value = 0.0


In [16]:
# Überprüfung Heteroskedastizität
# Goldfeld-Quandt Test mit H_0: Homoskedastizität liegt vor
print("p-value = " + str(round(sms.het_goldfeldquandt(results.resid, results.model.exog)[1], 2)))

p-value = 0.99


In [17]:
# Überprüfung Heteroskedastizität
# Breusch-Pagan Test mit H_0: Homoskedastizität liegt vor
print("p-value = " + str(round(sms.het_breuschpagan(results.resid, results.model.exog)[1], 2)))

p-value = 0.9


In [18]:
# Überprüfung Heteroskedastizität
# White Test mit H_0: Homoskedastizität liegt vor
print("p-value = " + str(round(sms.het_white(results.resid, results.model.exog)[1], 2)))

p-value = 0.97
