In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import max_error, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression

In [3]:
data = pd.read_csv('expenses.csv')
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
data.describe(include='all')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
count,1338.0,1338,1338.0,1338.0,1338,1338,1338.0
unique,,2,,,2,4,
top,,male,,,no,southeast,
freq,,676,,,1064,364,
mean,39.207025,,30.663397,1.094918,,,13270.422265
std,14.04996,,6.098187,1.205493,,,12110.011237
min,18.0,,15.96,0.0,,,1121.8739
25%,27.0,,26.29625,0.0,,,4740.28715
50%,39.0,,30.4,1.0,,,9382.033
75%,51.0,,34.69375,2.0,,,16639.912515


## Data preprocessing

In [5]:
from sklearn.preprocessing import OneHotEncoder

In [6]:
column_to_ohe = ['sex','smoker','region']
ohe = OneHotEncoder(handle_unknown='ignore')

In [7]:
num_data = data.drop(['sex','smoker','region','charges'],axis=1)

In [8]:
ohe_column = ohe.fit_transform(data[column_to_ohe])

In [9]:
X_transformed = pd.DataFrame.sparse.from_spmatrix(ohe_column).astype('int32')
X_transformed

Unnamed: 0,0,1,2,3,4,5,6,7
0,1,0,0,1,0,0,0,1
1,0,1,1,0,0,0,1,0
2,0,1,1,0,0,0,1,0
3,0,1,1,0,0,1,0,0
4,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...
1333,0,1,1,0,0,1,0,0
1334,1,0,1,0,1,0,0,0
1335,1,0,1,0,0,0,1,0
1336,1,0,1,0,0,0,0,1


In [10]:
X = pd.concat([num_data,X_transformed],axis=1)
y = data['charges']

In [11]:
X.columns = X.columns.astype('str')

## Model

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=0)

In [13]:
lr = LinearRegression()

In [14]:
lr.fit(X_train,y_train)

In [15]:
lr.score(X_test,y_test)

0.8216781468713605

In [16]:
pred = lr.predict(X_test)

In [17]:
print(f'Maksymalny błąd wynosi: {max_error(y_test,pred)}')

Maksymalny błąd wynosi: 20922.555059845974


In [18]:
print(f'Średni błąd bezwzględny(MAE) wynosi: {mean_absolute_error(y_test,pred)}')

Średni błąd bezwzględny(MAE) wynosi: 4252.759443161083


In [19]:
print(f'Błąd średniokwadratowy(MSE) wynosi: {mean_squared_error(y_test,pred)}')
print(f'Pierwiastek błędu średniokwadratowego wynosi: {mean_squared_error(y_test,pred,squared=False)}')

Błąd średniokwadratowy(MSE) wynosi: 34577221.797758825
Pierwiastek błędu średniokwadratowego wynosi: 5880.239943893346
