
🔹 کارهایی که باید انجام دهید:

تحلیل اکتشافی داده‌ها (EDA) و مصورسازی ارتباط ویژگی‌ها با هزینه‌ها

آماده‌سازی و پاک‌سازی داده‌ها (Encoding, Scaling, Handling Missing Values)

ساخت مدل‌های رگرشن

ارزیابی عملکرد مدل با معیارهایی مثل R² و RMSE

مستندسازی روند کار و نتایج


🔹 هدف پروژه:
کمک به شرکت‌های بیمه برای تخمین دقیق‌تر هزینه‌ها و همچنین تمرین عملی مهارت‌های داده‌کاوی و مدل‌سازی برای شما.

In [5]:
import pandas as pd
import numpy as np
from sklearn  import linear_model

In [6]:
df = pd.read_csv('medical_insurance.csv')
print(df.head(10))




   age  gender   bmi  children discount_eligibility     region  expenses  \
0   19  female  27.9         0                  yes  southwest  16884.92   
1   18    male  33.8         1                   no  southeast   1725.55   
2   28    male  33.0         3                   no  southeast   4449.46   
3   33    male  22.7         0                   no  northwest  21984.47   
4   32    male  28.9         0                   no  northwest   3866.86   
5   31  female  25.7         0                   no  southeast   3756.62   
6   46  female  33.4         1                   no  southeast   8240.59   
7   37  female  27.7         3                   no  northwest   7281.51   
8   37    male  29.8         2                   no  northeast   6406.41   
9   60  female  25.8         0                   no  northwest  28923.14   

    premium  
0  168.8492  
1   17.2555  
2   44.4946  
3  439.6894  
4   77.3372  
5   75.1324  
6  164.8118  
7  145.6302  
8  128.1282  
9  578.4628  


In [7]:

df_processed = df.copy()


In [8]:
# 1. تبدیل ستون 'discount_eligibility' به عدد (0 و 1)
# از تابع .map() استفاده می‌کنیم. 'yes' به 1 و 'no' به 0 تبدیل می‌شود.
df_processed['discount_eligibility'] = df_processed['discount_eligibility'].map({'yes': 1, 'no': 0})


# 2. تبدیل ستون 'gender' به عدد (0 و 1)
# 'male' به 1 و 'female' به 0 تبدیل می‌شود.
df_processed['gender'] = df_processed['gender'].map({'male': 1, 'female': 0})


# 3. تبدیل ستون 'region' با روش One-Hot Encoding
# از تابع کمکی get_dummies در پانداز استفاده می‌کنیم.
df_processed = pd.get_dummies(df_processed, columns=['region'], drop_first=True)



In [9]:
print(df_processed.head())

   age  gender   bmi  children  discount_eligibility  expenses   premium  \
0   19       0  27.9         0                     1  16884.92  168.8492   
1   18       1  33.8         1                     0   1725.55   17.2555   
2   28       1  33.0         3                     0   4449.46   44.4946   
3   33       1  22.7         0                     0  21984.47  439.6894   
4   32       1  28.9         0                     0   3866.86   77.3372   

   region_northwest  region_southeast  region_southwest  
0             False             False              True  
1             False              True             False  
2             False              True             False  
3              True             False             False  
4              True             False             False  


In [10]:
cdf = df_processed[['age','gender','bmi','children','discount_eligibility',
                    'expenses','premium','region_northwest','region_southeast',
                    'region_southwest']]
cdf.head(9)


Unnamed: 0,age,gender,bmi,children,discount_eligibility,expenses,premium,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.92,168.8492,False,False,True
1,18,1,33.8,1,0,1725.55,17.2555,False,True,False
2,28,1,33.0,3,0,4449.46,44.4946,False,True,False
3,33,1,22.7,0,0,21984.47,439.6894,True,False,False
4,32,1,28.9,0,0,3866.86,77.3372,True,False,False
5,31,0,25.7,0,0,3756.62,75.1324,False,True,False
6,46,0,33.4,1,0,8240.59,164.8118,False,True,False
7,37,0,27.7,3,0,7281.51,145.6302,True,False,False
8,37,1,29.8,2,0,6406.41,128.1282,False,False,False


In [11]:
msk = np.random.rand(len(df_processed))<0.8
train =  cdf[msk]
test =  cdf[~msk]

In [14]:
reg =  linear_model.LinearRegression()
train_x =  np.asanyarray(train[['age','gender','bmi','children','discount_eligibility',
                                'region_northwest','region_southeast','region_southwest']])
train_y =  np.asanyarray(train[['expenses','premium']])
reg.fit(train_x,train_y)
print ('Coefficients: ', reg.coef_)
print ('Intercept: ',reg.intercept_)


Coefficients:  [[ 2.60897592e+02 -3.07755186e+02  3.40424154e+02  5.58736102e+02
   2.42567546e+04 -5.51235972e+02 -1.17778619e+03 -1.18578485e+03]
 [ 1.12226914e+01 -8.94850068e+00  6.44605137e+00  6.10917492e+00
   4.66326638e+02 -6.12308628e+00 -1.19289801e+01 -1.29286728e+01]]
Intercept:  [-12103.22721517   -461.58927409]


In [15]:
from sklearn.metrics import r2_score

test_x = np.asanyarray(test[['age','gender',
                             'bmi','children',
                             'discount_eligibility',
                                'region_northwest',
                             'region_southeast',
                             'region_southwest']])
test_y = np.asanyarray(test[['expenses','premium']])
test_y_ = reg.predict(test_x)

print("Mean absolute error: %.2f" % np.mean(np.absolute(test_y_ - test_y)))
print("Residual sum of squares (MSE): %.2f" % np.mean((test_y_ - test_y) ** 2))
print("R2-score: %.2f" % r2_score(test_y , test_y_) )

Mean absolute error: 2185.31
Residual sum of squares (MSE): 18068739.14
R2-score: 0.65
