In [35]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import r2_score,mean_absolute_error,root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,PolynomialFeatures,StandardScaler
from xgboost import XGBRegressor
# Encoder for the categorical survey answers. handle_unknown="ignore" lets
# transform() accept categories that were not present when the encoder was fit.
label = OneHotEncoder(handle_unknown="ignore")

# Scaler applied after the encoder. Centering is switched off (with_mean=False)
# so it can operate on the sparse matrix the encoder produces by default.
scaler = StandardScaler(with_mean=False)


In [36]:
# Read the survey CSV from the current working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="cleaned_survey.csv")

In [37]:
# Display the column labels of the loaded frame (last expression of the cell,
# so the notebook renders it).
df.columns

Index(['Age', 'Country', 'self_employed', 'family_history', 'treatment',
       'work_interfere', 'no_employees', 'remote_work', 'tech_company',
       'benefits', 'care_options', 'wellness_program', 'seek_help',
       'anonymity', 'leave', 'mental_health_consequence',
       'phys_health_consequence', 'coworkers', 'supervisor',
       'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'gender'],
      dtype='object')

In [38]:
# Columns handed to the categorical (one-hot) branch of the preprocessor.
# This is every column of the frame except 'Age'.
cat_col = [
    "Country",
    "self_employed",
    "family_history",
    "treatment",
    "work_interfere",
    "no_employees",
    "remote_work",
    "tech_company",
    "benefits",
    "care_options",
    "wellness_program",
    "seek_help",
    "anonymity",
    "leave",
    "mental_health_consequence",
    "phys_health_consequence",
    "coworkers",
    "supervisor",
    "mental_health_interview",
    "phys_health_interview",
    "mental_vs_physical",
    "obs_consequence",
    "gender",
]

# 'Age' is listed separately from the categorical columns. In the cells
# visible in this notebook it serves as the regression target and num_col
# itself is not passed to the preprocessor.
num_col = ["Age"]

In [39]:
# Categorical branch: one-hot encode first, then rescale the encoded columns.
cat_col_trans = Pipeline(
    steps=[
        ("cat", label),
        ("scale", scaler),
    ]
)

In [40]:
# Send the columns named in cat_col through the encode-then-scale branch.
# It is the only transformer registered, and `remainder` is left at its
# default ('drop'), so columns outside cat_col do not reach the model.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", cat_col_trans, cat_col),
    ]
)

In [41]:
# Baseline model: linear regression with the library defaults.
linear = LinearRegression()

# Gradient-boosted tree regressor. All hyperparameters are spelled out here,
# and random_state is pinned so repeated runs use the same seed.
xgb = XGBRegressor(
    random_state=42,
    n_estimators=200,
    learning_rate=0.01,
    max_depth=6,
    gamma=0.9,
    reg_alpha=10,
    reg_lambda=5,
)

In [42]:
# 'Age' is the regression target; every other column is a candidate feature.
y = df['Age']
x = df.drop(columns='Age')

# 80% of the rows go to training, the rest to the held-out test set.
# The seed is fixed so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

In [43]:
# Chain the preprocessing with the linear model, then fit on the training
# split. The fit call is left as the final expression so the notebook renders
# the fitted pipeline.
linear_pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", linear),
    ]
)
linear_pipe.fit(x_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,False
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [44]:
# Predict the target for the held-out rows with the fitted linear pipeline.
y_predict = linear_pipe.predict(x_test)

# Keep (truth, prediction) together so the pair can be star-unpacked into
# the metric functions.
tup = y_test, y_predict

In [45]:
# Print R^2, MAE and RMSE (in that order, space-separated on one line) for
# the (truth, prediction) pair currently stored in `tup`.
print(*(metric(*tup) for metric in (r2_score, mean_absolute_error, root_mean_squared_error)))

0.04576609777210283 5.267044555404329 6.712805217985816


In [46]:
# Build the XGBoost pipeline around its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so passing the same `preprocessor`
# object that linear_pipe already holds would re-fit the transformer inside
# linear_pipe as a side effect. clone() copies the configuration only, which
# keeps the two pipelines independent.
xgb_pipe = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("model", xgb),
    ]
)
# Fit on the training split; left as the final expression so the notebook
# renders the fitted pipeline.
xgb_pipe.fit(x_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,False
,with_std,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [47]:
# Predict the target for the held-out rows with the fitted XGBoost pipeline.
# Note: this rebinds y_predict and tup, replacing the linear model's values.
y_predict = xgb_pipe.predict(x_test)

# Keep (truth, prediction) together so the pair can be star-unpacked into
# the metric functions.
tup = y_test, y_predict

In [48]:
# Print R^2, MAE and RMSE (in that order, space-separated on one line) for
# the (truth, prediction) pair currently stored in `tup`.
print(*(metric(*tup) for metric in (r2_score, mean_absolute_error, root_mean_squared_error)))

0.08033239841461182 5.230345249176025 6.59010124206543


In [49]:
import pickle

In [50]:
# Serialize the fitted XGBoost pipeline (preprocessing + model) to disk.
# NOTE: pickle files execute code on load; only unpickle this file from a
# trusted source.
with open(file='regression_task_xgb.pkl', mode='wb') as f:
    pickle.dump(xgb_pipe, f)

In [51]:
# Serialize the fitted linear pipeline (preprocessing + model) to disk.
# NOTE: pickle files execute code on load; only unpickle this file from a
# trusted source.
with open(file='regression_task_linear.pkl', mode='wb') as f:
    pickle.dump(linear_pipe, f)