<a href="https://colab.research.google.com/github/JelanKhweileh/Data-Mining-Project-Health-Insurance/blob/main/DMPROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from imblearn.over_sampling import RandomOverSampler
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import randint

# --- Load data ---------------------------------------------------------------
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
print(train_data.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line.
train_data.info()

# --- Split first, then oversample --------------------------------------------
# Separate predictors from the target; 'id' is a row identifier, not a feature.
features = train_data.drop(['id', 'Response'], axis=1)
target = train_data['Response']

# Hold out the validation set BEFORE resampling. RandomOverSampler duplicates
# minority-class rows, so resampling the full frame first would place copies of
# the same row in both partitions and inflate every validation metric.
# stratify keeps the original class ratio in both partitions.
X_train_raw, X_vali, y_train_raw, y_vali = train_test_split(
    features, target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

# Balance the classes in the training partition only; the validation set keeps
# the original class distribution.
oversampler = RandomOverSampler(random_state=42)
X_ros, y_ros = oversampler.fit_resample(X_train_raw, y_train_raw)
X_train, y_train = X_ros, y_ros

# --- Hyper-parameter search --------------------------------------------------
# Lists are sampled uniformly; randint(a, b) draws integers from [a, b).
param_dist = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': randint(3, 7),
    'min_child_weight': randint(1, 5),
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'n_estimators': [100, 200, 300],
}

model = xgb.XGBClassifier(random_state=42)
# NOTE(review): the 5-fold CV below still runs on the oversampled training
# data, so duplicated rows can cross fold boundaries and the search's internal
# scores are optimistic. The held-out validation metrics printed further down
# are the ones to trust.
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=50,
                                   cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# With the default refit=True the search has already refitted the best
# configuration on all of X_train, so no additional fit() call is needed.
best_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

# --- Evaluate on the untouched validation set --------------------------------
vali_predictions = best_model.predict(X_vali)
print("Accuracy on Validation Set:", accuracy_score(y_vali, vali_predictions))
print("Classification Report:")
print(classification_report(y_vali, vali_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_vali, vali_predictions))

# --- Predict on the test set and write the submission ------------------------
# Select test columns by the training feature names so the model sees the same
# columns in the same order it was fitted on.
test_predictions = best_model.predict(test_data[features.columns])
submission_df = pd.DataFrame({'id': test_data['id'], 'Response': test_predictions})
submission_df.to_csv('542submission_xgboost_oversampling_random_search.csv', index=False)
print(submission_df.head())

   id  Gender  Age  Driving_License  Region_Code  Previously_Insured  \
0   0       0   30                1         28.0                   1   
1   1       0   26                1          6.0                   1   
2   2       0   40                1          0.0                   0   
3   3       0   25                1          8.0                   1   
4   4       1   26                1         28.0                   1   

   Vehicle_Age  Vehicle_Damage  Annual_Premium  Policy_Sales_Channel  Vintage  \
0            2               0         60954.0                 152.0      127   
1            2               0         24532.0                 152.0      216   
2            1               1          2630.0                  47.0      220   
3            2               0         44259.0                 152.0      223   
4            2               0         33615.0                 152.0      194   

   Response  
0         0  
1         0  
2         0  
3         0  
4         