In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
!pip install boruta
from boruta import BorutaPy
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.metrics import mean_squared_error, r2_score
import warnings
from sklearn.feature_selection import RFE
# Silence library warnings for the whole notebook so cell output stays readable.
warnings.filterwarnings('ignore')
# Never truncate columns when a DataFrame is displayed.
# Uses the public `pd.set_option` entry point rather than the `pd.pandas` alias.
pd.set_option('display.max_columns', None)

In [None]:
# Load the competition training CSV into the working DataFrame `im`.
im=pd.read_csv('/kaggle/input/playground-series-s4e12/train.csv')

In [None]:
# Preview the first rows of the raw data.
im.head()

In [None]:
# (rows, columns) of the raw data.
im.shape

In [None]:
# Preview the last rows of the raw data.
im.tail()

In [None]:
# Column dtypes and non-null counts.
im.info()

In [None]:
# Number of missing values per column, before any cleaning.
im.isnull().sum()

In [None]:
# Number of fully duplicated rows.
im.duplicated().sum()

In [None]:
# Number of distinct values per column.
im.nunique()

In [None]:
# Encoder instance used by the encoding cell below to turn labels into integer codes.
label_encode=LabelEncoder()

In [None]:
# Text-valued columns that are replaced by integer codes before modelling.
columns_to_encode = ['Gender','Marital Status','Education Level','Occupation','Location','Policy Type','Policy Start Date','Customer Feedback','Smoking Status','Exercise Frequency','Property Type']

# Fit one encoder per column and keep it in `label_encoders`, so every mapping
# can be inverted or re-applied to new data later. A single shared encoder only
# remembers the classes of the last column it was fitted on.
# NOTE(review): missing values are handed to the encoder as-is — confirm how the
# installed scikit-learn version treats NaN in these columns.
label_encoders = {}
for col in columns_to_encode:
    label_encoders[col] = LabelEncoder()
    im[col] = label_encoders[col].fit_transform(im[col])

In [None]:
# Replace the remaining missing values with the mean of their column (in place).
column_means = im.mean()
im.fillna(value=column_means, inplace=True)

In [None]:
# Inspect the first 15 rows after encoding and imputation.
im.head(15)

In [None]:
# Re-check missing values per column after the fill step.
im.isnull().sum()

In [None]:
# Target is 'Premium Amount'; every other column becomes a predictor.
y = im['Premium Amount']
x = im.drop(columns=['Premium Amount'])

In [None]:
# Variance filter: keep only the features whose variance exceeds `threshold`.
# The selector is fitted on the feature matrix `x` rather than on `im`, so the
# target column ('Premium Amount') can never show up as a "selected feature".
threshold = 5
selector = VarianceThreshold(threshold)
selected_features = selector.fit_transform(x)
selected_feature_names = x.columns[selector.get_support()]

print("\nSelected Features:")
print(selected_feature_names)

In [None]:
from functools import partial
from sklearn.feature_selection import mutual_info_regression

# The notebook models 'Premium Amount' with regressors, so mutual information is
# estimated with the regression variant; the classification variant would treat
# each distinct target value as its own class.
# A fixed random_state keeps the estimate repeatable between runs.
mi_score_func = partial(mutual_info_regression, random_state=42)

# Select top 2 features using SelectKBest. The same fit also provides the
# per-feature scores, so the (expensive) estimate is computed only once.
selector = SelectKBest(score_func=mi_score_func, k=2)
x_new = selector.fit_transform(x, y)
mutual_info = selector.scores_

# Print scores
for feature, score in zip(x.columns, mutual_info):
    print(f"Feature: {feature}, Mutual Information Score: {score:.2f}")

print("Selected features after Mutual Information:", x.columns[selector.get_support()])

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# RFE with a linear model ranks features by the size of their fitted
# coefficients, which is only comparable across features that share a scale.
# Standardise first; the scaler is fitted on the training split only.
rfe_scaler = StandardScaler()
x_train_scaled = rfe_scaler.fit_transform(x_train)

# Initialize a regression model
model = LinearRegression()

# Apply RFE
n_features_to_select = 5  # Specify the number of features you want to keep
rfe = RFE(estimator=model, n_features_to_select=n_features_to_select)
rfe.fit(x_train_scaled, y_train)

# Display results with column names instead of a bare boolean mask / rank array
# (rank 1 means the feature was kept).
print("Selected Features:", list(x_train.columns[rfe.support_]))
print("Feature Ranking:")
print(pd.Series(rfe.ranking_, index=x_train.columns).sort_values())

In [None]:
# Look at the frame again before dropping columns.
im.head()

In [None]:
# Columns removed from `im` (in place) before the modelling steps below.
columns_to_drop = [
    'id',
    'Annual Income',
    'Policy Start Date',
    'Customer Feedback',
    'Property Type',
    'Credit Score',
    'Age',
    'Vehicle Age',
]
im.drop(columns_to_drop, axis=1, inplace=True)

In [None]:
# First 10 rows of the reduced frame.
im.head(10)

In [None]:
# (rows, columns) after the column drop.
im.shape

In [None]:
# Shorten the target column name: 'Premium Amount' -> 'Amount'.
im = im.rename(columns={"Premium Amount": "Amount"})

In [None]:
# Box plot of the target column 'Amount'.
sns.boxplot(data=im['Amount'])

In [None]:
# Box plot of every column of `im` on one set of axes.
sns.boxplot(im)

In [None]:
# Histogram of the whole frame in a single call.
# NOTE(review): all columns end up on one shared axis — confirm this is intended
# rather than one histogram per column.
plt.hist(im)

In [None]:
# Feature matrix and target for the models trained below.
# The target must not appear among the features: with 'Amount' left inside `x`,
# every model fitted on it can read the answer straight from its input.
x = im.drop(columns='Amount')
y = im['Amount']

In [None]:
# 80/20 train/test split with a fixed seed (random_state=3).
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2 ,random_state=3)

In [None]:
# Isolation Forest with the expected outlier share set to 10% (contamination=0.1).
iso_forest = IsolationForest(contamination=0.1, random_state=42)

# Ensure indices match: work on a copy with a fresh 0..n-1 index and use that
# same frame for fitting, predicting and filtering.
data = im.reset_index(drop=True)

# Fit and predict in one pass (-1 indicates outliers, 1 indicates inliers)
outliers = iso_forest.fit_predict(data)

# Create a boolean mask that is True for inliers
boolean_mask = (outliers == 1)

# Apply the mask
filtered_data = data[boolean_mask]

In [None]:
# Sanity check: the inlier mask should have one entry per row of `im`.
print(len(boolean_mask))
print(len(im))

In [None]:
# Fit a basic regression model
model = LinearRegression()
model.fit(x_train, y_train)

# Compute residuals on the training data
predictions = model.predict(x_train)
signed_residuals = y_train - predictions
residuals = np.abs(signed_residuals)

# Define a threshold for outliers: 3 standard deviations of the residuals.
# The spread is measured on the signed residuals; the standard deviation of
# their absolute values is smaller and would flag too many rows.
threshold = 3 * np.std(signed_residuals)
outliers = residuals > threshold

# Filter out outliers
x_filtered = x_train[~outliers]
y_filtered = y_train[~outliers]

In [None]:
# Shapes of the full feature matrix and of the train / test splits.
print(x.shape,x_train.shape,x_test.shape)

In [None]:
# Standard deviation of the target column.
print(im.Amount.std())

In [None]:
# Scaler used by the following cells to standardise the train / test features.
scaler=StandardScaler()

In [None]:
# Fit the scaler on the training split only.
scaler.fit(x_train)

In [None]:
# Standardise the training split with the fitted scaler.
x_train_standarized=scaler.transform(x_train)

In [None]:
# Show the standardised training array.
print(x_train_standarized)

In [None]:
# Standardise the test split with the scaler fitted on the training split, then show it.
x_test_standarized=scaler.transform(x_test)
print(x_test_standarized)

In [None]:
# Overall standard deviation (across all entries) of each standardised array.
print(x_train_standarized.std())
print(x_test_standarized.std())

In [None]:
# Frequency of each distinct target value.
im['Amount'].value_counts()

In [None]:
# Target is 'Amount'; all remaining columns are the predictors.
Y = im['Amount']
X = im.drop(columns=['Amount'])

In [None]:
# 90/10 train/test split with a fixed seed (random_state=1).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=1)

In [None]:
# Shapes of the full feature matrix and of the train / test splits.
print(X.shape, X_train.shape, X_test.shape)

In [None]:
# Fresh linear regression model (rebinds the earlier `model`).
model = LinearRegression()

In [None]:
# Train the linear model on the training split.
model.fit(X_train, Y_train)

In [None]:
# Predictions on the training data itself (used for the training-set metrics below).
X_train_prediction = model.predict(X_train)

In [None]:
# Training-set error metrics: mean squared error and R².
mse = mean_squared_error(Y_train, X_train_prediction)
r2 = r2_score(Y_train, X_train_prediction)

In [None]:
# Report the training-set metrics computed above.
print("Mean Squared Error on training data: ", mse)
print("R² Score on training data: ", r2)

In [None]:
# Shapes of the feature matrix `X` and of its matching target `Y`
# (the lowercase `y` belongs to the earlier x/y section).
print("Shape of X:", X.shape)
print("Shape of y:", Y.shape)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw one box plot per frame: the full data first, then the outlier-filtered
# data. Only the second plot rotates its x tick labels (by 45 degrees).
for frame, title, tick_rotation in (
    (im, "Before Removing Outliers", None),
    (filtered_data, "After Removing Outliers", 45),
):
    sns.boxplot(data=frame)
    if tick_rotation is not None:
        plt.xticks(rotation=tick_rotation)
    plt.title(title)
    plt.show()



In [None]:
# Box plot of the encoded 'Policy Type' column.
sns.boxplot(data=im['Policy Type'])

In [None]:
from sklearn.ensemble import IsolationForest

# Second Isolation Forest pass over `data`; contamination=0.5 sets the expected
# share of outliers to one half.
iso_forest = IsolationForest(random_state=42, contamination=0.5)  # Adjust contamination
iso_forest.fit(data)

# predict() returns 1 for inliers and -1 for outliers; keep the inliers only.
outliers = iso_forest.predict(data)
boolean_mask = (outliers == 1)
filtered_data = data.loc[boolean_mask]

In [None]:
import xgboost as xgb

# Build a seeded XGBoost regressor and train it on the training split
# (fit() returns the estimator itself).
xgb_regressor = xgb.XGBRegressor(random_state=42).fit(x_train, y_train)

# Predictions for the held-out split, evaluated in the cells below.
y_pred = xgb_regressor.predict(x_test)

In [None]:
from sklearn.metrics import r2_score

# R² of the XGBoost predictions on the test split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R² Score: {r2}")

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the XGBoost predictions on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error (MSE): {mse}")

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the XGBoost predictions on the test split.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Absolute Error (MAE): {mae}")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Full data: box plot of every column.
ax_before = sns.boxplot(data=im)
ax_before.set_title("Before Removing Outliers")
plt.show()

# Outlier-filtered data: same plot, with the x tick labels rotated by 45 degrees.
ax_after = sns.boxplot(data=filtered_data)
plt.xticks(rotation=45)
ax_after.set_title("After Removing Outliers")
plt.show()
