In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import json

import warnings
# NOTE(review): with no category/module argument this suppresses every
# warning for the rest of the notebook, including pandas and scikit-learn
# deprecation notices.
warnings.filterwarnings('ignore')


In [None]:
# Step 2: Load the JSON file containing underwriting reports.
# The encoding is given explicitly: open() otherwise falls back to the
# platform locale, which can mis-decode a UTF-8 JSON file.
with open('underwritings.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Convert the parsed JSON into a DataFrame and preview the first rows
underwriting_df = pd.DataFrame(data)
underwriting_df.head()

In [None]:
# Step 3: Exploratory Data Analysis
# print(underwriting_df.head())

In [None]:
# print(underwriting_df.info())

In [None]:
# print(underwriting_df.describe())

In [None]:
# print("Columns in underwriting_df:", underwriting_df.columns)

In [None]:
# Number of missing values per column in the raw frame
underwriting_df.isnull().sum()

In [None]:
# Column count before any pruning
underwriting_df.shape[1]

In [None]:
# Minimum share (in percent) of non-missing values a column needs to be kept.
MIN_NON_MISSING_PCT = 60

# Percentage of non-missing values in each column
column_non_missing_percentage = underwriting_df.notnull().mean() * 100

# Drop columns whose non-missing share is below MIN_NON_MISSING_PCT.
# columns_to_drop is reused later to prune test.csv the same way.
columns_to_drop = column_non_missing_percentage[
    column_non_missing_percentage < MIN_NON_MISSING_PCT].index
print("columns_to_drop", columns_to_drop)
underwriting_df = underwriting_df.drop(columns=columns_to_drop)

In [None]:
# Column count after removing sparsely populated columns
underwriting_df.shape[1]

In [None]:
# Remaining missing values per column
underwriting_df.isnull().sum(axis=0)

In [None]:
# Names of the int/float columns (recomputed again before imputation)
numeric_cols = underwriting_df.select_dtypes(include=['int', 'float']).columns

# Text (object) columns get the placeholder 'unknown' where a value is missing
object_columns = underwriting_df.select_dtypes(include='object').columns
filled_object_values = underwriting_df[object_columns].fillna(value='unknown')
underwriting_df[object_columns] = filled_object_values

In [None]:
# underwriting_df.duplicated().sum()

In [None]:
# underwriting_df

## Analysis

In [None]:
# Absolute correlation of every numeric/bool column with the target
# 'clearfraudscore'.
# The frame still contains object columns, and on pandas >= 2.0
# DataFrame.corrwith no longer skips them silently (numeric_only defaults to
# False), so it raises when it tries to cast strings to float. Selecting the
# numeric and bool columns up front reproduces the old behaviour on every
# pandas version.
numeric_view = underwriting_df.select_dtypes(include=['number', 'bool'])
target_correlation = numeric_view.corrwith(
    underwriting_df['clearfraudscore']).abs()

# Set the threshold for correlation with the target feature
threshold = 0.1  # You can adjust this threshold as needed

# Columns whose absolute correlation is below the threshold. NaN
# correlations compare False here, so those columns are kept, and the target
# itself correlates 1.0 with itself, so it is never selected.
low_correlation_columns = target_correlation[target_correlation <
                                             threshold].index
print("low_correlation_columns", low_correlation_columns)
print("len(low_correlation_columns)", len(low_correlation_columns))

# NOTE(review): this selection is computed on the full frame, before the
# train/test split, so test rows influence which features are kept.
# Drop the low correlation columns (the same list is applied to test.csv later)
underwriting_df = underwriting_df.drop(columns=low_correlation_columns)

In [None]:
# Column count after the correlation filter
underwriting_df.shape[1]

In [None]:
# Number of rows available for modelling
underwriting_df.shape[0]

In [None]:
# Target vector and feature matrix
y = underwriting_df['clearfraudscore']
X = underwriting_df.drop(columns=['clearfraudscore'])

# First cut: 80% for training + validation, 20% held out as the test set
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Second cut of the 80%: 75% training, 25% validation
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42)

In [None]:
# Training rows without a target value cannot be used for fitting
missing_values_y = y_train.isnull().sum()

if missing_values_y > 0:
    # Index labels of the rows whose target is missing
    missing_indices = y_train.index[y_train.isnull()]

    # Remove those rows from features and target, then renumber both from 0
    X_train = X_train.drop(index=missing_indices).reset_index(drop=True)
    y_train = y_train.drop(index=missing_indices).reset_index(drop=True)

In [None]:
# y_val.isnull().sum()

In [None]:
from sklearn.impute import SimpleImputer

# Numeric feature columns (the target is excluded)
numeric_cols = underwriting_df.drop(columns=['clearfraudscore']).select_dtypes(
    include=['int', 'float']).columns
imputer = SimpleImputer(strategy='mean')

# Missing values per numeric column in the full frame, for reference
print("underwriting_df[numeric_cols].isna().sum():",
      underwriting_df[numeric_cols].isna().sum())

# Learn the column means from the training split only. Fitting on the full
# frame would let test rows leak into the imputed values; the one-hot
# encoder is likewise fitted on X_train.
imputer.fit(X_train[numeric_cols])

In [None]:
# list(imputer.statistics_)

In [None]:
# Fill missing numeric values in both splits with the fitted imputer
for split_frame in (X_train, X_test):
    split_frame[numeric_cols] = imputer.transform(split_frame[numeric_cols])

In [None]:
# Re-derive the int/float columns from the training split and summarise them
numeric_cols = X_train.select_dtypes(include=['int', 'float']).columns
X_train.loc[:, numeric_cols].describe()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Learn the per-column min/max from the training split only, so the test
# rows do not influence the scaling (test values may therefore fall slightly
# outside [0, 1] after transform).
scaler = MinMaxScaler()
scaler.fit(X_train[numeric_cols])

In [None]:
# Per-column minimum learned by the scaler
print('Minimum:')
[feature_min for feature_min in scaler.data_min_]

In [None]:
# Per-column maximum learned by the scaler
print('Maximum:')
[feature_max for feature_max in scaler.data_max_]

In [None]:
# Rescale the numeric columns of both splits with the fitted scaler
for split_frame in (X_train, X_test):
    scaled_values = scaler.transform(split_frame[numeric_cols])
    split_frame[numeric_cols] = scaled_values

In [None]:
# Summary statistics of the scaled numeric training columns
X_train.loc[:, numeric_cols].describe()

In [None]:
# Object-typed feature columns, leaving out the 'underwritingid' column
categorical_cols = [
    name
    for name, dtype in X_train.dtypes.items()
    if dtype == 'object' and name != 'underwritingid'
]
categorical_cols

In [None]:
# Convert all values in categorical columns to strings.
# X_train and X_test were split off earlier and are the frames passed to the
# encoder, so they must be cast as well; casting only underwriting_df has no
# effect on them.
underwriting_df[categorical_cols] = underwriting_df[categorical_cols].astype(
    str)
X_train[categorical_cols] = X_train[categorical_cols].astype(str)
X_test[categorical_cols] = X_test[categorical_cols].astype(str)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns; the encoder is fitted on the
# training split and unseen categories are encoded as all zeros.
# The `sparse` argument was renamed to `sparse_output` in scikit-learn 1.2
# and removed in 1.4, so try the new name first and fall back for older
# releases. Either way the encoder returns a dense array.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder.fit(X_train[categorical_cols])

# Names of the generated indicator columns
encoded_cols = list(encoder.get_feature_names_out(
    input_features=categorical_cols))

# Transform the training and test data
X_train_encoded = encoder.transform(X_train[categorical_cols])
X_test_encoded = encoder.transform(X_test[categorical_cols])

# Wrap the encoded arrays in DataFrames that share the index of their source
# frame, so the concat below lines rows up correctly
X_train_encoded_df = pd.DataFrame(
    X_train_encoded, columns=encoded_cols, index=X_train.index)
X_test_encoded_df = pd.DataFrame(
    X_test_encoded, columns=encoded_cols, index=X_test.index)

# Drop the underwritingid column so it is not passed to the model
X_train = X_train.drop(columns=['underwritingid'])
X_test = X_test.drop(columns=['underwritingid'])

# Replace the raw categorical columns with their one-hot encoded versions
X_train = pd.concat(
    [X_train.drop(columns=categorical_cols), X_train_encoded_df], axis=1)
X_test = pd.concat(
    [X_test.drop(columns=categorical_cols), X_test_encoded_df], axis=1)

In [None]:
# encoded_cols

In [None]:
# Number of model features after one-hot encoding
X_train.shape[1]

In [None]:
# Numeric columns of the final training matrix
X_train.loc[:, numeric_cols].describe()

In [None]:
# Index labels of test rows that have no target value
indices_to_remove = y_test[y_test.isnull()].index

# Remove those rows from the test features and the test target alike
X_test = X_test.drop(index=indices_to_remove)
y_test = y_test.drop(index=indices_to_remove)

In [None]:
def rmse(targets, predictions):
    """Return the root-mean-squared error between two equally shaped inputs.

    Both arguments are converted to float NumPy arrays first, so plain lists
    and tuples are accepted in addition to arrays and Series, and values are
    compared position by position.

    Parameters
    ----------
    targets : array-like
        Ground-truth values.
    predictions : array-like
        Predicted values, broadcastable against ``targets``.

    Returns
    -------
    numpy.float64
        Square root of the mean squared difference.
    """
    residuals = np.asarray(targets, dtype=float) - \
        np.asarray(predictions, dtype=float)
    return np.sqrt(np.mean(np.square(residuals)))

In [None]:
# from sklearn.linear_model import LinearRegression
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# from sklearn.svm import SVR
# from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

# from sklearn.linear_model import Ridge
# from sklearn.linear_model import Lasso
# from sklearn.linear_model import ElasticNet
# from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
# import lightgbm as lgb
# from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Instantiate models
# Only the XGBoost regressor is active; the commented-out lines are
# alternative estimators that are currently disabled.
# linear_reg = LinearRegression()
# decision_tree = DecisionTreeRegressor()
# random_forest = RandomForestRegressor()
# gradient_boosting = GradientBoostingRegressor()
# svm = SVR()

# ridge_reg = Ridge()
# lasso_reg = Lasso()
# elastic_net_reg = ElasticNet()
# knn_reg = KNeighborsRegressor()
# Built with no arguments, i.e. the library's default hyperparameters
xgb_reg = xgb.XGBRegressor()
# lgb_reg = lgb.LGBMRegressor()

In [None]:
# Define a list of models (disabled; a single estimator is used instead)
# models = [linear_reg, decision_tree, random_forest, gradient_boosting, svm, ridge_reg, lasso_reg, elastic_net_reg, knn_reg, lgb_reg]

# Estimator that the fit and predict cells below operate on
model = xgb_reg

In [None]:
from sklearn.model_selection import cross_val_score

# Train the selected estimator on the training split
model.fit(X_train, y_train)

# Predictions for the data the model was trained on and for the held-out split
train_split_predictions = model.predict(X_train)
held_out_predictions = model.predict(X_test)

# Root-mean-squared error on each split
rmse_train = np.sqrt(mean_squared_error(y_train, train_split_predictions))
rmse_test = np.sqrt(mean_squared_error(y_test, held_out_predictions))

print("Training RMSE:", rmse_train)
print("Test RMSE:", rmse_test)

In [None]:
# Load the file to generate predictions for; it is preprocessed in the
# following cells before being passed to model.predict
test_data = pd.read_csv("test.csv")

# Find columns to drop, excluding 'clearfraudscore'
# to_drop_test = [column for column in high_correlation.columns if column != 'clearfraudscore' and any(high_correlation[column])]

In [None]:
# Remove from test_data the columns that were pruned from the training
# frame: first the low-correlation ones, then the sparsely populated ones.
test_data = (
    test_data
    .drop(columns=low_correlation_columns)
    .drop(columns=columns_to_drop)
)

In [None]:
# Preprocess test.csv with the transformers that were fitted earlier in the
# notebook. The imputer and scaler must NOT be re-fitted here: doing so gives
# test.csv its own means and min/max, so the model would receive features on
# a different scale than the one it was trained on.

# Columns each fitted transformer expects, in the order seen during fit
# (feature_names_in_ is available because they were fitted on DataFrames).
imputed_cols = list(imputer.feature_names_in_)
numeric_cols = list(scaler.feature_names_in_)
categorical_cols = list(encoder.feature_names_in_)

# Apply the already-fitted imputer and scaler (transform only)
test_data[imputed_cols] = imputer.transform(test_data[imputed_cols])
test_data[numeric_cols] = scaler.transform(test_data[numeric_cols])

# Treat categoricals as in training: missing values become 'unknown' and
# every value is a string. Without the fillna, NaN would turn into the string
# 'nan', which the encoder has never seen.
test_data[categorical_cols] = test_data[categorical_cols].fillna(
    'unknown').astype(str)

# One-hot encode categorical columns with the fitted encoder
test_encoded = encoder.transform(test_data[categorical_cols])

# Create DataFrame from the encoded data, aligned on the test index
test_encoded_df = pd.DataFrame(
    test_encoded, columns=encoded_cols, index=test_data.index)

# Drop the 'underwritingid' column (if present)
if 'underwritingid' in test_data.columns:
    test_data = test_data.drop(columns=['underwritingid'])

# Replace the raw categorical columns with their encoded versions
test_data = pd.concat(
    [test_data.drop(columns=categorical_cols), test_encoded_df], axis=1)

# Present exactly the training features, in the training order; a column
# missing from test.csv raises a KeyError here instead of a less obvious
# failure inside model.predict.
test_data = test_data[X_train.columns]

In [None]:
# Predict the target for the preprocessed test_data with the fitted model
test_predictions = model.predict(test_data)

In [None]:
import pandas as pd

# Load the submission template
submission_df = pd.read_csv('submission.csv')

# Row counts of the inputs involved, for a quick visual sanity check
for row_count in (len(test_data), len(submission_df), len(test_predictions)):
    print(row_count)

# Only write the file when there is exactly one prediction per template row
if len(test_predictions) != len(submission_df):
    print("Length of test_predictions doesn't match the number of rows in submission.csv")
else:
    # Store the predictions in the 'expected' column and overwrite the CSV
    submission_df['expected'] = test_predictions
    submission_df.to_csv('submission.csv', index=False)