In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('data5.csv')

# Show the first few rows of the dataframe to verify the columns
print("Original DataFrame:")
print(df.head())

# Initialize the LabelEncoder (kept so `le` is defined even if none of the
# columns below exist in the file)
le = LabelEncoder()

# List of columns to encode
columns_to_encode = ['EmploymentStatus', 'MaritalStatus', 'HomeOwnershipStatus']

# One fitted encoder per column. Refitting a single shared encoder would throw
# away the mapping of every column except the last one, leaving no way to
# encode new inputs consistently later on.
label_encoders = {}

# Apply label encoding to the specified columns
for column in columns_to_encode:
    if column in df.columns:
        # Impute missing categories *before* encoding: once the column holds
        # integer codes it is no longer an object column, so a later
        # "fill object columns with the mode" step cannot reach it.
        if df[column].isnull().any():
            mode_values = df[column].mode()
            if not mode_values.empty:
                df[column] = df[column].fillna(mode_values[0])

        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le
        print(f"Encoded '{column}' with mapping: {dict(zip(le.classes_, le.transform(le.classes_)))}")
    else:
        print(f"'{column}' column not found in the dataset.")

# Display the transformed DataFrame
print("\nTransformed DataFrame:")
print(df.head())


Original DataFrame:
   Age  AnnualIncome  CreditScore EmploymentStatus  LoanAmount  LoanDuration  \
0   45         39948          617         Employed       13152            48   
1   38         39709          628         Employed       26045            48   
2   47         40724          570         Employed       17627            36   
3   58         69084          545         Employed       37898            96   
4   37        103264          594         Employed        9184            36   

  MaritalStatus  NumberOfDependents HomeOwnershipStatus  BankruptcyHistory  \
0       Married                   2                 Own                  0   
1        Single                   1            Mortgage                  0   
2       Married                   2                Rent                  0   
3        Single                   1            Mortgage                  0   
4       Married                   1            Mortgage                  0   

   PaymentHistory  TotalAssets

In [2]:
# Numeric gaps (int64 / float64 columns) are replaced by the column median
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
for numeric_column in numeric_columns:
    column_median = df[numeric_column].median()
    df[numeric_column] = df[numeric_column].fillna(column_median)

# Gaps in object (text) columns are replaced by the most frequent value
object_columns = df.select_dtypes(include=['object']).columns
for object_column in object_columns:
    most_frequent = df[object_column].mode()[0]
    df[object_column] = df[object_column].fillna(most_frequent)

# Remaining null count per column; left as the last expression so the
# notebook displays it
df.isnull().sum()


Age                    0
AnnualIncome           0
CreditScore            0
EmploymentStatus       0
LoanAmount             0
LoanDuration           0
MaritalStatus          0
NumberOfDependents     0
HomeOwnershipStatus    0
BankruptcyHistory      0
PaymentHistory         0
TotalAssets            0
TotalLiabilities       0
MonthlyIncome          0
NetWorth               0
MonthlyLoanPayment     0
RiskScorePercentage    0
dtype: int64

In [3]:
import pandas as pd

# `df` must already exist (it is created and modified by the earlier cells).
# Repeat the imputation: median for int64/float64 columns ...
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
for numeric_column in numeric_columns:
    column_median = df[numeric_column].median()
    df[numeric_column] = df[numeric_column].fillna(column_median)

# ... and most frequent value for object columns
object_columns = df.select_dtypes(include=['object']).columns
for object_column in object_columns:
    most_frequent = df[object_column].mode()[0]
    df[object_column] = df[object_column].fillna(most_frequent)

# One-hot encode whatever categorical columns are still present, dropping the
# first level of each. Columns that were already label-encoded to integers
# are numeric at this point and pass through get_dummies unchanged.
df_encoded = pd.get_dummies(data=df, drop_first=True)


In [4]:
from sklearn.model_selection import train_test_split

# 'RiskScorePercentage' is the regression target; every other column of
# df_encoded is used as a feature
target_column = 'RiskScorePercentage'
y = df_encoded[target_column]
X = df_encoded.drop(columns=[target_column])

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)



In [5]:
# Import necessary libraries
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Candidate regressors keyed by the label used in the printed report.
# Insertion order matters: it is the order of training and of the report.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
}

# Metrics per model label, filled in by the loop below
results = {}

# Fit every candidate on the training split and score it on the test split.
# `model` and `y_pred` stay bound to the last candidate after the loop; the
# names are kept because later cells may reference them.
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    results[name] = {
        'Mean Squared Error': mean_squared_error(y_test, y_pred),
        'R-squared': r2_score(y_test, y_pred),
    }

# Report: label, two metric lines, then a blank separator line per model
for model_name, metrics in results.items():
    report_lines = [
        f"{model_name}:",
        f"  Mean Squared Error: {metrics['Mean Squared Error']:.2f}",
        f"  R-squared: {metrics['R-squared']:.2f}",
        "",
    ]
    print("\n".join(report_lines))




Linear Regression:
  Mean Squared Error: 44.31
  R-squared: 0.63

Ridge Regression:
  Mean Squared Error: 44.31
  R-squared: 0.63

Lasso Regression:
  Mean Squared Error: 57.24
  R-squared: 0.52

Random Forest:
  Mean Squared Error: 1.04
  R-squared: 0.99

Gradient Boosting:
  Mean Squared Error: 0.47
  R-squared: 1.00



In [6]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Standalone Gradient Boosting regressor, configured with the same
# hyperparameters as the 'Gradient Boosting' entry in the comparison above
gb_settings = {'n_estimators': 100, 'random_state': 42}
gb_model = GradientBoostingRegressor(**gb_settings)

# Fit on the training split; kept as the last expression so the notebook
# displays the fitted estimator
gb_model.fit(X_train, y_train)

In [7]:
import pandas as pd

def predict_risk(applicant=None, model=None):
    """Predict, print and return the risk score for a single applicant.

    Parameters
    ----------
    applicant : mapping, optional
        Feature values keyed by column name. Any feature that is left out
        falls back to the built-in sample applicant defined below, so calling
        ``predict_risk()`` with no arguments scores exactly that sample.
    model : fitted estimator, optional
        Object exposing ``feature_names_in_`` and ``predict``. Defaults to the
        module-level ``gb_model``.

    Returns
    -------
    float
        The predicted risk score. The same value is printed with two decimals.

    Raises
    ------
    KeyError
        If the model expects a feature that is present neither in
        ``applicant`` nor in the built-in defaults.
    """
    # Built-in sample applicant.
    # NOTE(review): values are kept exactly as in the original example; e.g.
    # AnnualIncome is lower than MonthlyIncome -- confirm whether intended.
    features = {
        'Age': 70,
        'AnnualIncome': 13500,
        'CreditScore': 536,
        'LoanAmount': 670000,
        'LoanDuration': 3,  # in months
        'NumberOfDependents': 0,
        'TotalAssets': 1000000,
        'TotalLiabilities': 1000,
        'MonthlyIncome': 87000,
        'NetWorth': 150000,
        'MonthlyLoanPayment': 0,
        'BankruptcyHistory': 0,  # no bankruptcy
        'PaymentHistory': 1,     # good payment history
        # Integer codes produced by the label-encoding step. The meanings
        # noted here are assumptions -- verify them against the mapping that
        # is printed when the columns are encoded.
        'EmploymentStatus': 0,     # intended: employed
        'MaritalStatus': 1,        # intended: married
        'HomeOwnershipStatus': 1,  # intended: own
    }
    if applicant:
        features.update(applicant)

    # Fall back to the notebook's trained model only when none is supplied
    if model is None:
        model = gb_model

    # The model dictates both which columns are needed and their order
    expected_features = list(model.feature_names_in_)
    missing = [name for name in expected_features if name not in features]
    if missing:
        raise KeyError(f"Missing feature(s) required by the model: {missing}")

    # Single-row frame restricted and ordered to what the model was fitted on
    user_input = pd.DataFrame([features])[expected_features]

    # Predict the risk score using the trained model
    predicted_risk = model.predict(user_input)
    score = float(predicted_risk[0])

    # Output the prediction
    print(f"Predicted Risk Score: {score:.2f}")
    return score

# Example run: call with no arguments to score the sample applicant whose
# values are defined inside predict_risk
predict_risk()


Predicted Risk Score: 85.73


In [8]:
# Show the feature names stored on the fitted model; predict_risk uses this
# same attribute to select and order the input columns
print(gb_model.feature_names_in_)


['Age' 'AnnualIncome' 'CreditScore' 'EmploymentStatus' 'LoanAmount'
 'LoanDuration' 'MaritalStatus' 'NumberOfDependents' 'HomeOwnershipStatus'
 'BankruptcyHistory' 'PaymentHistory' 'TotalAssets' 'TotalLiabilities'
 'MonthlyIncome' 'NetWorth' 'MonthlyLoanPayment']


In [13]:
import joblib

# Persist the trained Gradient Boosting model (gb_model) with joblib.
# The file name is relative, so it is written to the current working directory.
filename = 'Label2.pkl'
joblib.dump(gb_model, filename)

print(f"Model saved as {filename}")


Model saved as Label2.pkl


In [10]:
import joblib
# Save the explicitly trained `gb_model`. The previous code dumped `model`,
# which is only defined as the leftover loop variable of the model-comparison
# cell (or by a later joblib.load), i.e. it depended on hidden notebook state.
# NOTE(review): this is an absolute, user-specific Windows path; the cell that
# reloads the model reads from the same location, so change both together.
joblib.dump(gb_model, r'C:\Users\adrit\OneDrive\Desktop\Vcet\model\Label.pkl')

['C:\\Users\\adrit\\OneDrive\\Desktop\\Vcet\\model\\Label.pkl']

In [11]:
import joblib
# Reload the estimator from the file written by the previous joblib.dump cell;
# this rebinds the name `model` to the deserialized object.
# NOTE(review): joblib.load unpickles the file, so only load files that come
# from a trusted source.
model = joblib.load(r'C:\Users\adrit\OneDrive\Desktop\Vcet\model\Label.pkl')

In [12]:
# Sanity check: show the class of the object currently bound to `model`
print(type(model))

<class 'sklearn.ensemble._gb.GradientBoostingRegressor'>
