In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import make_column_selector

In [None]:
#;ethode pour importer les donnes via le lien seulement
!wget https://www.kaggle.com/datasets/lorenzozoppelletto/financial-risk-for-loan-approval/data

--2024-09-27 14:46:03--  https://www.kaggle.com/datasets/lorenzozoppelletto/financial-risk-for-loan-approval/data
Resolving www.kaggle.com (www.kaggle.com)... 35.244.233.98
Connecting to www.kaggle.com (www.kaggle.com)|35.244.233.98|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘data’

data                    [ <=>                ]  11.77K  --.-KB/s    in 0.04s   

2024-09-27 14:46:04 (276 KB/s) - ‘data’ saved [12050]



In [None]:
import pandas as pd
donnees= pd.read_csv('/content/sample_data/Loan.csv')
donnees.head()

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,...,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore
0,2018-01-01,45,39948,617,Employed,Master,22,13152,48,Married,...,3329.0,0.724972,11,126928,0.199652,0.22759,419.805992,0.181077,0,49.0
1,2018-01-02,38,39709,628,Employed,Associate,15,26045,48,Single,...,3309.083333,0.935132,3,43609,0.207045,0.201077,794.054238,0.389852,0,52.0
2,2018-01-03,47,40724,570,Employed,Bachelor,26,17627,36,Married,...,3393.666667,0.872241,6,5205,0.217627,0.212548,666.406688,0.462157,0,52.0
3,2018-01-04,58,69084,545,Employed,High School,34,37898,96,Single,...,5757.0,0.896155,5,99452,0.300398,0.300911,1047.50698,0.313098,0,54.0
4,2018-01-05,37,103264,594,Employed,Associate,17,9184,36,Married,...,8605.333333,0.941369,5,227019,0.197184,0.17599,330.17914,0.07021,1,36.0


In [None]:
#verification des valeurs manquantes
donnees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 36 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ApplicationDate             20000 non-null  object 
 1   Age                         20000 non-null  int64  
 2   AnnualIncome                20000 non-null  int64  
 3   CreditScore                 20000 non-null  int64  
 4   EmploymentStatus            20000 non-null  object 
 5   EducationLevel              20000 non-null  object 
 6   Experience                  20000 non-null  int64  
 7   LoanAmount                  20000 non-null  int64  
 8   LoanDuration                20000 non-null  int64  
 9   MaritalStatus               20000 non-null  object 
 10  NumberOfDependents          20000 non-null  int64  
 11  HomeOwnershipStatus         20000 non-null  object 
 12  MonthlyDebtPayments         20000 non-null  int64  
 13  CreditCardUtilizationRate   200

In [None]:
# Split data into training and test sets
X_regression = donnees.drop(["RiskScore"], axis=1)
y_regression = donnees["RiskScore"]
y_classification = donnees["LoanApproved"]


In [None]:
from sklearn.model_selection import train_test_split
X_regression_train, X_regression_test, y_regression_train, y_regression_test = train_test_split(X_regression , y_regression, test_size=0.2, random_state=42) #20% data pour le test and 80%for trqining

In [None]:
#Régression du Score de Risque : Utilisation d'un modèle Random Forest Regressor
import numpy as np
# Create a pipeline for regression
numerical_features = make_column_selector(dtype_include=np.number)
categorical_features = make_column_selector(dtype_exclude=np.number)

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('numerical', numerical_pipeline, numerical_features),
    ('categorical', categorical_pipeline, categorical_features)
])

regression_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [None]:
# Train the model
regression_pipeline.fit(X_regression_train, y_regression_train)

In [None]:
# Make predictions
y_regression_pred = regression_pipeline.predict(X_regression_test)
y_regression_pred

array([42.288, 38.424, 52.99 , ..., 51.   , 50.81 , 50.99 ])

In [None]:
# Evaluate the model
mae = mean_absolute_error(y_regression_test, y_regression_pred)
rmse = np.sqrt(mean_squared_error(y_regression_test, y_regression_pred))

print(f"Mean Absolute Error (Regression): {mae}")
print(f"Root Mean Squared Error (Regression): {rmse}")

Mean Absolute Error (Regression): 0.5925860000000003
Root Mean Squared Error (Regression): 0.9440987151776029


**for the classification part:**

In [None]:
X_classification = donnees.drop(["LoanApproved"], axis=1)
y_classification = donnees["LoanApproved"]

In [None]:
# Split data into training and test sets
X_classification_train, X_classification_test, y_classification_train, y_classification_test = train_test_split(X_classification, y_classification, test_size=0.2, random_state=42)

In [None]:
#classification pipeline
classification_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [None]:
#the classification model
classification_pipeline.fit(X_classification_train, y_classification_train)

In [None]:
y_classification_pred = classification_pipeline.predict(X_classification_test)
y_classification_pred

array([1, 1, 0, ..., 0, 0, 0])

In [None]:
#classification model evaluation
accuracy = classification_pipeline.score(X_classification_test, y_classification_test)
accuracy

0.97525

In [None]:
roc_auc = roc_auc_score(y_classification_test, y_classification_pred)
roc_auc

0.9542436639482139

l'accuracy=0.9 ceci montre que le modele est bon
**Remarque:**
- vous pouvez ajouter l'étude de la corrélation entre RiskScore et LoanApproved
- vous pouvez ajouter des étiquettes aux classes: au lieu de 0 faire refusé et au lieu de 1 faire accordé