In [1]:
# Import necessary libraries
import pandas as pd


In [2]:
# Load the dataset, skipping the first row and assigning column names manually
df = pd.read_csv(r"C:\Users\Chrizel\Documents\Copy of default of credit card clients.csv", header=None, skiprows=1)

In [3]:
df.columns = [
    "ID", "LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE",
    "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6",
    "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6",
    "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6",
    "default payment next month"
]


In [4]:
# Convert all columns to numeric, coerce errors to NaN
df = df.apply(pd.to_numeric, errors='coerce')

In [5]:
# Display the first 5 rows, data types, and missing value count
print(df.head())
print(df.dtypes)


    ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE   AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
0  NaN        NaN  NaN        NaN       NaN   NaN    NaN    NaN    NaN    NaN   
1  1.0    20000.0  2.0        2.0       1.0  24.0    2.0    2.0   -1.0   -1.0   
2  2.0   120000.0  2.0        2.0       2.0  26.0   -1.0    2.0    0.0    0.0   
3  3.0    90000.0  2.0        2.0       2.0  34.0    0.0    0.0    0.0    0.0   
4  4.0    50000.0  2.0        2.0       1.0  37.0    0.0    0.0    0.0    0.0   

   ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  \
0  ...        NaN        NaN        NaN       NaN       NaN       NaN   
1  ...        0.0        0.0        0.0       0.0     689.0       0.0   
2  ...     3272.0     3455.0     3261.0       0.0    1000.0    1000.0   
3  ...    14331.0    14948.0    15549.0    1518.0    1500.0    1000.0   
4  ...    28314.0    28959.0    29547.0    2000.0    2019.0    1200.0   

   PAY_AMT4  PAY_AMT5  PAY_AMT6  default payment next month  
0       NaN 

In [27]:
print(df.isnull().sum())  

# Drop rows with missing values
df = df.dropna()


ID                            1
LIMIT_BAL                     1
SEX                           1
EDUCATION                     1
MARRIAGE                      1
AGE                           1
PAY_0                         1
PAY_2                         1
PAY_3                         1
PAY_4                         1
PAY_5                         1
PAY_6                         1
BILL_AMT1                     1
BILL_AMT2                     1
BILL_AMT3                     1
BILL_AMT4                     1
BILL_AMT5                     1
BILL_AMT6                     1
PAY_AMT1                      1
PAY_AMT2                      1
PAY_AMT3                      1
PAY_AMT4                      1
PAY_AMT5                      1
PAY_AMT6                      1
default payment next month    1
dtype: int64


In [28]:
# Separate features (X) and target variable (y)
X = df.drop(columns=["ID", "default payment next month"])
y = df["default payment next month"]


In [29]:
# Import train_test_split to split data
from sklearn.model_selection import train_test_split

# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [30]:
# Import necessary tools for scaling and logistic regression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model with scaled data
model = LogisticRegression(max_iter=5000)
model.fit(X_train_scaled, y_train)

# Predict on test data
y_pred = model.predict(X_test_scaled)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


Accuracy: 0.8100
Precision: 0.6927
Recall: 0.2369


In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

# Train on scaled data (or you can use unscaled, RF works fine without scaling)
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred_rf = rf_model.predict(X_test)

# Evaluate
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)

print(f"Random Forest Accuracy: {accuracy_rf:.4f}")
print(f"Random Forest Precision: {precision_rf:.4f}")
print(f"Random Forest Recall: {recall_rf:.4f}")


Random Forest Accuracy: 0.8160
Random Forest Precision: 0.6380
Random Forest Recall: 0.3679


# Credit Scoring Model - Internship Task Report

## Objective
Build a machine learning model to predict a person’s creditworthiness using historical financial data. The goal is to classify whether a client will default on their credit card payment next month.



##  Dataset
The dataset contains 30,001 records of credit card clients with features such as:

- `LIMIT_BAL`: Credit limit  
- `SEX`, `EDUCATION`, `MARRIAGE`, `AGE`: Demographic info  
- `PAY_0` to `PAY_6`: Payment status history  
- `BILL_AMT1` to `BILL_AMT6`: Bill statement amounts  
- `PAY_AMT1` to `PAY_AMT6`: Payment amounts  
- `default payment next month`: Target variable (0 = no default, 1 = default)



##  Data Preprocessing
- Loaded the data and assigned column names  
- Converted all features to numeric and dropped missing values  
- Split data into features (`X`) and target (`y`)  
- Applied train-test split (80% training, 20% testing)  
- Used `StandardScaler` for feature scaling (required for Logistic Regression)



##  Models Used
- **Logistic Regression**  
- **Random Forest Classifier**



## Evaluation Metrics
- **Accuracy**: Correct predictions out of total samples  
- **Precision**: How many predicted defaults were actual defaults  
- **Recall**: How many actual defaults were correctly predicted



##  Results

| Model               | Accuracy | Precision | Recall  |
|---------------------|----------|-----------|---------|
| Logistic Regression | 0.8100   | 0.6927    | 0.2369  |
| Random Forest       | 0.8160   | 0.6380    | 0.3679  |



##  Analysis
- Random Forest performed slightly better in both accuracy and recall  
- Logistic Regression had slightly better precision  
- Random Forest is more effective for identifying defaulters (better recall)



##  Conclusion
The task was successfully completed using Python, pandas, and scikit-learn.  
Both models were trained and evaluated. Random Forest showed better performance in catching defaulters, which is critical in credit risk assessment.  
Further improvements could include hyperparameter tuning and feature engineering.
