In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Load the dataset
df = pd.read_csv('gga.csv')
df.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [4]:
df.fillna(df.mean(), inplace=True)


In [5]:
# Split data into features and target for both models
X = df.drop(columns=['Personal Loan', 'ID'])
y_loan = df['Personal Loan']

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
# 1. Drop irrelevant columns
df_cleaned = df.drop(columns=['ID', 'ZIP Code'])


In [7]:
# 3. Feature Scaling using Standardization (Z-score normalization)
scaler = StandardScaler()
scaled_columns = ['Income', 'CCAvg', 'Mortgage']
df_cleaned[scaled_columns] = scaler.fit_transform(df_cleaned[scaled_columns])

In [8]:
# 4. Outlier Detection and Removal
def remove_outliers(df, columns):
    for column in columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df


In [9]:
df_cleaned = remove_outliers(df_cleaned, scaled_columns)

In [10]:
# 5. Feature Engineering

# 1. Income per Family Member
df_cleaned['Income_Per_Family'] = df_cleaned['Income'] / df_cleaned['Family']

# 2. Experience Level (Example thresholds: Junior: 0-5 years, Mid: 6-15 years, Senior: 16+ years)
def experience_level(experience):
    if experience <= 5:
        return 'Junior'
    elif 6 <= experience <= 15:
        return 'Mid'
    else:
        return 'Senior'

df_cleaned['Experience_Level'] = df_cleaned['Experience'].apply(experience_level)

# Convert Experience_Level to dummy variables
df_cleaned = pd.get_dummies(df_cleaned, columns=['Experience_Level'], drop_first=True)

# 3. High Mortgage Indicator
# Assuming a threshold of $100,000 (normalized value will depend on your data)
df_cleaned['High_Mortgage'] = (df_cleaned['Mortgage'] > 100000).astype(int)

# Check the newly engineered features
print("Engineered Data Head:")
print(df_cleaned.head())


Engineered Data Head:
   Age  Experience    Income  Family     CCAvg  Education  Mortgage  \
0   25           1 -0.538229       4 -0.193385          1 -0.555524   
1   45          19 -0.864109       3 -0.250611          1 -0.555524   
2   39          15 -1.363793       1 -0.536736          1 -0.555524   
3   35           9  0.569765       1  0.436091          2 -0.555524   
4   35           8 -0.625130       4 -0.536736          2 -0.555524   

   Personal Loan  Securities Account  CD Account  Online  CreditCard  \
0              0                   1           0       0           0   
1              0                   1           0       0           0   
2              0                   0           0       0           0   
3              0                   0           0       0           0   
4              0                   0           0       0           1   

   Income_Per_Family  Experience_Level_Mid  Experience_Level_Senior  \
0          -0.134557                 False     

In [11]:
# 6. Splitting the Data
X = df_cleaned.drop(columns=['Personal Loan'])  # Features
y = df_cleaned['Personal Loan']  # Target


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
print("Cleaned Data Head:")
print(df_cleaned.head())

print("\nTraining Set Shape:", X_train.shape, y_train.shape)
print("Testing Set Shape:", X_test.shape, y_test.shape)

Cleaned Data Head:
   Age  Experience    Income  Family     CCAvg  Education  Mortgage  \
0   25           1 -0.538229       4 -0.193385          1 -0.555524   
1   45          19 -0.864109       3 -0.250611          1 -0.555524   
2   39          15 -1.363793       1 -0.536736          1 -0.555524   
3   35           9  0.569765       1  0.436091          2 -0.555524   
4   35           8 -0.625130       4 -0.536736          2 -0.555524   

   Personal Loan  Securities Account  CD Account  Online  CreditCard  \
0              0                   1           0       0           0   
1              0                   1           0       0           0   
2              0                   0           0       0           0   
3              0                   0           0       0           0   
4              0                   0           0       0           1   

   Income_Per_Family  Experience_Level_Mid  Experience_Level_Senior  \
0          -0.134557                 False        

# Advanced Feature Engineering 

In [15]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. Polynomial Features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X)

# Convert the polynomial features back to a DataFrame to keep track of feature names
X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(X.columns))

# 2. Principal Component Analysis (PCA)
pca = PCA(n_components=10)  # Reduce to 10 principal components
X_pca = pca.fit_transform(X_poly_df)

# Convert the PCA features back to a DataFrame
X_pca_df = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(X_pca.shape[1])])

# View the updated dataset with PCA features
print("PCA-Transformed Dataset:")
print(X_pca_df.head())

# Save the PCA-transformed dataset to a CSV file
X_pca_df.to_csv('pca_transformed_data.csv', index=False)

PCA-Transformed Dataset:
           PC1        PC2        PC3        PC4        PC5        PC6  \
0 -1034.655776  14.053335  13.452737 -32.697047  -8.341947   9.619337   
1  -200.393931  38.359331   2.764505 -45.391106 -21.120683  21.390377   
2  -476.788419 -26.527191 -74.954644 -53.133604 -17.773972  29.005254   
3  -746.225186 -63.618141  10.831408  16.329253  -8.697958  11.159342   
4  -776.134639  56.548874  10.669941  -1.513723 -15.289122   0.518556   

         PC7        PC8        PC9       PC10  
0 -15.883611  -6.754720 -12.703890  20.887094  
1 -23.180241 -10.126387 -16.473498  44.873976  
2 -18.098243 -10.224656  32.474836  -6.049472  
3 -18.858855  -8.854512  21.053425  -4.965504  
4 -19.622399  26.179684  21.310067  -6.965651  


In [16]:
 #3. Train-test split using PCA-transformed data
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca_df, y, test_size=0.2, random_state=42)

# 4. Train the model using the PCA-transformed data
model_pca = LogisticRegression(random_state=42)
model_pca.fit(X_train_pca, y_train)

# 5. Make Predictions
y_pred_pca = model_pca.predict(X_test_pca)

# 6. Evaluate the Model
print("Confusion Matrix (PCA):")
print(confusion_matrix(y_test, y_pred_pca))

print("\nClassification Report (PCA):")
print(classification_report(y_test, y_pred_pca))

print("Accuracy Score (PCA):", accuracy_score(y_test, y_pred_pca))
 



Confusion Matrix (PCA):
[[831   6]
 [ 18  17]]

Classification Report (PCA):
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       837
           1       0.74      0.49      0.59        35

    accuracy                           0.97       872
   macro avg       0.86      0.74      0.79       872
weighted avg       0.97      0.97      0.97       872

Accuracy Score (PCA): 0.9724770642201835


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Initialize and train the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_pca, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test_pca)

# Evaluate the model
print("Confusion Matrix (Random Forest):")
print(confusion_matrix(y_test, y_pred_rf))

print("\nClassification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf))

print("Accuracy Score (Random Forest):", accuracy_score(y_test, y_pred_rf))


Confusion Matrix (Random Forest):
[[837   0]
 [ 14  21]]

Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       837
           1       1.00      0.60      0.75        35

    accuracy                           0.98       872
   macro avg       0.99      0.80      0.87       872
weighted avg       0.98      0.98      0.98       872

Accuracy Score (Random Forest): 0.9839449541284404


In [18]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize and train the Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_pca, y_train)

# Make predictions
y_pred_gb = gb_model.predict(X_test_pca)

# Evaluate the model
print("Confusion Matrix (Gradient Boosting):")
print(confusion_matrix(y_test, y_pred_gb))

print("\nClassification Report (Gradient Boosting):")
print(classification_report(y_test, y_pred_gb))

print("Accuracy Score (Gradient Boosting):", accuracy_score(y_test, y_pred_gb))


Confusion Matrix (Gradient Boosting):
[[834   3]
 [ 11  24]]

Classification Report (Gradient Boosting):
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       837
           1       0.89      0.69      0.77        35

    accuracy                           0.98       872
   macro avg       0.94      0.84      0.88       872
weighted avg       0.98      0.98      0.98       872

Accuracy Score (Gradient Boosting): 0.9839449541284404


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Assuming X_pca_df and y are already defined
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca_df, y, test_size=0.2, random_state=42)

# Define base models
base_models = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42))
]

# Define the meta-model
meta_model = LogisticRegression(random_state=42)

# Define the stacking model
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

# Train the stacking model
stacking_model.fit(X_train_pca, y_train)

# Make predictions
y_pred_stack = stacking_model.predict(X_test_pca)

# Evaluate the model
print("Confusion Matrix (Stacking):")
print(confusion_matrix(y_test, y_pred_stack))

print("\nClassification Report (Stacking):")
print(classification_report(y_test, y_pred_stack))

print("Accuracy Score (Stacking):", accuracy_score(y_test, y_pred_stack))


Confusion Matrix (Stacking):
[[833   4]
 [  9  26]]

Classification Report (Stacking):
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       837
           1       0.87      0.74      0.80        35

    accuracy                           0.99       872
   macro avg       0.93      0.87      0.90       872
weighted avg       0.98      0.99      0.98       872

Accuracy Score (Stacking): 0.9850917431192661


In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

# Define hyperparameter grids
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20]
}

param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1]
}

# Initialize base models
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)

# Perform grid search for RandomForest
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=5, n_jobs=-1)
grid_search_rf.fit(X_train_pca, y_train)

# Perform grid search for GradientBoosting
grid_search_gb = GridSearchCV(estimator=gb_model, param_grid=param_grid_gb, cv=5, n_jobs=-1)
grid_search_gb.fit(X_train_pca, y_train)

# Get the best models
best_rf = grid_search_rf.best_estimator_
best_gb = grid_search_gb.best_estimator_


In [21]:
# Define the meta-model
meta_model = LogisticRegression(random_state=42)

# Define the stacking model with the best base models
stacking_model = StackingClassifier(
    estimators=[('rf', best_rf), ('gb', best_gb)],
    final_estimator=meta_model,
    cv=5
)

# Train the stacking model
stacking_model.fit(X_train_pca, y_train)


In [22]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Make predictions
y_pred_stack = stacking_model.predict(X_test_pca)

# Evaluate
print("Confusion Matrix (Stacking):")
print(confusion_matrix(y_test, y_pred_stack))

print("\nClassification Report (Stacking):")
print(classification_report(y_test, y_pred_stack))

print("Accuracy Score (Stacking):", accuracy_score(y_test, y_pred_stack))


Confusion Matrix (Stacking):
[[835   2]
 [  9  26]]

Classification Report (Stacking):
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       837
           1       0.93      0.74      0.83        35

    accuracy                           0.99       872
   macro avg       0.96      0.87      0.91       872
weighted avg       0.99      0.99      0.99       872

Accuracy Score (Stacking): 0.9873853211009175


In [23]:
import pickle

# Save the stacking model
with open('stacking_model.pkl', 'wb') as model_file:
    pickle.dump(stacking_model, model_file)

# Save the scaler
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print("Model and scaler saved successfully!")


Model and scaler saved successfully!


In [51]:
pip install nbconvert


Note: you may need to restart the kernel to use updated packages.


In [53]:
!jupyter nbconvert --to script Untitled.ipynb



[NbConvertApp] Converting notebook Untitled.ipynb to script
[NbConvertApp] Writing 9063 bytes to Untitled.py
