# COMP 7103 C 
**Assignment**

**Group 4**

In [66]:
# 创建Conda虚拟环境并添加到jupyter notebook kernal中

# ! conda create -n random_forest python=3.10
# !pip install ipykernel
# !python -m ipykernel install --name random_forest

# 配置环境后，需要重启jupyter notebook
# !conda env list

In [67]:
# !pip install scikit-learn

In [81]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf

In [82]:
# 1. Read data
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')
validate_data = pd.read_csv('Validate.csv')

For almost all the attributes, there is a possible value `nan` which means `unavailable`. 

For numeric columns, fill `nan` values with the `mean`; for categorical columns, fill `nan` values with the `mode`.

In [84]:
# 2. Data preprocessing
def preprocess_data(df):
    df = df.copy()
    
    # Handle missing values
    numeric_columns = ['Age', 'Years_of_Working ', 'Family_Members']
    for col in numeric_columns:
        df[col] = df[col].fillna(df[col].mean()) 
    
    categorical_columns = ['Profession', 'Graduate', 'Gender', 'Married']
    for col in categorical_columns:
        df[col] = df[col].fillna(df[col].mode()[0])
    
    # Create LabelEncoder object
    le = LabelEncoder()
    
    # Convert string features to numeric
    encode_columns = ['Profession', 'Spending_Score', 'Category', 'Graduate', 'Gender', 'Married'] 
    for col in encode_columns:
        df[col] = le.fit_transform(df[col].astype(str))
    
    return df

In [85]:
# Check missing values before preprocessing
print("Missing values before preprocessing:")
print(train_data.isnull().sum())

# Apply preprocessing
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)
validate_data = preprocess_data(validate_data)

# Check missing values after preprocessing
print("\nMissing values after preprocessing:")
print(train_data.isnull().sum())

Missing values before preprocessing:
ID                     0
Married                0
Gender               130
Age                    0
Graduate              77
Profession           112
Years_of_Working     742
Spending_Score         0
Family_Members       310
Category              72
Class(Target)          0
dtype: int64

Missing values after preprocessing:
ID                   0
Married              0
Gender               0
Age                  0
Graduate             0
Profession           0
Years_of_Working     0
Spending_Score       0
Family_Members       0
Category             0
Class(Target)        0
dtype: int64


In [88]:
# Process training, test and validation data
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)
validate_data = preprocess_data(validate_data)

In [89]:
# print(train_data.columns.tolist())

# 3. Prepare features and target variables
feature_columns = ['ID', 'Married', 'Gender', 'Age', 'Graduate', 'Profession',
                  'Years_of_Working ', 'Spending_Score', 'Family_Members', 'Category']

X_train = train_data[feature_columns]
y_train = train_data['Class(Target)']

X_test = test_data[feature_columns]
X_validate = validate_data[feature_columns]
y_validate = validate_data['Class(Target)']


Parametric grid search using GridSearchCV

In [90]:
from sklearn.model_selection import GridSearchCV

# Define a relatively small parameter grid to speed up the search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt'],
    'class_weight': ['balanced']
}

# Create random forest model
rf = RandomForestClassifier(random_state=42)

# Create grid search object, set n_jobs=1 to avoid parallel processing issues
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=1,  # Set to 1 to avoid parallel processing issues
    scoring='accuracy',
    verbose=2
)

# Train the grid search model
grid_search.fit(X_train, y_train)

# Print best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Create new model with best parameters
best_rf = RandomForestClassifier(**grid_search.best_params_, random_state=42)
best_rf.fit(X_train, y_train)

# Evaluate optimized model
validate_predictions = best_rf.predict(X_validate)
validate_accuracy = accuracy_score(y_validate, validate_predictions)
print("\nOptimized validation set accuracy:", validate_accuracy)
print("\nOptimized classification report:")
print(classification_report(y_validate, validate_predictions))

# Output feature importance
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': best_rf.feature_importances_
})
print("\nFeature importance:")
print(feature_importance.sort_values('importance', ascending=False))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.6s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_lea

#### Best parameter
`{'class_weight': 'balanced', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}`

#### Best Score
`0.47923091398640627`

#### Optimized validation set accuracy
`0.4593077642656688`

#### Classification Report
|              | precision | recall | f1-score | support |
|--------------|-----------|---------|----------|---------|
| A            | 0.41      | 0.59    | 0849     | 241     |
| B            | 0.55      | 0.66    | 0.60     | 271     |
| C            |50.48      |40.12    220.19     | 269     |
| D            1 0.40      | 0.47    4 0.43     | 288     |
| accuracy     |           |         | 0.46     | 1069    |
| macro avg    | 0.46      | 0.46    | 0.43     | 1069    |
| weighted avg | 0.46      | 0.46    | 0.43     | 10.027794 |arried | 0.028010 |


#### Importance

| rank | feature | importance | 
|------|----------|------------|
|  1   | Age | 0.273644 |
|  2   | ID | 0.149208 |
|  3   | Profession | 0.143670 |
|  4   | Spending_Score | 0.088379 |
|  5   | Family_Members | 0.079100 |
|  6   | Years_of_Working | 0.079040 |
|  7   | Category | 0.059044 |
|  8   | Graduate | 0.054998 |
|  9   | Gender | 0.050621 |
|  10  | Married | 0.027794 |


Based on the observation above,`Age`、`Profession` and `Years_of_Working` are most import features. Thus, here to try some 
optimizationstrategy like feature engineering.
- Create interaction features
- Perform binning on continuous variables (`Years_of_Working` column contains many duplicate values, which prevents proper binning. This can be resolved by adding the 'duplicates' parameter to handle these duplicate values.)
- Apply better encoding methods for categorical variables

The `recall` rate of `C` category is low, so need to find a way to enhance it. 

In [80]:
def feature_engineering(X):
    X = X.copy()
    
    # 年龄分组
    X['Age_Group'] = pd.qcut(X['Age'], q=5, labels=['Very_Young', 'Young', 'Middle', 'Senior', 'Elder'])
    
    # 工作年限分组 - 使用自定义边界而不是等频分箱
    X['Experience_Level'] = pd.cut(X['Years_of_Working '], 
                                 bins=[0, 2, 5, 10, float('inf')],
                                 labels=['Entry', 'Junior', 'Mid', 'Senior'],
                                 include_lowest=True)
    
    # 创建交互特征
    X['Age_Experience_Ratio'] = X['Age'] / (X['Years_of_Working '] + 1)  # 避免除以0
    X['Spending_Per_Family'] = X['Spending_Score'] / X['Family_Members']
    
    # 对类别型变量进行独热编码
    cat_columns = ['Profession', 'Graduate', 'Age_Group', 'Experience_Level']
    X = pd.get_dummies(X, columns=cat_columns)
    
    return X

# 2. 重新定义参数网格
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 15, 20, 25, 30],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
    'class_weight': ['balanced', 'balanced_subsample']
}

# 3. 创建和训练模型
rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=1,
    scoring='f1_macro',
    verbose=2
)

# 应用特征工程
X_train_engineered = feature_engineering(X_train)
X_validate_engineered = feature_engineering(X_validate)

# 训练模型
grid_search.fit(X_train_engineered, y_train)

# 4. 输出结果
print("最佳参数:", grid_search.best_params_)
print("最佳得分:", grid_search.best_score_)

# 使用最佳参数创建新模型
best_rf = RandomForestClassifier(**grid_search.best_params_, random_state=42)
best_rf.fit(X_train_engineered, y_train)

# 评估优化后的模型
validate_predictions = best_rf.predict(X_validate_engineered)
validate_accuracy = accuracy_score(y_validate, validate_predictions)
print("\n优化后验证集准确率:", validate_accuracy)
print("\n优化后分类报告:")
print(classification_report(y_validate, validate_predictions))

Fitting 5 folds for each of 400 candidates, totalling 2000 fits
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.6s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_l