In [3]:
# 📦 improved_modeling.py

from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

# ✅ Configuration: column names, the seed shared by the split and the forest,
#    and where the tuned model is written
TEXT_COL = 'Job Description'
LABEL_COL = 'Cluster Label'
SEED = 42
model_path = Path('../models/renovation_cluster_model.pkl')

# ✅ Load the cleaned dataset
file_path = '../data/processed_data/apartment_cluster_labeled.csv'
df = pd.read_csv(file_path)

# Fail early with a readable message if the CSV lacks a column used below,
# instead of a bare KeyError partway through the cell.
missing_cols = [col for col in (TEXT_COL, LABEL_COL) if col not in df.columns]
if missing_cols:
    raise KeyError(f"{file_path} is missing required column(s): {missing_cols}")

# ✅ Drop rows without a label (dropna is a no-op when there are none,
#    so no separate isnull() check is needed)
df = df.dropna(subset=[LABEL_COL])

# ✅ Feature and label
# Missing descriptions become empty strings so the vectorizer receives only str values.
X = df[TEXT_COL].fillna("")
y = df[LABEL_COL]

# ✅ Train-test split (80/20, stratified on the label so both sides keep the class mix)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# ✅ Build pipeline: TF-IDF over the description text feeding a random forest
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=1000)),
    ('clf', RandomForestClassifier(class_weight='balanced', random_state=SEED))
])

# ✅ Grid search parameters (2 x 3 = 6 candidates; with cv=3 that is 18 fits)
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20]
}

grid = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

# ✅ Evaluation on the held-out test split
y_pred = grid.predict(X_test)
print("\n✅ Classification Report:")
print(classification_report(y_test, y_pred))
print("✅ Accuracy:", accuracy_score(y_test, y_pred))

# ✅ Save model
# Create the target directory first: joblib.dump does not create missing parent
# directories, and failing here would discard the grid-search result just computed.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(grid.best_estimator_, model_path)
print("\n✅ Tuned model saved to models/renovation_cluster_model.pkl")

Fitting 3 folds for each of 6 candidates, totalling 18 fits

✅ Classification Report:
                        precision    recall  f1-score   support

       Combining Units       0.95      0.95      0.95       657
    Egress & Occupancy       1.00      0.91      0.95        69
Interior Modifications       0.94      0.95      0.95       609
   Plumbing & Fixtures       0.96      0.96      0.96      1309
       Structural Work       0.96      0.94      0.95       481

              accuracy                           0.95      3125
             macro avg       0.96      0.94      0.95      3125
          weighted avg       0.95      0.95      0.95      3125

✅ Accuracy: 0.95424

✅ Tuned model saved to models/renovation_cluster_model.pkl


In [2]:
# Print the installed scikit-learn version next to the run's outputs.
# NOTE(review): presumably recorded so the saved model can be reloaded under a
# matching scikit-learn release — confirm.
import sklearn
print(sklearn.__version__)

1.3.2
