In [22]:
import os
os.chdir("C:/Users/Work Test 2/Desktop/AI QA CAD")

In [23]:
import glob
print(glob.glob("ml/data/raw/*.features.csv"))


['ml/data/raw\\01262-24-PAD-R0P4.features.csv', 'ml/data/raw\\As-Built S.W.16-64-21-5_Rev0.features.csv', 'ml/data/raw\\FC667 Pad Site 14-33-60-19-5_Rev3.features.csv', 'ml/data/raw\\FC814 Pad Site 3-5-63-17-5_Rev1.features.csv', 'ml/data/raw\\Notification Plan 1-12-64-24-5_Rev2.features.csv', 'ml/data/raw\\PAD 2-28-64-3-6 Rev4.features.csv', 'ml/data/raw\\PAD 3-7-65-8-6_Rev3.features.csv', 'ml/data/raw\\Pad Site 1-28-61-20-5_Rev1 Prelim3.features.csv', 'ml/data/raw\\Pad Site 1-28-61-20-5_Rev1.features.csv', 'ml/data/raw\\Pad Site 10-12-66-6-6_Rev0 Prelim5.features.csv', 'ml/data/raw\\PAD SITE 10-27-64-3-6_Rev8.features.csv', 'ml/data/raw\\Pad Site 3-29-64-2-6_Rev0.features.csv', 'ml/data/raw\\Pad Site 6-1-66-4-6_Rev1.features.csv', 'ml/data/raw\\Pad Site 6-10-81-13-5_Rev5.features.csv', 'ml/data/raw\\Pad Site 8-1-44-7-5 Rev0.features.csv', 'ml/data/raw\\Pad Site 9-20-66-3-6_Rev1.features.csv', 'ml/data/raw\\PLA 9-35-65-3-6 -to 10-19-65-2-6_Rev0.features.csv', 'ml/data/raw\\PLA KRN 16-

# Layer Classification Model
This notebook loads feature CSVs generated from CAD drawings, trains a simple classifier to predict the appropriate layer for text entities based on their properties, and saves the trained model.

In [24]:
from pathlib import Path
import pandas as pd

data_dir   = Path("ml/data/raw")
csv_files  = list(data_dir.glob("*.features.csv"))
print("📄 files:", len(csv_files))

frames = []
for f in csv_files:
    df = pd.read_csv(f, on_bad_lines="skip")
    df["source_file"] = f.name      # 🟢 <-- KEEP THIS LINE!
    frames.append(df)

# merge once, not twice
features = pd.concat(frames, ignore_index=True)
features = features.dropna(subset=["TextString", "Layer"])
print("🟢 rows used:", len(features))


📄 files: 23
🟢 rows used: 7861


In [25]:
# Example preprocessing: use simple heuristics for demo
# Here we will treat 'TextString' as the only feature and the target is 'Layer'
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X = features['TextString']
y = features['Layer']

model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))


                     precision    recall  f1-score   support

                  0       0.77      0.78      0.77        68
                 35       1.00      1.00      1.00         3
    ABADATA-DRILL-T       0.91      0.96      0.94        76
         AS-DRILL-T       0.94      0.98      0.96        49
           AS-FENCE       0.00      0.00      0.00         2
         AS-GENERAL       0.75      0.43      0.55         7
         AS-PIPE-UG       0.00      0.00      0.00         2
       AS-PIPE-UG-T       1.00      1.00      1.00         2
         AS-POWER-T       0.00      0.00      0.00         1
       AS-RESIDENCE       0.00      0.00      0.00         3
      AS-TARGET-GAS       1.00      1.00      1.00        30
      AS-TARGET-OIL       1.00      1.00      1.00         5
AS-TRAPPER BOUNDARY       0.00      0.00      0.00         2
        AS-WILDLIFE       1.00      0.67      0.80         3
         AUX-BUFFER       0.88      1.00      0.93         7
             C-AR-T    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [26]:
# Save the trained model
import joblib
import os
os.makedirs('ml/artifacts', exist_ok=True)
joblib.dump(model, 'ml/artifacts/layer_clf.pkl')
print('Model saved to ml/artifacts/layer_clf.pkl')


Model saved to ml/artifacts/layer_clf.pkl


In [27]:
features["Layer"].value_counts()


Layer
L-BD              1091
L-SECTION-LSD      538
L-BD1              482
T-TIE-T            472
RoadSeg            444
                  ... 
T-WATER              1
L-AREAWORK           1
_.undo               1
P-FNC BUFFER-T       1
C-FLOWLINE-T         1
Name: count, Length: 93, dtype: int64

In [28]:
# --- Quick sanity‑checks ----------------------------------------------
# 1) How many rows did we import (before dropna)?
rows_before = sum(
    (sum(1 for _ in open(f, encoding="utf‑8")) - 1)   # lines per file minus header
    for f in csv_files
)
print("Rows BEFORE dropna:", rows_before)

# 2) How many rows remained?
print("Rows AFTER  dropna:", features.shape[0])
print("Rows DROPPED      :", rows_before - features.shape[0])

# 3) Missing fields count
print("Missing Layer      :", features["Layer"].isna().sum())
print("Missing TextString :", features["TextString"].isna().sum())

# 4) Rows kept per file (top 10)
if "source_file" in features.columns:
    rows_per_file = (
        features.groupby("source_file")
                .size()
                .sort_values(ascending=False)
    )
    print("\nRows kept per file (top 10):")
    print(rows_per_file.head(10))
    print("Total files used:", rows_per_file.shape[0])
    print("Total rows kept :", rows_per_file.sum())
else:
    print("\n⚠️  'source_file' column missing – re‑run the import cell above.")


Rows BEFORE dropna: 7989
Rows AFTER  dropna: 7861
Rows DROPPED      : 128
Missing Layer      : 0
Missing TextString : 0

Rows kept per file (top 10):
source_file
01262-24-PAD-R0P4.features.csv                            1418
PLA KRN 16-35-65-3-6 to 16-25-65-3-6_Rev2.features.csv     826
SML2 7-52-6-5 to 32-51-6-5_Rev3 Final.features.csv         786
FC814 Pad Site 3-5-63-17-5_Rev1.features.csv               620
Pad Site 6-1-66-4-6_Rev1.features.csv                      512
PAD SITE 10-27-64-3-6_Rev8.features.csv                    499
As-Built S.W.16-64-21-5_Rev0.features.csv                  428
S22830 FC658 Pad Site 3-5-62-19-5_Rev6.features.csv        330
FC667 Pad Site 14-33-60-19-5_Rev3.features.csv             329
Pad Site 6-10-81-13-5_Rev5.features.csv                    311
dtype: int64
Total files used: 23
Total rows kept : 7861
