In [None]:
import pandas as pd

# Read the asset inventory CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='asset.csv')

# Cell output: the first five rows.
df.head(n=5)

In [None]:
# Distribution of brand values as loaded (the following cell lower-cases this column).
df['brand'].value_counts()

In [None]:
# Lower-case every brand so differently-cased spellings collapse to one value.
df = df.assign(brand=df['brand'].str.lower())

In [None]:
# Brand distribution again, to inspect the effect of the lower-casing cell above.
df['brand'].value_counts()

In [None]:
# Number of rows per item category.
df['item'].value_counts()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Exclude rows whose item is 'SSD' or 'VGA CARD', then show the remaining categories.
is_excluded_item = df['item'].isin(['SSD', 'VGA CARD'])
df = df.loc[~is_excluded_item]
df['item'].value_counts()

In [None]:
# Independent copy of the rows that actually carry a serial number.
df_valid = df.dropna(subset=['serial_number']).copy()
print(f"‚úÖ Valid serial numbers: {len(df_valid)} rows")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# STEP 1: DATA LOADING & CLEANING
# ============================================================================
print("\nüìÇ [1/6] Loading and Cleaning Data...")
print(f"‚úÖ Loaded {len(df)} records")

# Keep only rows where both the serial number and the item label are present.
df_clean = df.dropna(subset=['serial_number', 'item']).copy()
print(f"‚úÖ After removing NaN: {len(df_clean)} records")

# Every row whose serial number occurs more than once (all occurrences are counted).
is_repeated_serial = df_clean['serial_number'].duplicated(keep=False)
duplicates = df_clean.loc[is_repeated_serial]
print(f"‚ö†Ô∏è  Found {len(duplicates)} duplicate serial numbers")

# Collapse repeated serial numbers to their first occurrence.
df_clean = df_clean.drop_duplicates(subset='serial_number', keep='first')
print(f"‚úÖ After deduplication: {len(df_clean)} records")

# Character-level features derived from a single serial number.
def extract_features(sn):
    """Describe a serial number by its length, character mix and leading characters.

    Args:
        sn: The serial number; it is converted with ``str()`` and stripped of
            surrounding whitespace before any feature is computed.

    Returns:
        dict with keys ``length``, ``digit_count``, ``letter_count``,
        ``first_3_chars`` (upper-cased, shorter if the text is shorter),
        ``first_char`` (upper-cased, ``''`` for empty text) and
        ``digit_ratio`` (digits / length, ``0`` for empty text).
    """
    text = str(sn).strip()
    size = len(text)

    # Count digits and letters in one pass over the text.
    digits = 0
    letters = 0
    for ch in text:
        if ch.isdigit():
            digits += 1
        if ch.isalpha():
            letters += 1

    # Slicing never raises on short input, so no explicit length guards are needed.
    return {
        'length': size,
        'digit_count': digits,
        'letter_count': letters,
        'first_3_chars': text[:3].upper(),
        'first_char': text[:1].upper(),
        'digit_ratio': digits / size if size else 0,
    }

# Extract the serial-number features for every cleaned row.
features_updated = df_clean['serial_number'].apply(extract_features)

# Build the feature frame on df_clean's own index. Column assignment from a
# DataFrame aligns on index labels, and df_clean's index has gaps after the
# filtering/deduplication above; a fresh 0..n-1 index would attach features to
# the wrong rows and leave NaN in the rest.
features_df_updated = pd.DataFrame(features_updated.tolist(), index=df_clean.index)

# Attach the feature columns to the cleaned frame.
feature_cols = ['length', 'digit_count', 'letter_count', 'first_3_chars', 'first_char', 'digit_ratio']
df_clean[feature_cols] = features_df_updated[feature_cols]

# Report Docking Station anomalies (length > 40)
docking_anomalies = df_clean[(df_clean['item'] == 'DOCKING') & (df_clean['length'] > 40)]
print(f"‚ö†Ô∏è  Found {len(docking_anomalies)} Docking Station anomalies (length > 40)")

# Truncate long serial numbers to first 20 characters.
# NOTE(review): the truncation applies to every item type (not only DOCKING), and
# the feature columns above still describe the untruncated serial number — confirm
# that both are intended.
is_too_long = df_clean['length'] > 40
df_clean.loc[is_too_long, 'serial_number'] = df_clean.loc[is_too_long, 'serial_number'].str[:20]

print(f"‚úÖ Data cleaned successfully!")

In [None]:
# ============================================================================
# STEP 2: RULE-BASED CLASSIFIER (Baseline)
# ============================================================================
# Progress banner for stage 2 of 6.
print("\nüßÆ [2/6] Building Rule-Based Classifier...")

def rule_based_predict(row):
    """Rule-based prediction of the item type from serial-number prefix patterns.

    Args:
        row: Mapping-like record (dict or pandas Series) providing
            ``'first_3_chars'`` and ``'length'``; ``'brand'`` is optional.

    Returns:
        One of ``'MONITOR'``, ``'DESKTOP'``, ``'LAPTOP'``, ``'EARPHONE'``,
        ``'HEADSET'`` or ``'DOCKING'``.
    """
    prefix = row['first_3_chars']
    length = row['length']

    # A missing prefix is treated as an empty string so the checks below are safe.
    if not isinstance(prefix, str):
        prefix = str(prefix) if pd.notna(prefix) else ''

    # Brand is optional; compare case-insensitively so 'Dell' and 'dell' both match.
    brand = row['brand'] if 'brand' in row else None
    is_dell = isinstance(brand, str) and brand.strip().lower() == 'dell'

    # Docking patterns.
    # 'CN0' is also a monitor prefix, so this rule must run BEFORE the monitor
    # check; placed after it, the branch could never be reached.
    if is_dell and prefix == 'CN0' and length > 20:
        return 'DOCKING'

    # Monitor patterns
    if prefix in ['CN0', 'TH0', 'CNC', 'CNK', '6CM']:
        return 'MONITOR'

    # Desktop patterns
    if prefix in ['SGH', '1CZ', '4CE']:
        return 'DESKTOP'

    # Laptop patterns
    if prefix.startswith('5C') or prefix in ['CNU', 'DXN', 'HS7']:
        return 'LAPTOP'

    # Headset patterns (2XX series)
    if prefix.startswith('2') and length == 12:
        if prefix in ['203', '210']:
            return 'EARPHONE'
        else:
            return 'HEADSET'

    # Default: use length heuristic
    if length <= 8:
        return 'LAPTOP'
    elif length <= 11:
        return 'DESKTOP'
    elif length == 12:
        return 'HEADSET'
    elif length > 20:
        return 'DOCKING'
    else:
        return 'MONITOR'
        
# Run the rule-based classifier over every cleaned row and store its answer.
rule_predictions = df_clean.apply(rule_based_predict, axis=1)
df_clean['rule_based_prediction'] = rule_predictions

# Score the baseline against the true item labels.
rb_accuracy = accuracy_score(y_true=df_clean['item'], y_pred=rule_predictions)
print(f"‚úÖ Rule-Based Accuracy: {rb_accuracy:.2%}")

In [None]:
# ============================================================================
# STEP 3: FEATURE ENGINEERING FOR ML
# ============================================================================
print("\nüîß [3/6] Feature Engineering for ML Model...")

# Integer-encode the two string-valued serial-number features.
le_prefix, le_first_char = LabelEncoder(), LabelEncoder()
prefix_codes = le_prefix.fit_transform(df_clean['first_3_chars'])
first_char_codes = le_first_char.fit_transform(df_clean['first_char'])
df_clean['prefix_encoded'] = prefix_codes
df_clean['first_char_encoded'] = first_char_codes

# Model inputs: the numeric features followed by the encoded categorical ones.
numeric_features = ['length', 'digit_count', 'letter_count', 'digit_ratio']
encoded_features = ['prefix_encoded', 'first_char_encoded']
feature_columns = numeric_features + encoded_features

# Brand is optional; missing values are labelled 'UNKNOWN' before encoding.
has_brand = 'brand' in df_clean.columns
if has_brand:
    le_brand = LabelEncoder()
    df_clean['brand_filled'] = df_clean['brand'].fillna('UNKNOWN')
    df_clean['brand_encoded'] = le_brand.fit_transform(df_clean['brand_filled'])
    feature_columns = feature_columns + ['brand_encoded']

# Design matrix and target.
X = df_clean[feature_columns]
y = df_clean['item']

print(f"‚úÖ Features prepared: {feature_columns}")
print(f"‚úÖ Dataset shape: {X.shape}")

In [None]:
# ============================================================================
# STEP 4: TRAIN RANDOM FOREST MODEL
# ============================================================================
print("\nüå≤ [4/6] Training Random Forest Classifier...")

# Hold out 20% of the rows for evaluation, stratified on the item label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"‚úÖ Training set: {len(X_train)} samples")
print(f"‚úÖ Test set: {len(X_test)} samples")

# Train model (fixed random_state so the fit is repeatable)
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = rf_model.predict(X_test)

# Evaluate
rf_accuracy = accuracy_score(y_test, y_pred)
print(f"\n‚úÖ Random Forest Accuracy: {rf_accuracy:.2%}")

print("\nüìä Classification Report:")
print(classification_report(y_test, y_pred))

print("\nüéØ Confusion Matrix:")
# Pin the label order to rf_model.classes_. Without `labels=` the matrix only
# covers labels present in y_test/y_pred, so a class missing from the test split
# would make its shape disagree with the index/columns below and raise.
cm = confusion_matrix(y_test, y_pred, labels=rf_model.classes_)
print(pd.DataFrame(cm,
                   index=rf_model.classes_,
                   columns=rf_model.classes_))

# Feature importance, most influential feature first
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\n‚≠ê Feature Importance:")
print(feature_importance)

In [None]:
# ============================================================================
# STEP 5: SAVE MODEL & ARTIFACTS
# ============================================================================
print("\nüíæ [5/6] Saving Model and Artifacts...")

# Persist the trained classifier.
joblib.dump(rf_model, 'asset_predictor_model.pkl')
print("‚úÖ Model saved: asset_predictor_model.pkl")

# Persist the label encoders; the brand encoder only exists when a brand column does.
encoder_artifacts = {
    'encoder_prefix.pkl': le_prefix,
    'encoder_first_char.pkl': le_first_char,
}
if 'brand' in df_clean.columns:
    encoder_artifacts['encoder_brand.pkl'] = le_brand
for artifact_path, encoder in encoder_artifacts.items():
    joblib.dump(encoder, artifact_path)
print("‚úÖ Encoders saved")

# Persist the ordered list of model input columns.
joblib.dump(feature_columns, 'feature_columns.pkl')
print("‚úÖ Feature config saved")

# Persist the cleaned dataset without its index column.
df_clean.to_csv('asset_cleaned.csv', index=False)
print("‚úÖ Cleaned dataset saved: asset_cleaned.csv")

In [None]:
# ============================================================================
# STEP 6: CREATE PREDICTION FUNCTION
# ============================================================================
# Progress banner for stage 6 of 6.
print("\nüîÆ [6/6] Creating Prediction API...")

def _encode_or_default(encoder, value, default=0):
    """Return ``encoder``'s integer code for ``value``, or ``default`` if it was not seen at fit time."""
    if value in encoder.classes_:
        return encoder.transform([value])[0]
    return default


def predict_asset_type(serial_number, brand=None):
    """Predict the item type of an asset from its serial number and optional brand.

    Args:
        serial_number: Serial number; converted with ``str()`` and stripped.
        brand: Optional brand name. Matching against the brands known to the
            encoder tries the value as given first, then a stripped,
            lower-cased form (the training column is lower-cased).

    Returns:
        dict with keys ``serial_number`` (as passed in), ``predicted_item``,
        ``confidence`` (highest class probability formatted as a percentage
        string), ``rule_based_prediction``, ``prefix`` and ``probabilities``
        (class label -> probability).

    Note:
        Values never seen by an encoder are mapped to code ``0``, which is also
        the code of that encoder's first known class.
    """
    # Extract features
    sn = str(serial_number).strip()
    features = extract_features(sn)

    # Prepare input
    input_data = {
        'length': features['length'],
        'digit_count': features['digit_count'],
        'letter_count': features['letter_count'],
        'digit_ratio': features['digit_ratio'],
        'prefix_encoded': _encode_or_default(le_prefix, features['first_3_chars']),
        'first_char_encoded': _encode_or_default(le_first_char, features['first_char']),
    }

    # Normalized spelling of the brand, used for encoder lookup and the rule baseline.
    brand_normalized = brand.strip().lower() if isinstance(brand, str) else brand

    if 'brand_encoded' in feature_columns:
        brand_label = 'UNKNOWN'
        if brand:
            for candidate in (brand, brand_normalized):
                if candidate in le_brand.classes_:
                    brand_label = candidate
                    break
        # 'UNKNOWN' is only an encoder class when the training data had missing
        # brands; the guarded lookup avoids transform() raising when it is not.
        input_data['brand_encoded'] = _encode_or_default(le_brand, brand_label)

    # Create DataFrame with the columns in the order the model was trained on
    X_input = pd.DataFrame([input_data])[feature_columns]

    # Predict
    prediction = rf_model.predict(X_input)[0]
    probabilities = rf_model.predict_proba(X_input)[0]
    confidence = max(probabilities)

    # Get rule-based prediction for comparison
    rule_pred = rule_based_predict(pd.Series({
        'first_3_chars': features['first_3_chars'],
        'length': features['length'],
        'brand': brand_normalized
    }))

    return {
        'serial_number': serial_number,
        'predicted_item': prediction,
        'confidence': f"{confidence:.2%}",
        'rule_based_prediction': rule_pred,
        'prefix': features['first_3_chars'],
        'probabilities': dict(zip(rf_model.classes_, probabilities))
    }

# Persist the prediction function.
# NOTE(review): pickle normally stores a function by reference (module + name), not
# its code or the model objects it uses — verify this file can actually be loaded
# outside this notebook session.
joblib.dump(predict_asset_type, 'prediction_function.pkl')
print("‚úÖ Prediction function saved")

# ============================================================================
# DEMO PREDICTIONS
# ============================================================================
separator = "=" * 70

print("\n" + separator)
print("üéØ DEMO PREDICTIONS")
print(separator)

# (serial number, brand) pairs used to exercise the prediction function.
test_samples = [
    ('CN0ABCD12345', 'dell'),
    ('SGH1234567', None),
    ('5CG7890ABC', None),
    ('251123456789', 'logitech'),
]

for sample_sn, sample_brand in test_samples:
    result = predict_asset_type(sample_sn, sample_brand)
    print(f"\nüìå Serial Number: {result['serial_number']}")
    print(f"   Predicted: {result['predicted_item']} (Confidence: {result['confidence']})")
    print(f"   Rule-Based: {result['rule_based_prediction']}")
    print(f"   Prefix: {result['prefix']}")

print("\n" + separator)
print("‚úÖ SYSTEM READY!")
print(separator)

# Summary of the artifacts written by the cells above.
print("\nüì¶ Generated Files:")
generated_file_lines = (
    "   1. asset_predictor_model.pkl - Trained model",
    "   2. encoder_*.pkl - Feature encoders",
    "   3. feature_columns.pkl - Feature configuration",
    "   4. prediction_function.pkl - Prediction API",
    "   5. asset_cleaned.csv - Cleaned dataset",
)
for file_line in generated_file_lines:
    print(file_line)

# Usage hint shown to the reader.
print("\nüí° Usage:")
usage_lines = (
    "   from prediction_api import predict_asset_type",
    "   result = predict_asset_type('CN0ABC123', 'dell')",
    "   print(result['predicted_item'])",
)
for usage_line in usage_lines:
    print(usage_line)