In [8]:
import logging
import math
import os
import warnings
from collections import Counter
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
# Silence every warning category for the whole process.
# NOTE(review): this also hides warnings raised by the imported libraries;
# consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')

# Configure logging: the root logger emits INFO and above, formatted as
# "<timestamp> - <LEVEL> - <message>". basicConfig does nothing if the root
# logger already has handlers attached.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

def get_project_root():
    """Return the directory that should be treated as the project root.

    The starting point is the directory containing this file; when ``__file__``
    is not defined (as in an interactive notebook session) the current working
    directory is used instead. If that starting directory is itself named
    ``notebooks``, its parent is returned.
    """
    try:
        start_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # No __file__ available: fall back to wherever the process is running
        start_dir = os.getcwd()

    # A starting point called "notebooks" sits one level below the project root
    parent_dir, leaf_name = os.path.split(start_dir)
    return parent_dir if leaf_name == 'notebooks' else start_dir

def calculate_entropy(password):
    """Calculate the Shannon entropy of a password, in bits per character.

    The entropy is ``-sum(p * log2(p))`` taken over the relative frequency
    ``p`` of each distinct character in the password.

    Args:
        password: The string to analyse. Any falsy value (``""`` or ``None``)
            is treated as carrying no entropy.

    Returns:
        The entropy as a built-in ``float``. It is ``0.0`` for an empty
        password and for a password made of one repeated character.
    """
    if not password:
        return 0.0

    length = len(password)
    # Start from a float so that every code path returns the same type.
    entropy = 0.0
    for count in Counter(password).values():
        probability = count / length
        # math.log2 operates on plain Python floats, so the result stays a
        # built-in float rather than a NumPy scalar.
        entropy -= probability * math.log2(probability)

    return entropy

def extract_features(password):
    """Build the feature dictionary for a single password.

    The returned dict holds, in this order: ``length``, the number of
    ``lowercase``, ``uppercase`` and ``digits`` characters, the number of
    ``special`` (non-alphanumeric) characters, and the ``entropy`` value
    returned by ``calculate_entropy``.
    """
    total_chars = len(password)

    n_lower = n_upper = n_digit = n_special = 0
    for ch in password:
        # The predicates are checked independently of one another, so a
        # character is counted under every category it satisfies.
        if ch.islower():
            n_lower += 1
        if ch.isupper():
            n_upper += 1
        if ch.isdigit():
            n_digit += 1
        if not ch.isalnum():
            n_special += 1

    return {
        'length': total_chars,
        'lowercase': n_lower,
        'uppercase': n_upper,
        'digits': n_digit,
        'special': n_special,
        'entropy': calculate_entropy(password),
    }

def load_and_process_data():
    """Load the password CSV files and convert them into a feature table.

    Five files named ``pwlds_<strength>.csv`` are looked up under
    ``<project root>/data/raw``. Each file that can be read is labelled with
    the strength class taken from its file name. Files that are missing,
    unreadable, or lack a password column are logged as errors and skipped;
    the remaining files are still processed.

    Returns:
        A ``pandas.DataFrame`` with one row per password, holding the columns
        produced by ``extract_features`` plus a ``strength`` label column, or
        ``None`` when no file could be loaded at all.
    """
    project_root = get_project_root()
    logger.info(f"Project root directory: {project_root}")

    raw_dir = os.path.join(project_root, 'data', 'raw')
    strength_levels = ('very_weak', 'weak', 'average', 'strong', 'very_strong')
    datasets = {
        level: os.path.join(raw_dir, f'pwlds_{level}.csv')
        for level in strength_levels
    }

    all_data = []
    for strength, file_path in datasets.items():
        logger.info(f"Attempting to load: {file_path}")
        if not os.path.exists(file_path):
            logger.error(f"File does not exist: {file_path}")
            continue

        try:
            # Read every column as text and switch off NA detection. With the
            # pandas defaults, passwords such as "NA", "null" or "nan" become
            # float NaN and all-digit columns are inferred as integers; both
            # would make the len()/character checks downstream fail.
            df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
        except Exception as e:
            logger.error(f"Error loading {file_path}: {str(e)}")
            continue

        # Rename columns to match our expected format
        df = df.rename(columns={'Password': 'password', 'Strength_Level': 'strength_level'})
        if 'password' not in df.columns:
            logger.error(f"No password column found in {file_path}")
            continue

        # With NA detection off, blank cells arrive as empty strings; they
        # carry no information, so leave them out of the training data.
        has_password = df['password'] != ''
        skipped = int((~has_password).sum())
        if skipped:
            logger.warning(f"Skipping {skipped} rows with an empty password in {file_path}")

        df = df[has_password].assign(strength=strength)
        all_data.append(df)
        logger.info(f"Loaded {len(df)} passwords from {file_path}")

    if not all_data:
        logger.error("No datasets were loaded successfully")
        return None

    # Combine all datasets; ignore_index gives the combined frame a fresh
    # 0..n-1 index, which the label assignment below relies on.
    df = pd.concat(all_data, ignore_index=True)
    logger.info(f"Total dataset size: {len(df)} passwords")

    # Extract one feature dict per password, in row order
    features_df = pd.DataFrame([extract_features(password) for password in df['password']])
    features_df['strength'] = df['strength']

    logger.info(f"Extracted features for {len(features_df)} passwords")
    return features_df

def train_model(features_df):
    """Fit a grid-searched random forest on the feature table.

    The ``strength`` column is the target and every other column is an input.
    20% of the rows are held out as a test set; the rest is used for a 3-fold
    grid search. The chosen parameters are logged, a classification report
    for the held-out rows is printed, and the best estimator is returned.
    """
    target = features_df['strength']
    inputs = features_df.drop(columns='strength')

    # Fixed random_state keeps the train/test partition reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        inputs, target, test_size=0.2, random_state=42
    )
    logger.info(f"Training set size: {len(X_train)}")
    logger.info(f"Test set size: {len(X_test)}")

    # Deliberately small grid (reduced for faster training)
    search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        {
            'n_estimators': [50, 100],
            'max_depth': [10, 20],
            'min_samples_split': [2, 5],
        },
        cv=3,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    logger.info(f"Best parameters: {search.best_params_}")

    # Report performance on the rows the search never saw
    predictions = search.predict(X_test)
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

    return search.best_estimator_

def save_model(model):
    """Persist the trained model under ``<project root>/models``.

    The ``models`` directory is created when it is missing, the model is
    written there as ``password_strength_model.joblib``, and the destination
    path is logged.
    """
    output_dir = os.path.join(get_project_root(), 'models')
    # exist_ok lets repeated runs reuse a directory that is already there
    Path(output_dir).mkdir(exist_ok=True)

    destination = os.path.join(output_dir, 'password_strength_model.joblib')
    joblib.dump(model, destination)
    logger.info(f"Model saved successfully to {destination}")

def main():
    """Run the password analysis pipeline: load data, train, save the model.

    Returns early without training when ``load_and_process_data`` yields
    ``None``. Any exception raised by a pipeline step is caught here and
    logged together with its traceback; it is not re-raised.
    """
    try:
        # Load and process data
        features_df = load_and_process_data()
        if features_df is None:
            # The loader has already logged why no data is available.
            return

        # Train model
        model = train_model(features_df)

        # Save model
        save_model(model)

        logger.info("Password analysis completed successfully")

    except Exception as e:
        # Top-level boundary: keep the run from crashing, but record the full
        # traceback so the failing step can be identified from the log.
        logger.exception(f"Error in main pipeline: {str(e)}")

# Run the pipeline when this file is executed directly rather than imported.
if __name__ == "__main__":
    main() 

2025-05-17 21:28:20,963 - INFO - Project root directory: c:\Users\hp\Desktop\AI\SYSMDP
2025-05-17 21:28:20,964 - INFO - Attempting to load: c:\Users\hp\Desktop\AI\SYSMDP\data\raw\pwlds_very_weak.csv
2025-05-17 21:28:20,972 - INFO - Loaded 5000 passwords from c:\Users\hp\Desktop\AI\SYSMDP\data\raw\pwlds_very_weak.csv
2025-05-17 21:28:20,973 - INFO - Attempting to load: c:\Users\hp\Desktop\AI\SYSMDP\data\raw\pwlds_weak.csv


2025-05-17 21:28:20,979 - INFO - Loaded 5000 passwords from c:\Users\hp\Desktop\AI\SYSMDP\data\raw\pwlds_weak.csv
2025-05-17 21:28:20,980 - INFO - Attempting to load: c:\Users\hp\Desktop\AI\SYSMDP\data\raw\pwlds_average.csv
2025-05-17 21:28:20,987 - INFO - Loaded 5000 passwords from c:\Users\hp\Desktop\AI\SYSMDP\data\raw\pwlds_average.csv
2025-05-17 21:28:20,988 - INFO - Attempting to load: c:\Users\hp\Desktop\AI\SYSMDP\data\raw\pwlds_strong.csv
2025-05-17 21:28:20,996 - INFO - Loaded 5000 passwords from c:\Users\hp\Desktop\AI\SYSMDP\data\raw\pwlds_strong.csv
2025-05-17 21:28:20,996 - INFO - Attempting to load: c:\Users\hp\Desktop\AI\SYSMDP\data\raw\pwlds_very_strong.csv
2025-05-17 21:28:21,005 - INFO - Loaded 5000 passwords from c:\Users\hp\Desktop\AI\SYSMDP\data\raw\pwlds_very_strong.csv
2025-05-17 21:28:21,007 - INFO - Total dataset size: 25000 passwords
2025-05-17 21:28:21,629 - INFO - Extracted features for 25000 passwords
2025-05-17 21:28:21,637 - INFO - Training set size: 20000



Classification Report:
              precision    recall  f1-score   support

     average       0.84      0.94      0.89       985
      strong       0.92      0.81      0.86       989
 very_strong       1.00      1.00      1.00      1005
   very_weak       1.00      1.00      1.00      1021
        weak       0.94      0.93      0.93      1000

    accuracy                           0.94      5000
   macro avg       0.94      0.94      0.94      5000
weighted avg       0.94      0.94      0.94      5000

