In [77]:
import pandas as pd
import numpy as np

In [78]:
main_data = pd.read_csv('LetsCheck.csv')

In [93]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import pandas as pd  # Make sure this is imported

# Important Features (Assigning Higher Weights)
feature_weights = {
    'Hair Fall Rate': 19,  # Very Important Feature
    'Genetics': 2,  # Important
    'Stress': 2,  # Important
    'Hormonal Changes': 2,  # Important
    'Sleep Disturbance': 2,  # Important
}

# Prepare Features and Target
X_risk = main_data.drop('Label', axis=1)  # Features
y_risk = main_data['Label']  # Target variable

# Apply feature importance weights (Multiply Selected Columns)
for feature, weight in feature_weights.items():
    if feature in X_risk.columns:
        X_risk[feature] = X_risk[feature] * weight  # Boost its value

# First split: 80% train, 20% temp (which will be split into validation and test)
X_train, X_temp, y_train, y_temp = train_test_split(X_risk , y_risk , test_size=0.2, random_state=42)

# Second split: 50% validation, 50% test from the temp set
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize SMOTE (to handle class imbalance)
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Apply SMOTE and preserve feature names
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
X_train_resampled = pd.DataFrame(X_train_resampled, columns=X_train.columns)  # This is the key change

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the resampled training data
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Predict on the validation set
y_val_pred = rf_classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_class_report = classification_report(y_val, y_val_pred)

# Display the results for the validation set
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("Validation Classification Report:")
print(val_class_report)

# Predict on the test set
y_test_pred = rf_classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_class_report = classification_report(y_test, y_test_pred)

# Display the results for the test set
print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Classification Report:")
print(test_class_report)

# Show feature importances
importances = rf_classifier.feature_importances_
feature_importance_dict = dict(zip(X_risk.columns, importances))

print("Feature Importances:", feature_importance_dict)

Validation Accuracy: 0.8807
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.87      0.87      1127
           1       0.90      0.89      0.90      1514
           2       0.85      0.88      0.87       359

    accuracy                           0.88      3000
   macro avg       0.87      0.88      0.88      3000
weighted avg       0.88      0.88      0.88      3000

Test Accuracy: 0.8793
Test Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      1107
           1       0.90      0.89      0.89      1496
           2       0.90      0.88      0.89       397

    accuracy                           0.88      3000
   macro avg       0.88      0.88      0.88      3000
weighted avg       0.88      0.88      0.88      3000

Feature Importances: {'Gender': 0.03559045576780508, 'Age': 0.1956302418289466, 'Hairline Pattern': 0.021359144237299916, 'Hai

In [80]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np


# Prepare Features and Target
X_age = main_data.drop(['Age'], axis=1)  # Features (assuming 'Age_Of_Onset' is your target)
y_age = main_data['Age']  # Target variable for age prediction



# First split: 80% train, 20% temp (which will be split into validation and test)
X_train_age, X_temp_age, y_train_age, y_temp_age = train_test_split(X_age, y_age, test_size=0.2, random_state=42)

# Second split: 50% validation, 50% test from the temp set
X_val_age, X_test_age, y_val_age, y_test_age = train_test_split(X_temp_age, y_temp_age, test_size=0.5, random_state=42)

# Initialize the Random Forest Regressor
rf_regressor = RandomForestRegressor(
    
)


# Train the regressor
rf_regressor.fit(X_train_age, y_train_age)

# Predict on the validation set
y_val_pred_age = rf_regressor.predict(X_val_age)

# Evaluate the model on the validation set
val_mse = mean_squared_error(y_val, y_val_pred)
val_rmse = np.sqrt(val_mse)
val_r2 = r2_score(y_val, y_val_pred)

# Display the results for the validation set
print("Validation Set Metrics:")
print(f"Mean Squared Error: {val_mse:.2f}")
print(f"Root Mean Squared Error: {val_rmse:.2f}")
print(f"R² Score: {val_r2:.4f}")

# Calculate Mean Absolute Error



error = mean_absolute_error(y_val_age, y_val_pred_age)
print(f"Mean Absolute Error: {error}")

 
# Predict on the test set
y_test_pred_age= rf_regressor.predict(X_test_age)

# Evaluate the model on the test set
test_mse = mean_squared_error(y_test_age, y_test_pred_age)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test_age, y_test_pred_age)
test_mae = mean_absolute_error(y_test_age, y_test_pred_age)

# Display the results for the test set
print("\nTest Set Metrics:")
print(f"Mean Squared Error: {test_mse:.2f}")
print(f"Root Mean Squared Error: {test_rmse:.2f}")
print(f"R² Score: {test_r2:.4f}")
print(f"Mean Absolute Error: {test_mae:.2f}")

# Show feature importances
importances = rf_regressor.feature_importances_
feature_importance_dict = dict(zip(X_age.columns, importances))

# Sort and display feature importances
sorted_importances = dict(sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True))
print("\nFeature Importances:")
for feature, importance in sorted_importances.items():
    print(f"{feature}: {importance:.4f}")

Validation Set Metrics:
Mean Squared Error: 0.16
Root Mean Squared Error: 0.40
R² Score: 0.6246
Mean Absolute Error: 4.084699210535965

Test Set Metrics:
Mean Squared Error: 51.12
Root Mean Squared Error: 7.15
R² Score: 0.6211
Mean Absolute Error: 4.13

Feature Importances:
Hair Fall Rate: 0.3791
Nutrition: 0.1981
Hairline Pattern: 0.1028
Gender: 0.0681
Label: 0.0447
Stress: 0.0436
Chemical Product Usage: 0.0428
Water Quality Issue: 0.0292
Genetics: 0.0261
Hair Care Habits: 0.0148
Smoking: 0.0139
Hormonal Changes: 0.0113
Food Habit: 0.0095
Sleep Disturbance: 0.0091
Past Chronic Illness: 0.0068


In [81]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
import numpy as np

# Define parameter grid with corrected max_features values
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]  # Removed 'auto' as it's no longer supported
}

# Initialize base Random Forest model
rf_base = RandomForestRegressor(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
    error_score='raise'  # This will help identify any other potential issues
)

# Fit GridSearchCV using training data
grid_search.fit(X_train_age, y_train_age)

# Get best model
best_model = grid_search.best_estimator_

# Evaluate on test set
y_test_pred = best_model.predict(X_test_age)

# Calculate metrics
test_mse = mean_squared_error(y_test_age, y_test_pred_age)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test_age, y_test_pred_age)
test_mae = mean_absolute_error(y_test_age, y_test_pred_age)

# Print best parameters
print("\nBest Parameters:")
for param, value in grid_search.best_params_.items():
    print(f"{param}: {value}")

# Print test set metrics
print("\nTest Set Metrics:")
print(f"Mean Squared Error: {test_mse:.2f}")
print(f"Root Mean Squared Error: {test_rmse:.2f}")
print(f"R² Score: {test_r2:.4f}")
print(f"Mean Absolute Error: {test_mae:.2f}")

# Get and print feature importances
importances = best_model.feature_importances_
feature_importance_dict = dict(zip(X_test_age.columns, importances))
sorted_importances = dict(sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True))

print("\nFeature Importances:")
for feature, importance in sorted_importances.items():
    print(f"{feature}: {importance:.4f}")

Fitting 5 folds for each of 675 candidates, totalling 3375 fits

Best Parameters:
max_depth: 30
max_features: None
min_samples_leaf: 1
min_samples_split: 5
n_estimators: 500

Test Set Metrics:
Mean Squared Error: 51.12
Root Mean Squared Error: 7.15
R² Score: 0.6211
Mean Absolute Error: 4.13

Feature Importances:
Hair Fall Rate: 0.3815
Nutrition: 0.1990
Hairline Pattern: 0.1043
Gender: 0.0660
Label: 0.0455
Stress: 0.0423
Chemical Product Usage: 0.0419
Water Quality Issue: 0.0301
Genetics: 0.0259
Hair Care Habits: 0.0144
Smoking: 0.0132
Hormonal Changes: 0.0104
Food Habit: 0.0096
Sleep Disturbance: 0.0094
Past Chronic Illness: 0.0065


In [85]:

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define the parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Initialize GridSearchCV
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),  # Base classifier
    param_grid=param_grid_rf,
    cv=3,  # 3-fold cross-validation on the training set
    scoring='accuracy',  # Metric to optimize
    n_jobs=-1  # Use all available CPU cores
)

# Fit GridSearchCV on the resampled training data
grid_search_rf.fit(X_train_resampled, y_train_resampled)

# Print the best parameters and best score (from cross-validation on the training set)
print("Best Parameters for Random Forest:", grid_search_rf.best_params_)
print("Best Cross-Validation Accuracy (on Training Set):", grid_search_rf.best_score_)

# Get the best model from GridSearchCV
best_rf_model = grid_search_rf.best_estimator_

# Evaluate the best model on the validation set
y_val_pred = best_rf_model.predict(X_val)
print("Validation Set Results:")
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("Classification Report:\n", classification_report(y_val, y_val_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))

# Evaluate the best model on the test set
y_test_pred = best_rf_model.predict(X_test)
print("Test Set Results:")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("Classification Report:\n", classification_report(y_test, y_test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

Best Parameters for Random Forest: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best Cross-Validation Accuracy (on Training Set): 0.8961578400830738
Validation Set Results:
Accuracy: 0.887
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.87      0.87      1127
           1       0.90      0.90      0.90      1514
           2       0.88      0.88      0.88       359

    accuracy                           0.89      3000
   macro avg       0.88      0.88      0.88      3000
weighted avg       0.89      0.89      0.89      3000

Confusion Matrix:
 [[ 982  124   21]
 [ 126 1364   24]
 [  19   25  315]]
Test Set Results:
Accuracy: 0.8816666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.87      0.86      1107
           1       0.89      0.89      0.89      1496
           2       0.93      0.

In [86]:

def get_user_input():
    user_input = {}
    
    print("Please enter the following details:")

    user_input['Gender'] = int(input("Gender (0: Female, 1: Male, 2: Other): "))
    user_input['Age'] = int(input("Current Age (e.g., 22, 23, etc.): "))
    user_input['Hairline Pattern'] = int(input("Hairline Pattern (0: Normal, 1: Receding, 2, 3, 4, 5: Other variations): "))
    user_input['Hair Fall Rate'] = float(input("Hair Fall Rate (e.g., 50 strands per day): "))
    user_input['Nutrition'] = float(input("Nutrition Level (1-10, 10 being excellent): "))
    user_input['Chemical Product Usage'] = int(input("Chemical Product Usage (0: No, 1, 2, 3: Varying levels of use): "))
    user_input['Genetics'] = int(input("Family History of Hair Loss (0: No, 1: Yes, 2: yes ): "))
    user_input['Past Chronic Illness'] = int(input("Past Chronic Illness (0: No, 1: Yes, 2: Severe): "))
    user_input['Sleep Disturbance'] = int(input("Sleep Disturbance (0: No, 1: Yes, 2: Severe issues): "))
    user_input['Water Quality Issue'] = int(input("Poor Water Quality (0: No, 1: Yes, 2: Very bad): "))
    user_input['Stress'] = int(input("Stress Levels (0: Low, 1: Medium, 2: High, 3: Extreme): "))
    user_input['Food Habit'] = int(input("Food Habit (0: Healthy, 1: Unhealthy, 2: Junk food, 3: Worst diet): "))
    user_input['Hormonal Changes'] = int(input("Hormonal Imbalance (0: No, 1: Yes): "))
    user_input['Hair Care Habits'] = int(input("Poor Hair Care Habits (0: No, 1: Yes): "))
    user_input['Smoking'] = int(input("Smoking (0: No, 1: Yes): "))

    return user_input


def adjust_age_based_on_features(user_input, predicted_age):
    # Adjusting the predicted age based on the high-importance features
    weight_factors = {
        "Chemical Product Usage": 1.4,  # This could be a factor to increase the age slightly
        "Genetics": 1.3,  # Higher weight for family history
        "Past Chronic Illness": 1.1,  # Slight adjustment if there's a history of illness
        "Sleep Disturbance": 1.2,  # Sleep disturbance increases the risk slightly
        "Water Quality Issue": 1.1,  # Water quality issue could also slightly increase age prediction
        "Stress": 1.2,  # Stress can accelerate hair fall, so adjust accordingly
        "Food Habit": 1.3  # Poor food habits could accelerate hair loss
    }
    
    adjusted_age = predicted_age
    for feature, weight in weight_factors.items():
        if user_input[feature] == 2:  # Higher value means worse factor (e.g., 2 for high usage or poor quality)
            adjusted_age += 1  # You can modify this factor based on how much influence you want to give each feature
    
    return adjusted_age

def predict_hair_fall_risk(model, scaler, X_train_resampled, y_train_resampled):
    # Get user input details
    user_input = get_user_input()  # Assume a function to collect user input as before

    # Define the feature order
    feature_order = ['Gender', 'Age', 'Hairline Pattern', 'Hair Fall Rate', 'Nutrition',
                     'Chemical Product Usage', 'Genetics', 'Past Chronic Illness',
                     'Sleep Disturbance', 'Water Quality Issue', 'Stress', 'Food Habit',
                     'Hormonal Changes', 'Hair Care Habits', 'Smoking']
    
    # Prepare the user input as a feature array
    user_data = np.array([[user_input[feature] for feature in feature_order]])
    
    # Scale user input if a scaler is provided
    if scaler:
        user_data = scaler.transform(user_data)

    # Make prediction for risk level
    prediction = model.predict(user_data)[0]
    
    # Map the prediction to risk level (0 = Low, 1 = Medium, 2 = High)
    risk_mapping = {0: "Low Risk", 1: "Medium Risk", 2: "High Risk"}
    risk_level = risk_mapping.get(prediction, "Unknown")

    # Now, using the 'Label' column from y_train_resampled to predict the age when hair loss might start
    user_age = user_input["Age"]
    
    if risk_level in ["High Risk", "Medium Risk"]:
        # Filter the dataset based on the predicted risk level (using y_train_resampled and X_train_resampled)
        risk_ages = X_train_resampled[y_train_resampled == prediction]['Age']
        
        # Calculate the average age for people in that risk category
        avg_age = risk_ages.mean()  # Average age of people who experienced the same risk level

        # For Medium Risk, calculate average first, then add 5 years
        if risk_level == "Medium Risk":
            predicted_age = (avg_age + user_age) / 2  # First calculate average
            predicted_age += 5  # Then add 5 years to the predicted age

        # For High Risk, calculate without any additional years
        elif risk_level == "High Risk":
            predicted_age = (avg_age + user_age) / 2

        # Adjust age based on important features
        adjusted_age = adjust_age_based_on_features(user_input, predicted_age)

        print(f"\nPredicted Hair Fall Risk: {risk_level}")
        print(f"Estimated Age When Hair Fall Might Start: {adjusted_age:.2f}")
        years = int(adjusted_age)
        months = round((adjusted_age - years) * 12)
        print(f"Estimated Age When Hair Fall Might Start: {years} years and {months} months")
    
    else:
        # For low risk, no prediction is required.
        print(f"\nPredicted Hair Fall Risk: {risk_level}")
        print("No significant risk predicted for hair fall. However, it's important to care for your hair health to avoid future problems.")
    return risk_level
        

In [87]:
def predict_hair_fall_risk_age(risk_model, scaler, X_train_resampled, y_train_resampled, best_model):
    # Get user input details
    user_input = get_user_input()

    # Define the feature order for risk prediction (including Age)
    feature_order_risk = ['Gender', 'Age', 'Hairline Pattern', 'Hair Fall Rate', 'Nutrition',
                          'Chemical Product Usage', 'Genetics', 'Past Chronic Illness',
                          'Sleep Disturbance', 'Water Quality Issue', 'Stress', 'Food Habit',
                          'Hormonal Changes', 'Hair Care Habits', 'Smoking']
    
    # Create DataFrame for risk prediction
    user_data_risk = pd.DataFrame([user_input], columns=feature_order_risk)
    
    # Make prediction for risk level
    risk_prediction = risk_model.predict(user_data_risk)[0]
    
    # Map the prediction to risk level (0 = Low, 1 = Medium, 2 = High)
    risk_mapping = {0: "Low Risk", 1: "Medium Risk", 2: "High Risk"}
    risk_level = risk_mapping.get(risk_prediction, "Unknown")
    
    if risk_level in ["High Risk", "Medium Risk"]:
        # Prepare input data for age prediction
        # Drop 'Age' (target variable) and add predicted 'Risk' as a feature
        age_features = user_data_risk.drop('Age', axis=1)
        age_features['Label'] = risk_prediction  # Add predicted risk as a feature
        
        # Ensure the feature order matches what was used during training of best_model
        if hasattr(best_model, 'feature_names_in_'):
            age_features = age_features[best_model.feature_names_in_]
        
        # Predict age using the best_model
        predicted_age = best_model.predict(age_features)[0]
        
        # Get feature importance for age prediction
        if hasattr(best_model, 'feature_importances_'):
            feature_importance = dict(zip(age_features.columns, best_model.feature_importances_))
            top_factors = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:5]
        else:
            top_factors = []
        
        # Print results
        print(f"\nPredicted Hair Fall Risk: {risk_level}")
        print(f"Estimated Age of Hair Loss Onset: {predicted_age:.1f} years")
        print("\nTop factors influencing the prediction:")
        for factor, importance in top_factors:
            print(f"- {factor}: {importance:.3f}")
    
    else:
        # For low risk, no prediction is required.
        print(f"\nPredicted Hair Fall Risk: {risk_level}")
        print("No significant risk predicted for hair fall. However, it's important to care for your hair health to avoid future problems.")
    
    return risk_level

# Product Recommendation

In [88]:
main_product = pd.read_csv('main_product.csv')

In [None]:


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Define risk-level specific keywords


low_risk_keywords = [
    "shine enhancement", "scalp nourishment", "vitamin-rich",
    "gentle cleansing", "moisturizing", "uv protection", "split end repair", 
    "natural oils", "hydration", "softening", "frizz control", "shine & gloss", 
    "detangling", "smoothness", "manageability", "silky-smooth", 
    "non-sticky formula", "humidity control", "lightweight", "heat protection", 
    "color protection", "daily care", "split ends prevention", "anti-frizz", 
    "straightening & smoothening", "conditioning", "nourishment", "volumizer", 
    "styling"
]


medium_risk_keywords = [
    "thickening", "volume boost", "strengthening", "follicle stimulation", "anti-breakage", "scalp revitalization",
    "keratin repair", "protein treatment", "hair growth support", "reducing hair fall", "damage repair",
    "nourishment", "conditioning", "anti-frizz", "scalp health", "hair elasticity", "hair strength", "hair repair",
    "anti-hair fall",
    "growth stimulating"
]

high_risk_keywords = [
    "follicle regeneration", "hair growth", "hair fall prevention", "nourishing repair", "damaged hair repair",
    "restorative", "regrowth", "scalp repair", "hair loss", "thinning hair", "hair restoration",
    "hair rejuvenation", 
    "intensive care", 
      "split-ends",  
      "anti-hair fall",
     "growth stimulating",
    "hair revival", "regenerating serum", "hair reactivation" 
]

low_risk_keywords = list(set(low_risk_keywords))
medium_risk_keywords = list(set(medium_risk_keywords))
high_risk_keywords = list(set(high_risk_keywords))
1

def recommend_product(main_product):
    # Get user risk level (using the predict_hair_fall_risk function)
    risk_level = predict_hair_fall_risk(best_rf_model, None, X_train_resampled, y_train_resampled)  # Modify as needed to return the risk level directly

    # Preprocess product details (cleaning, tokenization, vectorization)
    product_details = main_product['Details']  # Assume this is the 'Details' column from your product dataset
    vectorizer = TfidfVectorizer(stop_words='english')
    product_vectors = vectorizer.fit_transform(product_details)

    # Select keywords based on risk level
    if risk_level == "High Risk":
        keywords = high_risk_keywords
    elif risk_level == "Medium Risk":
        keywords = medium_risk_keywords
    else:
        keywords = low_risk_keywords

    # Vectorize the user's condition based on the selected keywords
    user_condition_vector = vectorizer.transform(keywords)

    # Compute the cosine similarity between the user's keywords and product details
    similarity_scores = cosine_similarity(user_condition_vector, product_vectors)

    # Get the indices of the top 5 products based on the highest similarity scores
    avg_similarity_scores = similarity_scores.mean(axis=0)  # Calculate the average similarity for each product
    top_5_indices = np.argsort(avg_similarity_scores)[::-1][:5]  # Sort and get the top 5 products

    # Print details of the top 5 recommended products
    print(f"Top 5 Recommended Products for Your Risk Level: {risk_level}\n")
    
    for i, idx in enumerate(top_5_indices):
        recommended_product = main_product.iloc[idx]
        print(f"Rank {i+1}: {recommended_product['ProductsName']}")
        print(f"Cost: {recommended_product['Product Cost']}")
        print(f"Product Feedback: {recommended_product['Feedbacks']}")
        print('-' * 80)
    return top_5_indices



# Content Based

In [95]:
from sklearn.metrics.pairwise import cosine_similarity

def get_content_recommendations(selected_product_name, product_data, n_recommendations=5):
    """
    Provides collaborative filtering based recommendations after user selects a product.
    
    Parameters:
    selected_product_name: Name of the product selected by user from initial recommendations
    product_data: DataFrame with columns [ProductsName, Product Cost, Feedbacks, Details]
    n_recommendations: Number of recommendations to return
    
    Returns:
    DataFrame containing recommended products
    """
    # Create item features matrix using Details column
    vectorizer = TfidfVectorizer(stop_words='english')
    item_features = vectorizer.fit_transform(product_data['Details'])
    
    # Calculate item-item similarity matrix
    item_similarity = cosine_similarity(item_features)
    item_similarity_df = pd.DataFrame(
        item_similarity,
        index=product_data['ProductsName'],
        columns=product_data['ProductsName']
    )
    
    # Get similar products
    similar_scores = item_similarity_df[selected_product_name].sort_values(ascending=False)
    similar_products = similar_scores.index[1:n_recommendations+1].tolist()
    
    # Get full details of recommended products
    recommendations = product_data[product_data['ProductsName'].isin(similar_products)].copy()
    
    # Add similarity score to recommendations
    recommendations['SimilarityScore'] = recommendations['ProductsName'].map(similar_scores)
    
    # Sort by similarity score
    recommendations = recommendations.sort_values('SimilarityScore', ascending=False)
    
    return recommendations

def display_collaborative_recommendations(recommendations):
    """
    Displays the collaborative filtering recommendations in a formatted way.
    
    Parameters:
    recommendations: DataFrame containing recommended products
    """
    print("\nBased on your selection, you might also like:\n")
    for idx, row in recommendations.iterrows():
        print(f"Product: {row['ProductsName']}")
        print(f"Cost: {row['Product Cost']}")
        print(f"Product Feedback: {row['Feedbacks']}")
        print(f"Similarity Score: {row['SimilarityScore']:.2f}")
        print('-' * 80)

def complete_recommendation_workflow(product_data):
    """
    Complete workflow combining initial risk-based recommendations and collaborative filtering.
    """
    # Get top 5 recommended product indices
    top_5_indices = recommend_product(product_data)  

    if not isinstance(top_5_indices, (list, np.ndarray)) or len(top_5_indices) == 0:
        print("No recommendations available. Please try again.")
        return
    
    # Get user selection
    while True:
        try:
            selected_rank = int(input(f"\nPlease select a product by entering its rank (1-{min(5, len(top_5_indices))}): "))
            if 1 <= selected_rank <= min(5, len(top_5_indices)):
                break
            print(f"Please enter a number between 1 and {min(5, len(top_5_indices))}.")
        except ValueError:
            print("Please enter a valid number.")

    # Get the selected product's name using `.iloc`
    selected_product_name = product_data.iloc[top_5_indices[selected_rank-1]]['ProductsName']

    # Get and display collaborative recommendations
    collab_recommendations = get_content_recommendations(selected_product_name, product_data)
    display_collaborative_recommendations(collab_recommendations)


# Hybrid 

In [96]:


def get_user_collaborative_recommendations(selected_product_name, product_data, n_recommendations=5):
    """
    Provides user-collaborative filtering recommendations based on product feedback patterns.
    
    Parameters:
    selected_product_name: Name of the product selected by user
    product_data: DataFrame with columns [ProductsName, Product Cost, Feedbacks, Details]
    n_recommendations: Number of recommendations to return
    
    Returns:
    DataFrame containing recommended products
    """
    # Create item-feedback matrix using the Feedbacks column
    vectorizer = TfidfVectorizer(stop_words='english')
    feedback_features = vectorizer.fit_transform(product_data['Feedbacks'].astype(str))
    
    # Calculate item-item similarity based on feedback patterns
    feedback_similarity = cosine_similarity(feedback_features)
    feedback_similarity_df = pd.DataFrame(
        feedback_similarity,
        index=product_data['ProductsName'],
        columns=product_data['ProductsName']
    )
    
    # Get similar products based on feedback patterns
    similar_scores = feedback_similarity_df[selected_product_name].sort_values(ascending=False)
    similar_products = similar_scores.index[1:n_recommendations+1].tolist()
    
    # Get full details of recommended products
    recommendations = product_data[product_data['ProductsName'].isin(similar_products)].copy()
    recommendations['CollaborativeScore'] = recommendations['ProductsName'].map(similar_scores)
    
    return recommendations.sort_values('CollaborativeScore', ascending=False)

def get_hybrid_recommendations(content_recommendations, collab_recommendations, product_data, weight_content=0.5):
    """
    Combines content-based and feedback-based recommendations.
    
    Parameters:
    content_recommendations: DataFrame with content-based recommendations
    collab_recommendations: DataFrame with feedback-based recommendations
    product_data: Original product data
    weight_content: Weight for content-based recommendations (0-1)
    
    Returns:
    DataFrame with hybrid recommendations
    """
    # Normalize similarity scores
    content_recommendations['NormalizedContentScore'] = (
        content_recommendations['SimilarityScore'] - content_recommendations['SimilarityScore'].min()
    ) / (content_recommendations['SimilarityScore'].max() - content_recommendations['SimilarityScore'].min())
    
    collab_recommendations['NormalizedCollabScore'] = (
        collab_recommendations['CollaborativeScore'] - collab_recommendations['CollaborativeScore'].min()
    ) / (collab_recommendations['CollaborativeScore'].max() - collab_recommendations['CollaborativeScore'].min())
    
    # Combine recommendations
    hybrid_recommendations = pd.concat([
        content_recommendations[['ProductsName', 'NormalizedContentScore']],
        collab_recommendations[['ProductsName', 'NormalizedCollabScore']]
    ], axis=0)
    
    # Calculate hybrid score
    hybrid_recommendations['HybridScore'] = hybrid_recommendations['NormalizedContentScore'].fillna(0) * weight_content + \
                                          hybrid_recommendations['NormalizedCollabScore'].fillna(0) * (1 - weight_content)
    
    # Get unique products with highest hybrid scores
    hybrid_recommendations = hybrid_recommendations.groupby('ProductsName')['HybridScore'].max().reset_index()
    hybrid_recommendations = hybrid_recommendations.sort_values('HybridScore', ascending=False)
    
    # Get full product details
    final_recommendations = product_data[product_data['ProductsName'].isin(hybrid_recommendations['ProductsName'])].copy()
    final_recommendations['HybridScore'] = final_recommendations['ProductsName'].map(
        hybrid_recommendations.set_index('ProductsName')['HybridScore']
    )
    
    return final_recommendations.sort_values('HybridScore', ascending=False)

def display_recommendations(recommendations, recommendation_type="Recommendations"):
    """
    Displays recommendations in a formatted way.
    
    Parameters:
    recommendations: DataFrame containing recommended products
    recommendation_type: String indicating the type of recommendations being displayed
    """
    print(f"\n{recommendation_type}:\n")
    for idx, row in recommendations.iterrows():
        print(f"Product: {row['ProductsName']}")
        print(f"Cost: {row['Product Cost']}")
        print(f"Product Feedback: {row['Feedbacks']}")
        score_type = 'HybridScore' if 'HybridScore' in row else \
                    'CollaborativeScore' if 'CollaborativeScore' in row else 'SimilarityScore'
        
        print('-' * 80)

def complete_hybrid_recommendation_workflow(product_data):
    """
    Complete workflow combining content-based, feedback-based, and hybrid recommendations.
    
    Parameters:
    user_input: User's initial input for content-based filtering
    product_data: DataFrame with product information
    """
    # Get initial content-based recommendations
    top_5_indices = recommend_product(product_data)
    
    if not isinstance(top_5_indices, (list, np.ndarray)) or len(top_5_indices) == 0:
        print("No recommendations available. Please try again.")
        return
    
    # Get user selection for content-based recommendations
    while True:
        try:
            selected_rank = int(input(f"\nPlease select a product by entering its rank (1-{min(5, len(top_5_indices))}): "))
            if 1 <= selected_rank <= min(5, len(top_5_indices)):
                break
            print(f"Please enter a number between 1 and {min(5, len(top_5_indices))}.")
        except ValueError:
            print("Please enter a valid number.")
    
    # Get selected product name
    selected_product_name = product_data.iloc[top_5_indices[selected_rank-1]]['ProductsName']
    
    # Get content-based recommendations
    content_recommendations = get_content_recommendations(selected_product_name, product_data)
    display_recommendations(content_recommendations, "Content-based Recommendations")
    
    # Get feedback-based recommendations
    collab_recommendations = get_user_collaborative_recommendations(selected_product_name, product_data)
    display_recommendations(collab_recommendations, "collaborative-based Recommendations")
    
    # Get hybrid recommendations
    hybrid_recommendations = get_hybrid_recommendations(content_recommendations, collab_recommendations, product_data)
    display_recommendations(hybrid_recommendations, "Hybrid Recommendations")


In [97]:
complete_hybrid_recommendation_workflow(main_product)

Please enter the following details:





Predicted Hair Fall Risk: Medium Risk
Estimated Age When Hair Fall Might Start: 33.46
Estimated Age When Hair Fall Might Start: 33 years and 6 months
Top 5 Recommended Products for Your Risk Level: Medium Risk

Rank 1: ENAUNIQ Shikakai With Ritha Shampoo EXTRA Conditioner For Nourishing Soft & Smooth Hair  (1000 ml)
Cost: 400.0
Product Feedback: Excellent :- Very nice product I love it thanks for flipkart 🥰😍🙏 :- Awesome
--------------------------------------------------------------------------------
Rank 2: look hair Onion Oil and Onion Shampoo  (500 ml)
Cost: 638.4000000000001
Product Feedback: Best products thank u flipcart :- Very Good product...Thank You Flipkart Excellent :- Good product
--------------------------------------------------------------------------------
Rank 3: Kamill Herbal Shampoo And Conditioner With Tulsi, Amla & Shikkakai  (500 ml)
Cost: 667.2
Product Feedback: Nice
--------------------------------------------------------------------------------
Rank 4: Khadi M

In [98]:
import joblib
joblib.dump(best_model, 'age_prediction_model_tuned_final.pkl')
joblib.dump(best_model, 'hairfall_risk_model_tuned_final.pkl')

['hairfall_risk_model_tuned_final.pkl']