In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def preprocess_data(df):
    """Derive the text feature and parsed month lists used for recommendations.

    The input frame is copied first, so the caller's object is not modified.

    Args:
        df: Frame containing the columns "About the city (long Description)",
            "Best Time to visit", "Best Months" (comma-separated month
            numbers) and "City".

    Returns:
        A ``(descriptions, cities, frame)`` tuple: the "Description" and
        "City" columns of the cleaned frame, followed by the cleaned frame
        itself. The cleaned frame carries a fresh 0..n-1 index.
    """
    df = df.copy()

    # Missing text is treated as empty so the concatenation never yields NaN.
    df["Description"] = (
        df["About the city (long Description)"].fillna("")
        + " "
        + df["Best Time to visit"].fillna("")
    )

    def parse_months(raw):
        # Strip each piece so "1, 2,3" parses like "1,2,3"; isdecimal() only
        # admits characters that int() can actually convert.
        pieces = (piece.strip() for piece in str(raw).split(","))
        return [int(piece) for piece in pieces if piece.isdecimal()]

    df["Best Months"] = df["Best Months"].fillna("").apply(parse_months)

    # Remove cities that appear only once to prevent stratification errors.
    city_counts = df["City"].value_counts()
    repeated_cities = city_counts[city_counts > 1].index

    # Re-index after filtering: positional features built later from these
    # rows would otherwise misalign against the gaps left by dropped rows.
    df = df[df["City"].isin(repeated_cities)].reset_index(drop=True)

    return df["Description"], df["City"], df

def balance_data(X, y, random_state=42):
    """Equalise class frequencies by randomly duplicating minority-class rows.

    SMOTE builds synthetic samples by interpolating between numeric feature
    vectors, so it cannot be applied to the raw text passed in here. Plain
    random oversampling gives the same class balance while keeping every
    sample a real observation.

    Args:
        X: Samples as a Series, list or DataFrame. A single-column input is
            returned as a Series.
        y: Class labels, one per row of ``X``.
        random_state: Seed for the duplicate selection; ``None`` draws a
            different selection on every call.

    Returns:
        ``(X_resampled, y_resampled)`` with a fresh 0..n-1 index. The original
        rows come first, in order, followed by the duplicated rows.

    Raises:
        ValueError: If ``X`` and ``y`` have different lengths.
    """
    features = pd.DataFrame(X).squeeze(axis="columns").reset_index(drop=True)
    labels = pd.Series(y).reset_index(drop=True)
    if len(features) != len(labels):
        raise ValueError(
            f"X and y must have the same length, got {len(features)} and {len(labels)}"
        )
    if labels.empty:
        return features, labels

    rng = np.random.default_rng(random_state)

    # Row positions per class, in order of first appearance.
    positions_by_class = labels.groupby(labels, sort=False).indices
    target = max(len(positions) for positions in positions_by_class.values())

    picks = [np.arange(len(labels))]
    for positions in positions_by_class.values():
        shortfall = target - len(positions)
        if shortfall:
            picks.append(rng.choice(positions, size=shortfall, replace=True))
    order = np.concatenate(picks)

    return (
        features.iloc[order].reset_index(drop=True),
        labels.iloc[order].reset_index(drop=True),
    )

def _month_tokens(indicator_row, month_classes):
    """Render one row of the month indicator matrix as ``month_<n>`` words.

    TfidfVectorizer's default token pattern only keeps tokens of two or more
    word characters, so a plain "0 1 0 ..." indicator string would be dropped
    entirely. Named tokens survive tokenisation and identify the month.
    """
    return " ".join(
        f"month_{month}"
        for month, flag in zip(month_classes, indicator_row)
        if flag
    )


def train_models(descriptions, cities, best_months):
    """Fit Naive Bayes, decision-tree and voting models that map text to a city.

    Prints a classification report for each model on a held-out 20% split.

    Args:
        descriptions: Series of free-text descriptions, one per row.
        cities: City labels aligned with ``descriptions``.
        best_months: One list of month numbers per row, in the same order as
            ``descriptions``.

    Returns:
        ``(nb_model, dt_model, ensemble_model, mlb)`` where the first three
        are fitted estimators exposing ``predict`` on raw text and ``mlb`` is
        the fitted ``MultiLabelBinarizer`` used to encode months.
    """
    mlb = MultiLabelBinarizer()
    month_matrix = mlb.fit_transform(best_months)

    # Reuse the descriptions' index so the element-wise concatenation below
    # pairs each description with its own months even when the index has gaps.
    month_text = pd.Series(
        [_month_tokens(row, mlb.classes_) for row in month_matrix],
        index=descriptions.index,
    )
    combined_features = descriptions + " " + month_text

    X_train, X_test, y_train, y_test = train_test_split(
        combined_features, cities, test_size=0.2, random_state=42, stratify=cities
    )
    X_train, y_train = balance_data(X_train, y_train)

    def make_vectorizer():
        # A fresh vectorizer per pipeline: sharing one instance would let one
        # pipeline's fit overwrite the vocabulary the other relies on.
        return TfidfVectorizer(
            stop_words='english', max_features=10000, ngram_range=(1, 2)
        )

    nb_model = make_pipeline(make_vectorizer(), MultinomialNB(alpha=0.1))

    # The search wraps the whole pipeline, so the step-prefixed parameter
    # names resolve and TF-IDF is refitted inside every CV fold.
    param_grid = {
        'decisiontreeclassifier__max_depth': [10, 15, 20],
        'decisiontreeclassifier__min_samples_split': [2, 5, 10],
    }
    dt_search = GridSearchCV(
        make_pipeline(make_vectorizer(), DecisionTreeClassifier(random_state=42)),
        param_grid,
        cv=5,
        scoring='accuracy',
    )

    nb_model.fit(X_train, y_train)
    dt_search.fit(X_train, y_train)
    dt_model = dt_search.best_estimator_

    # Voting over the already-tuned tree pipeline avoids repeating the grid
    # search when the ensemble refits its members.
    ensemble_model = VotingClassifier(
        estimators=[('nb', nb_model), ('dt', dt_model)], voting='hard'
    )
    ensemble_model.fit(X_train, y_train)

    # zero_division=0 reports 0 for classes with no predictions, without
    # emitting a warning for each of them.
    print("\nNaïve Bayes Classification Report:")
    print(classification_report(y_test, nb_model.predict(X_test), zero_division=0))

    print("\nDecision Tree Classification Report:")
    print(classification_report(y_test, dt_model.predict(X_test), zero_division=0))

    print("\nEnsemble Model Classification Report:")
    print(classification_report(y_test, ensemble_model.predict(X_test), zero_division=0))

    return nb_model, dt_model, ensemble_model, mlb


def recommend_city(nb_model, dt_model, ensemble_model, mlb, df):
    """Prompt for a description and travel month, then print matching cities.

    Args:
        nb_model, dt_model, ensemble_model: Fitted estimators whose
            ``predict`` accepts a list of raw text strings.
        mlb: Fitted ``MultiLabelBinarizer`` used to encode the travel month.
        df: Frame with "City", "Rating", "Best Time to visit" and a
            "Best Months" column holding lists of month numbers.

    Returns:
        None. Results are written to standard output.
    """
    description = input("Enter a description of the city you're looking for: ").strip()
    if not description:
        print("Invalid input. Please enter a valid description.")
        return

    month_answer = input("Enter the month of travel (1-12, or 0 to ignore): ").strip()
    try:
        month = int(month_answer or 0)
    except ValueError:
        month = None
    if month is None or not 0 <= month <= 12:
        # Say so instead of silently dropping an answer that cannot be used.
        print("Month not recognised; ignoring month preference.")
        month = 0
    month_given = month != 0

    # Months the binarizer never saw carry no signal, and transform() would
    # only warn about them, so they contribute no token.
    month_feature = ""
    if month_given and month in mlb.classes_:
        month_feature = _month_tokens(mlb.transform([[month]])[0], mlb.classes_)
    input_text = f"{description} {month_feature}".strip()

    nb_pred = nb_model.predict([input_text])[0]
    dt_pred = dt_model.predict([input_text])[0]
    ensemble_pred = ensemble_model.predict([input_text])[0]

    print("\nPredicted Cities:")
    print(f"Naïve Bayes: {nb_pred}")
    print(f"Decision Tree: {dt_pred}")
    print(f"Ensemble Model: {ensemble_pred}")

    # Get all matching cities from the dataset
    predicted_cities = {nb_pred, dt_pred, ensemble_pred}
    recommendations = df[df["City"].isin(predicted_cities)].copy()

    # Filter by month if provided
    if month_given:
        # astype(bool) keeps this a row mask even when the frame is empty.
        in_season = recommendations["Best Months"].apply(
            lambda months: month in months
        ).astype(bool)
        recommendations = recommendations[in_season]

    # Sort by rating (descending)
    recommendations = recommendations.sort_values(by="Rating", ascending=False)

    if not recommendations.empty:
        print("\nRecommended Cities:")
        print(recommendations[["City", "Rating", "Best Time to visit"]])
    else:
        print("\nNo matching cities found for the given description and month.")

# Read the Holidify export and derive the modelling columns in one step.
file_path = "holidify.csv"
descriptions, cities, df = preprocess_data(pd.read_csv(file_path))

# Fit the classifiers, then run a single interactive recommendation round.
nb_model, dt_model, ensemble_model, mlb = train_models(
    descriptions=descriptions,
    cities=cities,
    best_months=df["Best Months"],
)
recommend_city(
    nb_model=nb_model,
    dt_model=dt_model,
    ensemble_model=ensemble_model,
    mlb=mlb,
    df=df,
)

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/holidify.csv'