# 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")


## Load & Preview Data



In [None]:
# Load both input tables first, then preview the primary one.
# TODO: both paths are the same placeholder; point each at its real CSV file.
df = pd.read_csv('your_dataset.csv')
df_other = pd.read_csv('your_dataset.csv')

# A notebook cell only renders its last bare expression, so each preview is
# printed explicitly; otherwise head()/describe()/isna() results are discarded.
print(df.head())
df.info()  # info() writes its summary to stdout itself and returns None
print(df.describe())
print(df.isna().sum().sort_values(ascending=False))


FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'

## Merging

In [None]:
# Join `df_other` onto `df`, matching `id_guest_anon` (left) to `id_user_anon` (right).
merge_options = {
    'left_on': 'id_guest_anon',
    'right_on': 'id_user_anon',
    'how': 'left',  # or 'inner' if you want only matched rows
    'suffixes': ('', '_user'),  # To avoid name clashes
}
data = pd.merge(df, df_other, **merge_options)

print(data.head())

## Feature engineering

In [None]:
# Cleaning step: parse the timestamp columns, then keep only rows that have a
# valid stay window and all three identifiers.
# (The unfinished `def feature` line that opened this cell was a SyntaxError
# and has been removed so the cell can execute.)
datetime_cols = [
    'ts_interaction_first', 'ts_reply_at_first',
    'ts_accepted_at_first', 'ts_booking_at',
    'ds_checkin_first', 'ds_checkout_first'
]

# Parse first so the check-in/check-out comparison below is done on datetimes;
# values that cannot be parsed become NaT (errors='coerce').
for col in datetime_cols:
    data[col] = pd.to_datetime(data[col], errors='coerce')

id_cols = ['id_guest_anon', 'id_listing_anon', 'id_host_anon']
has_all_ids = data[id_cols].notna().all(axis=1)
# Comparisons involving NaT evaluate to False, so rows with missing dates are dropped too.
valid_stay = data['ds_checkin_first'] < data['ds_checkout_first']

# Boolean indexing removes the failing rows. DataFrame.where would instead keep
# them with every value replaced by NaN, which pollutes later features.
data = data.loc[valid_stay & has_all_ids].reset_index(drop=True)

print(data.head())

In [None]:
# Derived features computed from the parsed timestamp columns.

# Seconds between the first interaction and the first reply.
data['response_time'] = (
    data['ts_reply_at_first'] - data['ts_interaction_first']
).dt.total_seconds()

# 1 when a booking timestamp is present, 0 otherwise.
data['was_booked'] = data['ts_booking_at'].notna().astype(int)

# Whole days between check-in and check-out.
data['stay_length'] = (
    data['ds_checkout_first'] - data['ds_checkin_first']
).dt.days

# The `.cat` accessor only exists on category dtype; nothing in this notebook
# casts `job_type`, so cast explicitly (a no-op if it is already categorical).
# Missing values are encoded as -1 by pandas.
data['job_type_encoded'] = data['job_type'].astype('category').cat.codes


In [None]:
def plot_violin(data, x_col, y_col, title=None, figsize=(10, 6)):
    """Draw a violin plot of ``y_col`` grouped by ``x_col``.

    The original cell referenced ``figsize``, ``x_col``, ``y_col`` and ``title``
    without defining them; they are now explicit parameters.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns.
    x_col : str
        Column used for the groups on the x-axis.
    y_col : str
        Column whose distribution is drawn for each group.
    title : str, optional
        Figure title. Falls back to ``'Violin Plot of {y_col} by {x_col}'``.
    figsize : tuple, optional
        Figure size passed to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    sns.violinplot(data=data, x=x_col, y=y_col, inner='quartile', palette='muted')
    plt.title(title or f'Violin Plot of {y_col} by {x_col}', fontsize=14)
    plt.xticks(rotation=45)  # rotate the x tick labels by 45 degrees
    plt.tight_layout()
    plt.show()

## Impute, Scale, Transform Data

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Columns routed through the numeric pipeline below.
numerical_features = [
    'Pclass', 'Age', 'FamilySize',
    'IsAlone', 'AgeGroup', 'Fare',
]

# Numeric pipeline: fill missing values with the column mean, then standardise.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# NOTE(review): `preprocessor` is built here but is not used by any cell
# visible in this notebook; confirm whether the pipelines should include it.
preprocessor = ColumnTransformer(
    transformers=[('num', numerical_transformer, numerical_features)]
)

# Standardise three columns of `df` with a separately fitted scaler.
# NOTE(review): the scaler is fitted on the full frame, before the
# train/test split performed in a later cell; possible leakage, confirm.
scaler = StandardScaler()
columns_to_scale = ['FamilySize', 'AgeGroup', 'FareGroup']
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])


## Forecasting

In [None]:
# Separate the target from the predictors without mutating `df`.
# `df.pop('Survived')` removed the column in place, so running the cell a
# second time raised KeyError; selecting and dropping keeps the cell re-runnable.
y = df['Survived']
X = df.drop(columns=['Survived'])

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Univariate feature selection: score features with the ANOVA F-test, keep 10.
# NOTE(review): `X_train_selected` is not consumed by any cell visible in this
# notebook; confirm whether the models should be trained on it.
_selection = SelectKBest(score_func=f_classif, k=10)
X_train_selected = _selection.fit(X_train, y_train).transform(X_train)

# One single-step pipeline per candidate model, keyed by display name.
# The step is always called 'classifier'.
_candidate_models = [
    ('KNN', KNeighborsClassifier()),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
]
pipelines_set = {
    label: Pipeline([('classifier', estimator)])
    for label, estimator in _candidate_models
}

In [None]:


# Columns excluded from modelling; errors='ignore' keeps the cell re-runnable
# when they have already been removed.
columns_to_drop = ['Title', 'FareGroup']
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

# Hyper-parameter search space per model. `param_grids` was referenced below
# but never defined in the notebook (NameError). Keys use the
# '<step>__<param>' form and target the 'classifier' step of each pipeline.
# The values are a modest starting grid; widen or narrow as needed.
param_grids = {
    'KNN': {
        'classifier__n_neighbors': [3, 5, 7, 9, 11],
        'classifier__weights': ['uniform', 'distance'],
    },
    'Naive Bayes': {
        'classifier__var_smoothing': [1e-9, 1e-8, 1e-7],
    },
    'Decision Tree': {
        'classifier__max_depth': [None, 3, 5, 10],
        'classifier__min_samples_split': [2, 5, 10],
    },
}

# Tune every pipeline with 5-fold cross-validated grid search and keep the
# best estimator found for each model.
results = {}
for model_name, pipeline in pipelines_set.items():
    grid_search = GridSearchCV(pipeline, param_grid=param_grids[model_name], cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    results[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best score for {model_name}: {grid_search.best_score_}")

# Score each tuned model on the held-out test set.
# `accuracy_score` comes from sklearn.metrics (imported in the top import cell).
for model_name, model in results.items():
    print(f"\nEvaluating {model_name} on the test set...")
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy of {model_name} on test set: {accuracy:.4f}")



In [None]:
# Report which features the tuned decision-tree pipeline relied on.
best_tree_model = results['Decision Tree']
tree_step = best_tree_model.named_steps['classifier']

# Guard: only proceed when the pipeline's 'classifier' step is a DecisionTreeClassifier.
if isinstance(tree_step, DecisionTreeClassifier):
    feature_importances = tree_step.feature_importances_
    features = X_train.columns

    # One row per feature, most important first.
    feature_importance_df = pd.DataFrame(
        {'Feature': features, 'Importance': feature_importances}
    ).sort_values(by='Importance', ascending=False)
    print(feature_importance_df)

    # Keep the names of features whose importance exceeds 0.01.
    above_threshold = feature_importance_df['Importance'] > 0.01
    important_features = feature_importance_df.loc[above_threshold, 'Feature'].values
    print(f"\nImportant features selected: {important_features}")
