In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import random

In [2]:
def load_dataset(file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using ``pd.read_csv`` defaults.
    """
    dataset = pd.read_csv(file_path)
    return dataset

In [3]:
def explore_dataset(df):
    """Print a quick overview of ``df``.

    Three reports are written to stdout, in this order: the first rows
    (``df.head()``), summary statistics (``df.describe()``) and the number of
    null values per column. Nothing is returned.
    """
    # Each entry is a zero-argument callable so a report is only computed
    # right before it is printed.
    for summarize in (df.head, df.describe, df.isnull().sum):
        print(summarize())

In [4]:
def replace_missing_values(df):
    """Fill nulls with the column mean (numeric) or the column mode (non-numeric).

    The frame is modified in place and also returned, so both
    ``replace_missing_values(df)`` and ``df = replace_missing_values(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose missing values should be filled.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after filling. The per-column null counts that
        remain are printed before returning.

    Notes
    -----
    A non-numeric column that has no mode (for example, one that is entirely
    null) is left unchanged. Calling ``fillna(None)`` on such a column, as the
    previous implementation did, is rejected by pandas.
    """
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

    non_numeric_cols = df.select_dtypes(exclude='number').columns
    for col in non_numeric_cols:
        # Compute the mode once; it is empty when the column has no non-null values.
        modes = df[col].mode()
        if modes.empty:
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    print(df.isnull().sum())
    return df

In [5]:
def perform_rfm_analysis(df, n_clusters=10):
    """Derive Recency/Frequency/Monetary features and cluster them with KMeans.

    The three features are taken directly from existing columns:

    * ``Recency``   <- ``week_since_last_outstation_checkin``
    * ``Frequency`` <- ``total_likes_on_outstation_checkin_given``
    * ``Monetary``  <- ``Yearly_avg_comment_on_travel_page``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``UserID`` and the three source columns listed above.
    n_clusters : int, default 10
        Number of KMeans clusters.

    Returns
    -------
    pandas.DataFrame
        Columns ``UserID, Cluster, Recency, Frequency, Monetary``, carrying the
        same index as ``df`` so the result can be aligned back onto it.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    feature_cols = ['Recency', 'Frequency', 'Monetary']

    # Build a fresh frame (not a slice of ``df``) so later assignments never
    # touch the caller's data or trigger chained-assignment warnings.
    rfm_df = pd.DataFrame({
        'UserID': df['UserID'],
        'Recency': df['week_since_last_outstation_checkin'],
        'Frequency': df['total_likes_on_outstation_checkin_given'],
        'Monetary': df['Yearly_avg_comment_on_travel_page'],
    })

    # Mean-impute only the features; an identifier must never be averaged.
    rfm_df[feature_cols] = rfm_df[feature_cols].fillna(rfm_df[feature_cols].mean())

    # n_init is pinned explicitly: relying on the library default emits a
    # FutureWarning and yields different clusters once that default changes.
    # NOTE(review): features are clustered unscaled, so the column with the
    # largest magnitude likely dominates the distance - confirm this is intended.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    rfm_df['Cluster'] = kmeans.fit_predict(rfm_df[feature_cols])

    return rfm_df[['UserID', 'Cluster', 'Recency', 'Frequency', 'Monetary']]

In [6]:
def visualize_cluster_distribution(rfm_df):
    """Show a 20-bin histogram of the ``Recency`` column of ``rfm_df``.

    Despite the function name, cluster labels are not used; only the raw
    ``Recency`` values are plotted. The figure is displayed via ``plt.show()``
    and nothing is returned.
    """
    recency_values = rfm_df['Recency']
    plt.hist(recency_values, bins=20)
    plt.xlabel('Recency')
    plt.ylabel('Frequency')  # histogram count axis, not the RFM 'Frequency' feature
    plt.show()

In [7]:
def encode(df):
    """Label-encode every object-dtype column of ``df`` in place.

    Each selected column is replaced by the integer codes produced by a
    ``LabelEncoder`` fitted on that column alone. No correlation heatmap is
    drawn, and the function returns ``None``; the caller's frame is mutated.
    """
    object_columns = df.select_dtypes(include=['object']).columns
    for name in object_columns:
        # A fresh encoder per column: the fitted classes are not kept around.
        df[name] = LabelEncoder().fit_transform(df[name])

In [8]:
def train_random_forest_classifier(df):
    """Fit an impute-then-classify pipeline that predicts ``df['cluster']``.

    Every column except ``cluster`` is a candidate feature, but only numeric
    columns are handed to the preprocessing step (mean imputation). The data
    is split 70/30 with ``random_state=42`` and the pipeline is fitted on the
    70% portion.

    Returns
    -------
    (Pipeline, pandas.DataFrame)
        The fitted pipeline (steps ``'preprocessor'`` and ``'classifier'``) and
        the held-out feature rows. The held-out labels are not returned;
        callers can recover them through ``X_test.index``.

    NOTE(review): when ``cluster`` was derived from columns still present in
    ``df``, those columns are used as features here, which would presumably
    inflate the measured accuracy - confirm against the caller.
    """
    target = df['cluster']
    features = df.drop(columns=['cluster'])

    X_train, X_test, y_train, _ = train_test_split(
        features, target, test_size=0.3, random_state=42
    )

    # Step and transformer names are looked up elsewhere in the notebook, so
    # 'preprocessor', 'num', 'imputer' and 'classifier' must stay as they are.
    mean_imputer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])
    numeric_features = features.select_dtypes(include='number').columns
    preprocessor = ColumnTransformer(
        transformers=[('num', mean_imputer, numeric_features)]
    )
    forest = RandomForestClassifier(n_estimators=100, random_state=42)

    rf_model = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', forest)])
    rf_model.fit(X_train, y_train)

    return rf_model, X_test

In [9]:
def impute_missing_values_and_predict(model, X_missing):
    """Predict clusters for ``X_missing`` using the complete fitted pipeline.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
        Typically the pipeline returned by ``train_random_forest_classifier``.
    X_missing : feature rows in the same layout the model was fitted on
        May contain missing values; the pipeline's own preprocessing imputes them.

    Returns
    -------
    The predictions produced by ``model.predict(X_missing)``.

    Notes
    -----
    The previous implementation pulled the inner imputer out of the pipeline
    and applied it to *all* columns of ``X_missing``, skipping the column
    selection done by the pipeline's preprocessor. That only works when every
    column is numeric. Delegating to ``model.predict`` runs exactly the
    transformations the model was trained with.
    """
    return model.predict(X_missing)

In [10]:
def evaluate_model_accuracy(y_true, y_pred):
    """Print the ``accuracy_score`` of ``y_pred`` measured against ``y_true``.

    The score is only printed, not returned.
    """
    print(f'Model Accuracy on Data with Missing Values: {accuracy_score(y_true, y_pred)}')

In [11]:
def create_package_categories(df):
    """Add a ``package_category`` column to ``df`` based on ``df['cluster']``.

    Clusters 0-2 map to 'Economy', 3-6 to 'Medium' and 7-9 to 'Luxury'; any
    other value is labelled 'Unknown'. ``df`` is modified in place, the
    resulting category counts are printed, and nothing is returned.
    """
    clusters_by_category = {
        'Economy': [0, 1, 2],
        'Medium': [3, 4, 5, 6],
        'Luxury': [7, 8, 9],
    }

    # Start from the fallback label, then overwrite the rows of each known tier.
    df['package_category'] = 'Unknown'
    for label, cluster_ids in clusters_by_category.items():
        in_tier = df['cluster'].isin(cluster_ids)
        df.loc[in_tier, 'package_category'] = label

    print("Package Category Counts:")
    print(df['package_category'].value_counts())

In [12]:
def assign_random_packages(df, economy_packages=None, medium_packages=None, luxury_packages=None):
    """Pick a random package for every row according to its ``package_category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``package_category`` column.
    economy_packages, medium_packages, luxury_packages : sequence of str, optional
        Candidate packages for the 'Economy', 'Medium' and 'Luxury' categories.
        When an argument is ``None`` the built-in default list for that
        category is used. (Previously these arguments were silently
        overwritten by the built-in lists.)

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with a ``selected_package`` column added or
        replaced. Rows whose category is not one of the three known values,
        or whose category has no candidate packages, get ``'Unknown'``.

    Notes
    -----
    Selection uses the module-level ``random`` generator; call
    ``random.seed(...)`` beforehand for reproducible assignments.
    """
    if economy_packages is None:
        economy_packages = ['Staycation', 'Budget Hotels', 'Weekend Getaway']
    if medium_packages is None:
        medium_packages = ['City Tour Package', 'Family Retreat', 'Adventure Package']
    if luxury_packages is None:
        luxury_packages = ['Luxury Spa Retreat', '5-Star Resort Experience', 'Exclusive Cruise']

    packages_by_category = {
        'Economy': economy_packages,
        'Medium': medium_packages,
        'Luxury': luxury_packages,
    }

    def assign_package(category):
        options = packages_by_category.get(category)
        if options is None or len(options) == 0:
            return 'Unknown'
        return random.choice(options)

    # Iterate the category column in row order: one random draw per known row,
    # and an empty frame simply yields an empty column.
    df['selected_package'] = [assign_package(category) for category in df['package_category']]

    return df

In [13]:
# Load dataset
file_path = 'tourism.csv'
df = load_dataset(file_path)

# Explore dataset
explore_dataset(df)

# Replace missing values
df = replace_missing_values(df)

# Perform RFM analysis and attach the cluster label to the main frame
# (aligned on the shared index).
rfm_df = perform_rfm_analysis(df)
df['cluster'] = rfm_df['Cluster']

# Optional: visualize the Recency distribution
# visualize_cluster_distribution(rfm_df)

# Label-encode the object-dtype columns (no correlation plot is produced)
encode(df)

# Train RandomForestClassifier
rf_model, X_test = train_random_forest_classifier(df)

# Impute missing values and predict clusters
y_missing_pred = impute_missing_values_and_predict(rf_model, X_test)

# Evaluate model accuracy against the true cluster labels of the held-out rows
evaluate_model_accuracy(rfm_df.loc[X_test.index, 'Cluster'], y_missing_pred)

# Create package categories
create_package_categories(df)

# Define packages for each category
economy_packages = ['Staycation', 'Budget Hotels', 'Weekend Getaway']
medium_packages = ['City Tour Package', 'Family Retreat', 'Adventure Package']
luxury_packages = ['Luxury Spa Retreat', '5-Star Resort Experience', 'Exclusive Cruise']

# Seed Python's RNG so the random package assignment is identical on every
# Restart & Run All (same seed value as the random_state used by the models).
random.seed(42)

# Assign random packages
df = assign_random_packages(df, economy_packages, medium_packages, luxury_packages)

    UserID Taken_product  Yearly_avg_view_on_travel_page preferred_device  \
0  1000001           Yes                           307.0  iOS and Android   
1  1000002            No                           367.0              iOS   
2  1000003           Yes                           277.0  iOS and Android   
3  1000004            No                           247.0              iOS   
4  1000005            No                           202.0  iOS and Android   

   total_likes_on_outstation_checkin_given yearly_avg_Outstation_checkins  \
0                                  38570.0                              1   
1                                   9765.0                              1   
2                                  48055.0                              1   
3                                  48720.0                              1   
4                                  20685.0                              1   

  member_in_family preferred_location_type  Yearly_avg_comment_on_travel_p

  super()._check_params_vs_input(X, default_n_init=10)


Model Accuracy on Data with Missing Values: 0.9988662131519275
Package Category Counts:
package_category
Medium     3981
Economy    3953
Luxury     3826
Name: count, dtype: int64


In [14]:
# Display the first rows of the resulting DataFrame, which at this point also
# carries the cluster, package_category and selected_package columns.
df.head()

Unnamed: 0,UserID,Taken_product,Yearly_avg_view_on_travel_page,preferred_device,total_likes_on_outstation_checkin_given,yearly_avg_Outstation_checkins,member_in_family,preferred_location_type,Yearly_avg_comment_on_travel_page,total_likes_on_outofstation_checkin_received,week_since_last_outstation_checkin,following_company_page,montly_avg_comment_on_company_page,working_flag,travelling_network_rating,Adult_flag,Daily_Avg_mins_spend_on_traveling_page,cluster,package_category,selected_package
0,1000001,1,307.0,9,38570.0,1,2,3,94.0,5993,8,3,11,1,1,0.0,8.0,2,Economy,Weekend Getaway
1,1000002,0,367.0,8,9765.0,1,0,3,61.0,5130,1,2,23,2,4,1.0,10.0,1,Economy,Budget Hotels
2,1000003,1,277.0,9,48055.0,1,2,10,92.0,2090,6,3,15,1,2,0.0,7.0,4,Medium,City Tour Package
3,1000004,0,247.0,8,48720.0,1,4,3,56.0,2909,1,3,11,1,3,0.0,8.0,4,Medium,Adventure Package
4,1000005,0,202.0,9,20685.0,1,0,7,40.0,3468,9,2,12,1,4,1.0,6.0,3,Medium,Family Retreat


In [15]:
# Show the package assigned to each row.
df['selected_package']

0                 Weekend Getaway
1                   Budget Hotels
2               City Tour Package
3               Adventure Package
4                  Family Retreat
                   ...           
11755             Weekend Getaway
11756              Family Retreat
11757    5-Star Resort Experience
11758              Family Retreat
11759           City Tour Package
Name: selected_package, Length: 11760, dtype: object

In [16]:
# Count null entries in selected_package.
# NOTE(review): rows without a recognised category receive the string
# 'Unknown' rather than a null, so this count does not reveal unassigned rows.
df['selected_package'].isnull().sum()

0