<a href="https://colab.research.google.com/github/NajlaZuhir/CRM_Data/blob/main/AI_Profiling_draft_2_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import random
from datetime import datetime

In [27]:
def load_dataset(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame.

    All parsing options are left at the ``pd.read_csv`` defaults.
    """
    dataset = pd.read_csv(file_path)
    return dataset

In [28]:
def explore_dataset(df):
    """Print a quick overview of ``df``.

    Three reports are written to stdout, in this order:
    the first rows (``head``), the summary statistics (``describe``),
    and the number of null values in each column. Nothing is returned.
    """
    first_rows = df.head()
    print(first_rows)

    summary_statistics = df.describe()
    print(summary_statistics)

    nulls_per_column = df.isnull().sum()
    print(nulls_per_column)

In [29]:
def replace_missing_values(df):
    """Fill nulls with the column mean (numeric) or the column mode (non-numeric).

    The frame is modified in place and the same object is returned, so both
    ``df = replace_missing_values(df)`` and a bare call work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose missing values should be filled.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after filling.

    Notes
    -----
    * When a non-numeric column has several modes, the first one returned by
      ``Series.mode()`` is used.
    * A column that contains no non-null values has no mean/mode to fill with
      and is left untouched; it will still show up in the printed null counts.
    * The remaining null count per column is printed after filling.
    """
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

    non_numeric_cols = df.select_dtypes(exclude='number').columns
    for column in non_numeric_cols:
        modes = df[column].mode()
        # An all-null column has an empty mode. Calling fillna(None) on it
        # raises ValueError, so skip the column instead of crashing.
        if modes.empty:
            continue
        df[column] = df[column].fillna(modes.iloc[0])

    print(df.isnull().sum())
    return df

In [33]:

def calculate_rfm_scores(df, combined_date_column, num_of_visits_column, total_spendings_column,
                         recency_weight=3, frequency_weight=2, monetary_weight=1,
                         reference_date=None):
    """Calculate weighted RFM scores and assign each row to a segment.

    The frame is modified in place (and also returned). The following columns
    are added or overwritten: ``Combined_Date``, ``Recency``, ``Recency_Score``,
    ``Frequency_Score``, ``Monetary_Score``, ``RFM_Score`` and ``cluster``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must not be empty.
    combined_date_column : str
        Column holding the date used for recency (anything ``pd.to_datetime``
        can parse).
    num_of_visits_column : str
        Numeric column used for the frequency score.
    total_spendings_column : str
        Numeric column used for the monetary score.
    recency_weight, frequency_weight, monetary_weight : number
        Multipliers applied to the three 1-5 scores before summing.
    reference_date : datetime-like or str, optional
        The "today" that recency is measured from. Defaults to
        ``datetime.now()``; pass a fixed value to make the ``Recency`` column
        reproducible between runs.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns listed above.

    Raises
    ------
    ValueError
        If any of the three input columns contains missing values. The check
        runs before ``df`` is modified.
    """
    visit_dates = pd.to_datetime(df[combined_date_column])

    # Validate up front: a missing value would otherwise become a NaN score
    # and fail later in ``astype(int)`` with an unrelated-looking message,
    # after the frame had already been partially modified.
    columns_with_gaps = [
        name
        for name, values in (
            (combined_date_column, visit_dates),
            (num_of_visits_column, df[num_of_visits_column]),
            (total_spendings_column, df[total_spendings_column]),
        )
        if values.isnull().any()
    ]
    if columns_with_gaps:
        raise ValueError(
            f"Cannot compute RFM scores: missing values in column(s) {columns_with_gaps}"
        )

    current_date = datetime.now() if reference_date is None else pd.Timestamp(reference_date)

    # Recency: whole days between the reference date and the recorded date.
    df['Combined_Date'] = visit_dates
    df['Recency'] = (current_date - df['Combined_Date']).dt.days

    # Each component is split into 5 equal-width intervals and scored 1-5.
    # Recency labels are reversed: the smallest number of days (most recent)
    # scores 5, the largest scores 1.
    df['Recency_Score'] = pd.cut(df['Recency'], bins=5, labels=[5, 4, 3, 2, 1]).astype(int)
    # Frequency and Monetary: the highest interval scores 5.
    df['Frequency_Score'] = pd.cut(df[num_of_visits_column], bins=5, labels=[1, 2, 3, 4, 5]).astype(int)
    df['Monetary_Score'] = pd.cut(df[total_spendings_column], bins=5, labels=[1, 2, 3, 4, 5]).astype(int)

    # Weighted sum of the three component scores.
    df['RFM_Score'] = (
        recency_weight * df['Recency_Score'] +
        frequency_weight * df['Frequency_Score'] +
        monetary_weight * df['Monetary_Score']
    )

    # Segment edges sit at 20/40/60/80/100 % of the highest score observed in
    # this data (not of the theoretical maximum). Intervals are right-closed.
    # NOTE(review): with the default positive weights every score is > 0, so
    # the open lower edge at 0 excludes nothing; zero or negative weights are
    # not handled here.
    max_rfm_score = df['RFM_Score'].max()
    bins = [0, max_rfm_score * 0.2, max_rfm_score * 0.4, max_rfm_score * 0.6, max_rfm_score * 0.8, max_rfm_score]
    labels = ['Inactive or Lost', 'Churning or At Risk', 'Recent and Infrequent Visitors', 'Frequent Visitors', 'High-Value Tourists']

    df['cluster'] = pd.cut(df['RFM_Score'], bins=bins, labels=labels)

    return df

# Example usage:
# df = calculate_rfm_scores(df, 'Combined_Date', 'Num_of_Visits', 'Total_Spendings', recency_weight=3, frequency_weight=2, monetary_weight=1)


In [31]:
def visualize_cluster_distribution(rfm_df):
    """Show a 20-bin histogram of the 'Recency' column of ``rfm_df``.

    The plot is drawn on the current pyplot figure and displayed with
    ``plt.show()``; nothing is returned.
    """
    recency_values = rfm_df['Recency']
    plt.hist(recency_values, bins=20)
    plt.ylabel('Frequency')
    plt.xlabel('Recency')
    plt.show()

In [32]:
def encode(df):
    """Label-encode every object-dtype column of ``df`` in place.

    Each object column is replaced by the integer codes that a
    ``LabelEncoder`` produces when fitted on that column alone. The encoder
    is refitted for every column and not kept afterwards. Nothing is returned.
    """
    object_columns = df.select_dtypes(include=['object']).columns
    encoder = LabelEncoder()

    for name in object_columns:
        codes = encoder.fit_transform(df[name])
        df[name] = codes

In [34]:
def train_random_forest_classifier(df, target_column='cluster', exclude_columns=(),
                                   test_size=0.3, n_estimators=100, random_state=42):
    """Fit a mean-imputer + RandomForestClassifier pipeline on a train split of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the features and the target column.
    target_column : str, default 'cluster'
        Name of the label column.
    exclude_columns : iterable of str, default ()
        Extra columns to leave out of the feature matrix, e.g. columns the
        label was computed from. With the default, every column except the
        target is passed on as a candidate feature.
    test_size : float, default 0.3
        Fraction of rows held out by ``train_test_split``.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, default 42
        Seed used for both the split and the forest.

    Returns
    -------
    (sklearn.pipeline.Pipeline, pandas.DataFrame)
        The fitted pipeline and the held-out feature rows ``X_test``. The
        held-out labels are not returned; look them up through
        ``X_test.index`` in the original frame.

    Raises
    ------
    KeyError
        If ``target_column`` or any name in ``exclude_columns`` is not a
        column of ``df``.
    """
    X = df.drop(columns=[target_column, *exclude_columns])
    y = df[target_column]

    # Only numeric columns are listed in the ColumnTransformer. Its default
    # ``remainder='drop'`` means all other columns never reach the classifier.
    numeric_cols = X.select_dtypes(include='number').columns
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols)
        ])

    rf_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=n_estimators, random_state=random_state))
    ])

    X_train, X_test, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    rf_model.fit(X_train, y_train)

    return rf_model, X_test

In [35]:
def evaluate_model_accuracy(y_true, y_pred):
    """Print the accuracy of ``y_pred`` measured against ``y_true``.

    The score comes from ``accuracy_score``; it is only printed, not returned.
    """
    score = accuracy_score(y_true, y_pred)
    message = f'Model Accuracy on Data with Missing Values: {score}'
    print(message)

In [40]:
# Load dataset (path is relative to the notebook's working directory)
file_path = 'tourism_df.csv'
df = load_dataset(file_path)

# Explore dataset: prints the first rows, summary statistics and null counts.
# NOTE(review): the printed head includes personal fields (name, email,
# passport_number); review or clear the saved output before sharing.
explore_dataset(df)

# Replace missing values: mean for numeric columns, mode for the others.
# The frame is filled in place and the same object is returned.
df = replace_missing_values(df)

# Perform RFM analysis
# calculate_rfm_scores adds its columns to `df` in place and returns that same
# object, so `rfm_df` is an alias of `df` and the assignment on the next line
# does not change anything.
rfm_df = calculate_rfm_scores(df, 'Combined_Date', 'Num_of_Visits', 'Total_Spendings', recency_weight=3, frequency_weight=2, monetary_weight=1)
df['cluster'] = rfm_df['cluster']

# Visualize cluster distribution
# visualize_cluster_distribution(rfm_df)

# Label-encode the object-dtype columns of `df` in place (encode returns
# nothing; no correlation is computed here).
encode(df)

# Train RandomForestClassifier
# NOTE(review): only 'cluster' is removed from the features, so the numeric
# columns that calculate_rfm_scores derived the label from (e.g. 'RFM_Score')
# are still fed to the model - confirm this is intended, since it may make
# the reported accuracy uninformative.
rf_model, X_test = train_random_forest_classifier(df)

# Generate predictions using the trained model and the test dataset
y_missing_pred = rf_model.predict(X_test)

# Evaluate the model's accuracy by comparing the true cluster labels with the predicted labels.
# The true labels are looked up through X_test.index because the training
# helper returns only the model and X_test, not y_test.
evaluate_model_accuracy(rfm_df.loc[X_test.index, 'cluster'], y_missing_pred)


   Tourist_ID  Reservation_ID passport_number            name  \
0        1001               1       HHXRD9D90   Ernest Barnes   
1        1002               2       WLS8HDNPC    Andrea Baker   
2        1003               3       INZ9YKCVL  Rebecca Parker   
3        1004               4       A0VBK79WA    Laura Murray   
4        1005               5       CXADDEOGB     Linda Hines   

                         email  phone-number       credit_card  Age  \
0  Ernest.Barnes31@outlook.com  669-792-1661  ************4322   25   
1       Andrea_Baker94@aol.com  858-637-6955  ************9157   38   
2   Rebecca_Parker@comcast.net  652-885-2745  ************3734   28   
3            Laura_M@gmail.com  364-656-8427  ************5677   44   
4           LHines@verizon.com  713-226-5883  ************5498   18   

   workclass      marital-status  ... Hotel_Type            Location  \
0    Private       Never-married  ...     Budget             Al Khor   
1    Private  Married-civ-spouse  ... 