In [3]:
import pandas as pd
import numpy as np 
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Load Data
file_path = "F:\\project data\\travel_data.csv"
data = pd.read_csv(file_path)
print("Data Loaded Successfully.")
print(data.head())  # Display the first few rows of the dataset

Data Loaded Successfully.
   Traveler_ID  Age  Gender Destination Travel_Type  Trip_Duration_Days  \
0            1   25    Male       Paris     Leisure                   7   
1            2   34  Female    New York    Business                  10   
2            3   40    Male       Tokyo     Leisure                   5   
3            4   28  Female      London     Leisure                  14   
4            5   22    Male        Rome    Business                   4   

   Season Accommodation_Type  Budget_USD Frequent_Flyer  
0  Summer              Hotel        1500            Yes  
1  Winter             Airbnb        2500             No  
2  Spring              Hotel        1200            Yes  
3  Autumn             Hostel        1000            Yes  
4  Summer              Hotel        1800             No  


In [5]:
# Step 2: Data Cleaning
# Handle missing values
data = data.dropna()  # Drop rows with missing values

# Remove duplicates
data = data.drop_duplicates()
print("Data Cleaned. No missing values or duplicates.")

Data Cleaned. No missing values or duplicates.


In [7]:
# Identify categorical columns
categorical_cols = ['Gender', 'Destination', 'Travel_Type', 'Season', 'Accommodation_Type']

# Check which columns exist in the dataset spicific data
available_categorical_cols = [col for col in categorical_cols if col in data.columns]

# Apply one-hot encoding to existing categorical columns
if available_categorical_cols:
    data = pd.get_dummies(data, columns=available_categorical_cols, drop_first=True)
    print("Categorical variables encoded successfully.")
else:
    print("No categorical columns to encode.")


Categorical variables encoded successfully.


In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import os

# Specify file path
file_path = "F:\\project data\\travel_data.csv"

# Check if the file exists
if not os.path.exists(file_path):
    print(f"File not found at {file_path}. Please check the path.")
else:
    # Load Data
    data = pd.read_csv(file_path)

    # Make a copy of the dataset for preprocessing
    clean_data = data.copy()

    # Select numeric columns
    numeric_cols = clean_data.select_dtypes(include=['float64', 'int64']).columns

    # Show numeric features before scaling
    print("Numeric features before scaling:")
    print(clean_data[numeric_cols].head())

    # Apply StandardScaler to scale numeric features
    scaler = StandardScaler()
    clean_data[numeric_cols] = scaler.fit_transform(clean_data[numeric_cols])

    # Show numeric features after scaling
    print("Numeric features after scaling:")
    print(clean_data[numeric_cols].head())


Numeric features before scaling:
   Traveler_ID  Age  Trip_Duration_Days  Budget_USD
0            1   25                   7        1500
1            2   34                  10        2500
2            3   40                   5        1200
3            4   28                  14        1000
4            5   22                   4        1800
Numeric features after scaling:
   Traveler_ID       Age  Trip_Duration_Days  Budget_USD
0    -1.566699 -1.013295           -0.386739   -0.317021
1    -1.218544 -0.265782            0.668004    1.268085
2    -0.870388  0.232559           -1.089902   -0.792553
3    -0.522233 -0.764124            2.074330   -1.109575
4    -0.174078 -1.262465           -1.441483    0.158511


In [13]:
# Step 6: Splitting Data
# Define a target column (for demonstration purposes, assume 'Frequent_Flyer' is the target)
target = 'Frequent_Flyer'
X = clean_data.drop(columns=[target])
y = clean_data[target]

# Split the dataset into training (70%) and temporary (30%) datasets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the temporary dataset into validation (50%) and testing (50%) datasets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Display processed data shapes
print("Training Data Shape:", X_train.shape)
print("Validation Data Shape:", X_val.shape)
print("Testing Data Shape:", X_test.shape)

Training Data Shape: (7, 9)
Validation Data Shape: (1, 9)
Testing Data Shape: (2, 9)


In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Step 1: Load Data
file_path = "F:\\project data\\travel_data.csv"   # Updated file path for the uploaded dataset

# Check if the file exists
if not os.path.exists(file_path):
    print(f"The file was not found at: {file_path}")
else:
    # Load the dataset
    data = pd.read_csv(file_path)
    print("Dataset loaded successfully.")

    # Step 2: Data Cleaning
    data = data.dropna()  # Drop rows with missing values
    data = data.drop_duplicates()  # Remove duplicates

    # Step 3: Data Transformation
    # Identify categorical columns
    categorical_cols = ['Gender', 'Destination', 'Travel_Type', 'Season', 'Accommodation_Type']

    # Apply one-hot encoding to all categorical columns
    data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)
    print("Categorical variables encoded.")

    # Step 4: Define Features (X) and Target (y)
    target = 'Frequent_Flyer'
    if target not in data.columns:
        raise ValueError(f"Target column '{target}' not found in dataset.")

    X = data.drop(columns=[target])
    y = data[target]

    # Step 5: Split the Data
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # Step 6: Scaling
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    # Step 7: Train and Evaluate a Model
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)

    # Validation evaluation
    y_val_pred = clf.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"Validation Accuracy: {val_accuracy:.2f}")

    # Test evaluation
    y_test_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f"Test Accuracy: {test_accuracy:.2f}")

    # Step 8: Save Processed Datasets
    output_dir = "./processed_data"  # Save in current directory
    os.makedirs(output_dir, exist_ok=True)  # Create directory if it doesn't exist

    try:
        # Convert transformed data back to DataFrame for saving
        X_train_df = pd.DataFrame(X_train, columns=X.columns)
        X_val_df = pd.DataFrame(X_val, columns=X.columns)
        X_test_df = pd.DataFrame(X_test, columns=X.columns)

        # Save datasets
        X_train_df.to_csv(os.path.join(output_dir, "travel_data_train.csv"), index=False)
        X_val_df.to_csv(os.path.join(output_dir, "travel_data_val.csv"), index=False)
        X_test_df.to_csv(os.path.join(output_dir, "travel_data_test.csv"), index=False)

        y_train.to_csv(os.path.join(output_dir, "travel_data_train_labels.csv"), index=False)
        y_val.to_csv(os.path.join(output_dir, "travel_data_val_labels.csv"), index=False)
        y_test.to_csv(os.path.join(output_dir, "travel_data_test_labels.csv"), index=False)

        print("Data processing and model evaluation complete. Files saved:")
        print(f"Train: {os.path.join(output_dir, 'travel_data_train.csv')}")
        print(f"Validation: {os.path.join(output_dir, 'travel_data_val.csv')}")
        print(f"Test: {os.path.join(output_dir, 'travel_data_test.csv')}")

    except PermissionError as e:
        print(f"Permission Error: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")


Dataset loaded successfully.
Categorical variables encoded.
Validation Accuracy: 1.00
Test Accuracy: 0.50
Data processing and model evaluation complete. Files saved:
Train: ./processed_data\travel_data_train.csv
Validation: ./processed_data\travel_data_val.csv
Test: ./processed_data\travel_data_test.csv
