In [89]:
import pandas as pd
import numpy as np

# 1️⃣ Load the dataset
df = pd.read_csv('Travel.csv')  # Make sure this file is in your working directory
df.columns = df.columns.str.strip()  # Remove leading/trailing spaces from column names

# 2️⃣ Convert these columns to numeric types
# Sometimes numeric columns are read as 'object' due to bad entries (e.g. strings or blanks)
numeric_cols = [
    'Age', 'DurationOfPitch', 'NumberOfFollowups', 'PreferredPropertyStar',
    'NumberOfTrips', 'NumberOfChildrenVisiting', 'MonthlyIncome'
]

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # Converts invalid strings to NaN

# 3️⃣ Fill missing values in numeric columns with the median
for col in numeric_cols:
    df[col].fillna(df[col].median(), inplace=True)

# 4️⃣ Fill missing values in categorical columns with the mode (most common value)
categorical_cols = [
    'TypeofContact', 'Occupation', 'Gender',
    'ProductPitched', 'MaritalStatus', 'Designation'
]

for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# 5️⃣ Fix inconsistent labels in Gender and MaritalStatus
# Replace 'Fe Male' with 'Female', and 'Single' with 'Unmarried' for consistency
df['Gender'] = df['Gender'].replace({'Fe Male': 'Female'})
df['MaritalStatus'] = df['MaritalStatus'].replace({'Single': 'Unmarried'})

# 6️⃣ One-hot encode categorical columns (convert them to numeric dummies)
# drop_first=True avoids dummy variable trap (by dropping first category from each)
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# 7️⃣ Create new feature: Income per person (prevents division by zero)
df['IncomePerPerson'] = df['MonthlyIncome'] / (df['NumberOfChildrenVisiting'] + 1)

# 8️⃣ Create new feature: Ratio of children to total trips
df['ChildrenRatio'] = df.apply(
    lambda row: row['NumberOfChildrenVisiting'] / row['NumberOfTrips'] if row['NumberOfTrips'] != 0 else 0,
    axis=1
)

# 9️⃣ Drop unnecessary columns (like CustomerID, if present)
if 'CustomerID' in df.columns:
    df.drop('CustomerID', axis=1, inplace=True)

# 🔟 Convert all boolean (True/False) values to integers (1/0)
# This is needed because most ML models expect numeric input
df = df.replace({True: 1, False: 0})

# 🔁 (Optional) Save cleaned dataset to a new CSV file
df.to_csv('Travel_cleaned.csv', index=False)

# 🔍 Summary Output
print("✅ Cleaning & feature engineering completed.")
print("📊 Final shape of dataset:", df.shape)
print("🧾 Columns in dataset:", df.columns.tolist())
print(df.head())













✅ Cleaning & feature engineering completed.
📊 Final shape of dataset: (4888, 33)
🧾 Columns in dataset: ['ProdTaken', 'Age', 'CityTier', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfTrips', 'Passport', 'PitchSatisfactionScore', 'OwnCar', 'NumberOfChildrenVisiting', 'MonthlyIncome', 'TypeofContact_ Company Invited', 'TypeofContact_ Self Enquiry   ', 'Occupation_ Large Business', 'Occupation_ Salaried      ', 'Occupation_ Small Business', 'Gender_ Female ', 'Gender_ Male   ', 'ProductPitched_ Deluxe        ', 'ProductPitched_ King          ', 'ProductPitched_ Standard      ', 'ProductPitched_ Super Deluxe  ', 'MaritalStatus_ Married      ', 'MaritalStatus_ Single       ', 'MaritalStatus_ Unmarried    ', 'Designation_ Executive     ', 'Designation_ Manager       ', 'Designation_ Senior Manager', 'Designation_ VP            ', 'IncomePerPerson', 'ChildrenRatio']
   ProdTaken   Age  CityTier  DurationOfPitch  NumberOfPersonVisiting  \
0 