In [8]:
# Step 1: Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Step 2: Load the dataset
df = pd.read_csv('Day_18_Tours_and_Travels.csv')  # Replace with the actual file path

# Step 3: Initial exploration of the dataset
print("Dataset Information:")
print(df.info())  # Display info about the dataset (columns, non-null count, data types)

print("First Few Rows of Dataset:")
print(df.head())  # View the first few rows to get an idea of the data structure

# Step 4: Handle Missing Values

# Identify missing values
missing_data = df.isna().sum()
missing_percentage = df.isna().mean() * 100

print("Missing Values Count:")
print(missing_data)

print("Percentage of Missing Values:")
print(missing_percentage)

# Impute missing numerical columns (Rating, Customer_Age)
numerical_cols = ['Rating', 'Customer_Age']
imputer = SimpleImputer(strategy='mean')  # Using mean for numerical columns
df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

# Impute missing textual data (Review_Text)
# We can fill missing Review_Text with a placeholder (e.g., "No Review Provided") for simplicity
df['Review_Text'] = df['Review_Text'].fillna("No Review Provided")

# Step 5: Detect and Remove Duplicates
# Identify duplicate reviews
duplicates = df.duplicated(subset=['Review_Text']).sum()

print(f"Number of duplicate reviews: {duplicates}")

# Remove duplicates based on the 'Review_Text' column
df = df.drop_duplicates(subset=['Review_Text'])

# Step 6: Handle Inconsistent Data

# Standardize Rating values to ensure they are between 1 and 5
df['Rating'] = df['Rating'].clip(lower=1, upper=5)

# Correct spelling inconsistencies in Tour_Package names
# You can standardize or manually fix some common inconsistencies.
# Example: use the `.replace()` method to correct misspellings:
df['Tour_Package'] = df['Tour_Package'].replace({
    'paris tour': 'Paris Tour',
    'paris trip': 'Paris Tour',
    'london experience': 'London Experience',
    'rome visit': 'Rome Visit'
    # Add more replacements as necessary
})

# Step 7: Identify and Handle Outliers

# Visualizing Package_Price and Rating outliers using boxplots
plt.figure(figsize=(12, 6))

# Boxplot for Package_Price
plt.subplot(1, 2, 1)
sns.boxplot(data=df, x='Package_Price')
plt.title('Boxplot of Package_Price')

# Boxplot for Rating
plt.subplot(1, 2, 2)
sns.boxplot(data=df, x='Rating')
plt.title('Boxplot of Rating')

plt.tight_layout()
plt.show()

# Apply capping for Package_Price outliers (if necessary)
Q1_price = df['Package_Price'].quantile(0.25)
Q3_price = df['Package_Price'].quantile(0.75)
IQR_price = Q3_price - Q1_price

# Capping the outliers beyond 1.5*IQR
df['Package_Price'] = df['Package_Price'].clip(lower=Q1_price - 1.5 * IQR_price, upper=Q3_price + 1.5 * IQR_price)

# Step 8: Prepare Data for Analysis

# Convert categorical data (Tour_Package) into numerical format using Label Encoding
le = LabelEncoder()
df['Tour_Package'] = le.fit_transform(df['Tour_Package'])

# If needed, scale numerical columns like Package_Price for normalization
scaler = MinMaxScaler()
df[['Package_Price']] = scaler.fit_transform(df[['Package_Price']])

# Step 9: Data Validation

# Check for missing values after cleaning
missing_data_after = df.isna().sum()
print("Missing Values After Cleaning:")
print(missing_data_after)

# Ensure no duplicates remain
duplicates_after = df.duplicated(subset=['Review_Text']).sum()
print(f"Number of duplicate reviews after cleaning: {duplicates_after}")

# Step 10: Final Data Export
df.to_csv('cleaned_travel_customer_reviews.csv', index=False)

print("Cleaned dataset saved as 'cleaned_travel_customer_reviews.csv'")


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Booking_ID     505 non-null    int64  
 1   Destination    478 non-null    object 
 2   Package_Price  485 non-null    float64
 3   Customer_Age   490 non-null    float64
 4   Rating         480 non-null    float64
 5   Review_Text    485 non-null    object 
 6   Travel_Date    495 non-null    object 
dtypes: float64(3), int64(1), object(3)
memory usage: 27.7+ KB
None
First Few Rows of Dataset:
   Booking_ID Destination  Package_Price  Customer_Age  Rating  \
0           1      London         1094.0          21.0     NaN   
1           2       Paris          640.0          20.0     NaN   
2           3   Singapore         3393.0          42.0     3.0   
3           4      London         3555.0          40.0     4.0   
4           5       Dubai         2130.0          44.0

KeyError: 'Tour_Package'