Loading Data:

In [190]:
import pandas as pd

# Load the Excel file into a DataFrame
data = pd.read_excel("Survey_ICS487_Project.xlsx")


Data Inspection:

In [191]:
# Check the first few rows
print(data.head())

# Get an overview of the data
print(data.info())

# Check for missing values
print(data.isnull().sum())

      Name  Salary (SAR)  Age    State     Sex  Monthly Debt (SAR)  \
0  Fatimah          1241   60   Single    Male              138.19   
1   Salman          1881   60   Single  Female              336.72   
2   bander          7158   51  Married    Male              172.66   
3     Naif          2949   23  Married    Male              483.67   
4   faisal          3259   25  Married    Male              617.57   

   Elementary Expenses (SAR)        Goal  Number of Children  \
0                    3851.94  Investment                   0   
1                    4765.68  Investment                   0   
2                    7295.41  Investment                   0   
3                    5044.26  Investment                   1   
4                    4716.38  Investment                   5   

  Employment_Status Nationality Budgeting_Rule Financial_Comfortability  \
0        Unemployed       Saudi       50/30/20                  ['yes']   
1        Unemployed       Saudi       50/30/

Clean and Preprocess:

In [192]:
# Standardize column names
data.columns = [col.strip().lower().replace(" ", "_") for col in data.columns]

# Check for duplicates and remove them
data = data.drop_duplicates()

# Handle missing values (drop rows or fill them as needed)
data = data.dropna()  # Drop rows with missing values
# Alternatively, fill missing values (example: fill salary with median)
data['salary_(sar)'] = data['salary_(sar)'].fillna(data['salary_(sar)'].median())

# Ensure numeric columns are properly formatted
numeric_columns = ['salary_(sar)', 'age', 'monthly_debt_(sar)', 'elementary_expenses_(sar)']
for col in numeric_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Validate ranges for numeric columns
data = data[(data['age'] >= 18) & (data['age'] <= 100)]  # Age should be between 0 and 100
data = data[data['salary_(sar)'] >= 0]  # Salary should not be negative

# Filter irrlivant records 
data = data[data['nationality'].str.lower() == 'saudi']
data = data[data['financial_comfortability'].str.lower() == 'no']
data = data[data['goal_progress'].str.lower() == 'no']

# Drop irrlivant column
data = data.drop(columns=['nationality'])
data = data.drop(columns=['financial_comfortability'])
data = data.drop(columns=['goal_progress'])




Removing outliers:

In [193]:
import numpy as np

def remove_outliers_iqr(df, column):
    """
    Removes outliers from a column using the IQR method.
    """
    Q1 = df[column].quantile(0.25)  # First quartile
    Q3 = df[column].quantile(0.75)  # Third quartile
    IQR = Q3 - Q1  # Interquartile range
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Apply to numeric columns with potential outliers
numeric_columns = ['salary_(sar)', 'age', 'monthly_debt_(sar)', 'elementary_expenses_(sar)']
for col in numeric_columns:
    data = remove_outliers_iqr(data, col)


# Define age bins and labels
age_bins = [0, 24, 34, 44, 54, float('inf')]
age_labels = ['18-24', '25-34', '35-44', '45-54', '55+']

# Create a new column for age groups
data['age_group'] = pd.cut(data['age'], bins=age_bins, labels=age_labels, right=True)

data = data.drop(columns=['age'])

Encoding Categorical Variables:

In [194]:
from sklearn.preprocessing import LabelEncoder

# One-hot encode categorical columns
categorical_columns = ['state', 'sex', 'goal', 'employment_status']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Encode the age_group using LabelEncoder
label_encoder = LabelEncoder()
data['age_group_encoded'] = label_encoder.fit_transform(data['age_group'])

# Display the encoding
print("Age Group Encoding:")
for label, encoded in zip(label_encoder.classes_, range(len(label_encoder.classes_))):
    print(f"{label}: {encoded}")

data = data.drop(columns=['age_group'])

Age Group Encoding:


Feature Engineering:

In [195]:
# Create new features
data['debt_to_income_ratio'] = data['monthly_debt_(sar)'] / data['salary_(sar)']
data['savings_ratio'] = data['elementary_expenses_(sar)'] / data['salary_(sar)']

# Handle infinite or NaN values due to division
data['debt_to_income_ratio'] = data['debt_to_income_ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
data['savings_ratio'] = data['savings_ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)

# Drop irrelevant columns
data = data.drop(columns=['name'])

data.describe()

Unnamed: 0,salary_(sar),monthly_debt_(sar),elementary_expenses_(sar),number_of_children,age_group_encoded,debt_to_income_ratio,savings_ratio
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,,,,,,,
std,,,,,,,
min,,,,,,,
25%,,,,,,,
50%,,,,,,,
75%,,,,,,,
max,,,,,,,


Saving Data:

In [196]:
# Save cleaned data to a new Excel file
data.to_excel("Cleaned_Survey_ICS487_Project.xlsx", index=False)

# Or save it as a CSV file
data.to_csv("Cleaned_Survey_ICS487_Project.csv", index=False)