Loading Data:

In [389]:
import pandas as pd

# Read the raw survey responses from the Excel workbook into a DataFrame.
# The path is relative, so the workbook must sit in the notebook's working directory.
survey_path = "Survey_ICS487_Project.xlsx"
data = pd.read_excel(survey_path)


Data Inspection:

In [390]:
# Preview the first few rows to confirm the workbook was parsed as expected
print(data.head())

# Summarise column dtypes and non-null counts.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
data.info()

# Count the missing values in each column
print(data.isnull().sum())


    Name  Salary (SAR)  Age    State     Sex  Monthly Debt (SAR)  \
0   saif          4833   23   Single  Female               83.88   
1  Aisha          3894   23   Single    Male              491.46   
2   Nada          2355   55  Married    Male              667.27   
3    Ali          3521   21   Single  Female              370.20   
4   Mona          3068   25   Single    Male              434.68   

   Elementary Expenses (SAR)        Goal  Number of Children  \
0                    2694.39     Savings                   0   
1                    3443.47     Savings                   0   
2                    9626.48  Investment                   4   
3                    3672.84  Investment                   0   
4                    1228.54     Savings                   0   

  Employment_Status Nationality Budgeting_Rule Financial_Comfortability  \
0           Student       Saudi       70/20/10                  ['yes']   
1           Student       Saudi       50/30/20          

Clean and Preprocess:

In [391]:
# Standardize column names: trim whitespace, lower-case, spaces -> underscores
data.columns = [col.strip().lower().replace(" ", "_") for col in data.columns]

# Remove exact duplicate rows, then every row that has a missing value.
# .copy() yields an independent frame so the column assignments below never
# write into a slice of the raw data.
# No median imputation of salary is needed afterwards: dropna() has already
# removed every row with a missing value, so a later fillna() would be a no-op.
data = data.drop_duplicates().dropna().copy()

# Ensure numeric columns are properly formatted
numeric_columns = ['salary_(sar)', 'age', 'monthly_debt_(sar)', 'elementary_expenses_(sar)']
for col in numeric_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# errors='coerce' turns unparseable entries into NaN *after* the dropna() above,
# so drop those rows explicitly instead of letting NaN leak into later steps.
data = data.dropna(subset=numeric_columns)

# Validate ranges for numeric columns
data = data[data['age'].between(18, 100)]  # Age must be between 18 and 100 (inclusive)
data = data[data['salary_(sar)'] >= 0]  # Salary should not be negative

# Build every row filter from the same frame so each mask is aligned with the
# rows it is applied to (a mask taken from an unfiltered frame and applied to an
# already-filtered one makes pandas warn and silently reindex the mask).
is_saudi = data['nationality'].str.lower() == 'saudi'  # Keep only rows where nationality is 'saudi'
# NOTE(review): the data preview shows values such as "['yes']" in
# financial_comfortability; if negatives are stored as "['no']" this comparison
# never excludes anything -- confirm the stored format.
is_comfortable = data['financial_comfortability'].str.lower() != 'no'  # Exclude 'no'
has_goal_progress = data['goal_progress'].str.lower() != 'no'  # Exclude 'no'
filtered_data = data[is_saudi & is_comfortable & has_goal_progress]

# Drop the columns that were only needed for filtering
filtered_data = filtered_data.drop(
    columns=['nationality', 'financial_comfortability', 'goal_progress']
).copy()

# Store the filtered data back for the following steps
data = filtered_data
print(data.tail())



       name  salary_(sar)  age    state     sex  monthly_debt_(sar)  \
1170  turki          1924   30  Married    Male              339.17   
1171  ahmed          3679   19   Single  Female              400.72   
1173  Noura          2749   35   Single    Male              451.26   
1174  ziyad          7516   34  Married    Male              616.58   
1175  Fahad         26217   34   Single    Male             1380.83   

      elementary_expenses_(sar)        goal  number_of_children  \
1170                    4811.47  Investment                   2   
1171                    2903.78     Savings                   0   
1173                    4993.05     Savings                   0   
1174                    4535.88     Savings                   3   
1175                    1917.85     Savings                   0   

     employment_status budgeting_rule  
1170           Student       50/30/20  
1171           Student       70/20/10  
1173        Unemployed       50/30/20  
1174      

  filtered_data = filtered_data[


Removing outliers:

In [392]:
import numpy as np

def remove_outliers_iqr(df, column):
    """
    Keep only the rows whose `column` value lies inside the IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, where Q1 and Q3 are the
    0.25 and 0.75 quantiles of `df[column]`. Both fences are inclusive.

    Parameters:
        df: DataFrame to filter.
        column: name of the numeric column the fences are computed from.

    Returns:
        The filtered DataFrame; surviving rows keep their original index labels.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile  # interquartile range
    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread
    # between() is inclusive on both ends, matching a >= / <= pair
    inside_fences = values.between(fence_low, fence_high)
    return df[inside_fences]

# Apply the IQR filter to each numeric column in turn.
# The filters are sequential: each column's quartiles are computed on the rows
# that survived the filters of the columns before it.
numeric_columns = ['salary_(sar)', 'age', 'monthly_debt_(sar)', 'elementary_expenses_(sar)']
for col in numeric_columns:
    data = remove_outliers_iqr(data, col)


# Define age bins and labels. Bins are right-inclusive; the first bin starts at
# 0, so any age of 24 or less is labelled '18-24'.
age_bins = [0, 24, 34, 44, 54, float('inf')]
age_labels = ['18-24', '25-34', '35-44', '45-54', '55+']

# Replace the raw age with its age group.
# assign() builds a new frame instead of writing a column into the result of the
# boolean filtering above, which can trigger pandas' SettingWithCopyWarning.
data = data.assign(
    age_group=pd.cut(data['age'], bins=age_bins, labels=age_labels, right=True)
).drop(columns=['age'])


Encoding Categorical Variables:

In [393]:
from sklearn.preprocessing import LabelEncoder

# Expand the nominal columns into indicator columns; drop_first=True omits the
# first level of each column so the indicators are not redundant.
categorical_columns = ['state', 'sex', 'goal', 'employment_status']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Turn each age-group label into an integer code with LabelEncoder
label_encoder = LabelEncoder()
age_group_codes = label_encoder.fit_transform(data['age_group'])

# Show which integer stands for which age group: the code of a label is its
# position in label_encoder.classes_.
print("Age Group Encoding:")
for encoded, label in enumerate(label_encoder.classes_):
    print(f"{label}: {encoded}")

# Keep the integer codes and discard the original label column
data = data.assign(age_group_encoded=age_group_codes).drop(columns=['age_group'])


Age Group Encoding:
18-24: 0
25-34: 1
35-44: 2
45-54: 3
55+: 4


Feature Engineering:

In [394]:
# Ratio features: each new column is a source column divided by salary.
# NOTE(review): 'savings_ratio' is computed as elementary expenses / salary,
# i.e. a share of income spent rather than saved -- confirm the intended meaning.
ratio_sources = {
    'debt_to_income_ratio': 'monthly_debt_(sar)',
    'savings_ratio': 'elementary_expenses_(sar)',
}
for ratio_name, source_col in ratio_sources.items():
    raw_ratio = data[source_col] / data['salary_(sar)']
    # Division can yield +/-inf or NaN; map all of those to 0
    data[ratio_name] = raw_ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

# The respondent's name is not used as a feature
data = data.drop(columns=['name'])

# Summary statistics of the resulting numeric columns (displayed as cell output)
data.describe()

Unnamed: 0,salary_(sar),monthly_debt_(sar),elementary_expenses_(sar),number_of_children,age_group_encoded,debt_to_income_ratio,savings_ratio
count,739.0,739.0,739.0,739.0,739.0,739.0,739.0
mean,4058.55751,476.217118,5656.033329,1.309878,1.300406,0.148333,2.396972
std,4198.683241,376.510113,3442.699599,1.789777,1.244669,0.08106,2.20878
min,904.0,50.6,419.51,0.0,0.0,0.002883,0.048809
25%,1763.0,198.035,3048.495,0.0,0.0,0.078898,0.798022
50%,2521.0,365.98,4789.63,0.0,1.0,0.141615,1.793381
75%,4139.5,634.24,8273.735,3.0,2.0,0.215983,3.281426
max,22608.0,1803.12,15038.91,5.0,4.0,0.299766,13.914766


Saving Data:

In [395]:
# Output locations for the cleaned data set (relative to the working directory)
excel_path = "Cleaned_Survey_ICS487_Project.xlsx"
csv_path = "Cleaned_Survey_ICS487_Project.csv"

# Write the same frame in both formats; index=False leaves the row index out
data.to_excel(excel_path, index=False)
data.to_csv(csv_path, index=False)