In [1]:
import pandas as pd
import numpy as np

In [2]:
# Example bucketing functions
def bucket_age(age):
    bins = [0, 24, 34, 44, 54, 64, 100]
    labels = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
    return pd.cut([age], bins=bins, labels=labels)[0]


In [3]:
def bucket_income(income):
    bins = [0, 30000, 50000, 70000, np.inf]
    labels = ['<30,000', '30,000–50,000', '50,000–70,000', '>70,000']
    return pd.cut([income], bins=bins, labels=labels)[0]


In [4]:
# Load your new Kaggle-style data and survey data
kaggle_df = pd.read_csv(r"C:\Users\manod\Desktop\Capstone\Data\financial_risk_assessment.csv")

# Survey Data Preprocessing
survey_df = pd.read_csv(r"C:\Users\manod\Desktop\Capstone\Data\Survey Data.csv")

In [5]:
# Set random seed for reproducibility
np.random.seed(42)

In [6]:
# Apply buckets
kaggle_df['age_bucket'] = kaggle_df['Age'].apply(bucket_age)
kaggle_df['income_bucket'] = kaggle_df['Income'].apply(bucket_income)
kaggle_df['has_loan'] = (kaggle_df['Loan Amount'] > 0).astype(int)
#kaggle_df['has_dependents'] = (kaggle_df['Number of Dependents'] > 0).astype(int)

In [214]:
import numpy as np
import pandas as pd

np.random.seed(42)

# ---- 1. Assign Risk Preference with Score + Jitter ----
def assign_risk_preference(row):
    score = 0
    if row['age_bucket'] in ["18-24", "25-34"]: score += 1
    if row['income_bucket'] == "70,000+": score += 1
    if row['Credit Score'] in [">700", "681-700"]: score += 1
    if row['Years at Current Job'] > 5: score += 1
    if row['Number of Dependents'] < 2: score += 1
    if not row.get('has_loan', False): score += 1
    if row['Previous Defaults'] == 0: score += 1
    if row['Payment History'] == "Good": score += 1

    # Add jitter to break deterministic scoring
    score += np.random.randint(-1, 2)

    if row['Previous Defaults'] > 2 or row['Payment History'] == "Poor":
        return np.random.choice(["Low Risk", "Risk Taker"], p=[0.9, 0.1])

    if score >= 6:
        return np.random.choice(["Risk Taker", "Low Risk"], p=[0.85, 0.15])
    elif score >= 3:
        return np.random.choice(["Risk Taker", "Low Risk"], p=[0.6, 0.4])
    else:
        return "Low Risk"

# ---- 2. Label Noise Injector ----
def flip_label(label, flip_prob=0.15):
    return "Risk Taker" if label == "Low Risk" and np.random.rand() < flip_prob else \
           "Low Risk" if label == "Risk Taker" and np.random.rand() < flip_prob else label

# ---- 3. Other Simulated Columns ----
def assign_reaction_to_loss(row):
    base_probs = {
        "Low Risk": [0.4, 0.4, 0.2],
        "Risk Taker": [0.1, 0.3, 0.6]
    }
    prob = base_probs.get(row['risk_preference'], [0.4, 0.3, 0.3])
    return np.random.choice(
        ["Sell everything to avoid further losses", "Hold and wait it out", "Invest more while prices are low"],
        p=prob
    )

def simulate_saving_frequency(row):
    if row['income_bucket'] == "<30,000" or row.get('employment_status') == "Unemployed":
        return np.random.choice(["Monthly", "Rarely"], p=[0.5, 0.5])
    elif row['age_bucket'] in ["18-25", "26-35"]:
        return np.random.choice(["Monthly", "Weekly", "Rarely"], p=[0.7, 0.25, 0.05])
    else:
        return np.random.choice(["Monthly", "Weekly"], p=[0.85, 0.15])

def simulate_investment_frequency(row):
    if row['income_bucket'] == "70,000+" or row['risk_preference'] == "Risk Taker":
        return np.random.choice(["Regularly", "Occasionally"], p=[0.85, 0.15])
    elif row['risk_preference'] == "Low Risk":
        return np.random.choice(["Occasionally", "Rarely"], p=[0.7, 0.3])
    else:
        return np.random.choice(["Regularly", "Occasionally"], p=[0.65, 0.35])

def simulate_has_insurance(row):
    if row['income_bucket'] in ["50,000-69,999", "70,000+"] or row['age_bucket'] in ["46-55", "56-65", "65+"] or row.get('has_dependents'):
        return np.random.choice(["Yes", "No"], p=[0.97, 0.03])
    elif row['age_bucket'] in ["18-25", "26-35"]:
        return np.random.choice(["Yes", "No"], p=[0.8, 0.2])
    else:
        return np.random.choice(["Yes", "No"], p=[0.9, 0.1])

def simulate_financial_goal(row):
    if row['age_bucket'] in ["55-64", "65+"]:
        return "Retirement"
    elif row['age_bucket'] in ["18-24", "25-34"]:
        return np.random.choice(["General wealth building", "Travel"], p=[0.85, 0.15])
    elif row['income_bucket'] == "70,000+":
        return np.random.choice(["Retirement", "General wealth building"], p=[0.6, 0.4])
    else:
        return np.random.choice(["Retirement", "General wealth building"], p=[0.45, 0.55])

def simulate_time_horizon(row):
    if row['age_bucket'] in ["55-64", "65+"]:
        return np.random.choice(["3 - 10 years", "More than 10 years"], p=[0.8, 0.2])
    elif row['age_bucket'] in ["18-24", "25-34"]:
        return "More than 10 years"
    else:
        return np.random.choice(["3 - 10 years", "More than 10 years"], p=[0.5, 0.5])

# ---- 4. Inject Feature Noise ----
def inject_feature_noise(df, column, fraction=0.4):
    noisy_indices = df.sample(frac=fraction, random_state=42).index
    df.loc[noisy_indices, column] = np.random.permutation(df.loc[noisy_indices, column].values)
    return df

# ---- 5. Apply to DataFrame ----
# You must already have a DataFrame `kaggle_df`
kaggle_df['risk_preference'] = kaggle_df.apply(assign_risk_preference, axis=1)
kaggle_df['risk_preference'] = kaggle_df['risk_preference'].apply(lambda x: flip_label(x, flip_prob=0.15))

kaggle_df['reaction_to_loss'] = kaggle_df.apply(assign_reaction_to_loss, axis=1)
kaggle_df['saving_frequency'] = kaggle_df.apply(simulate_saving_frequency, axis=1)
kaggle_df['investment_frequency'] = kaggle_df.apply(simulate_investment_frequency, axis=1)
kaggle_df['has_insurance'] = kaggle_df.apply(simulate_has_insurance, axis=1)
kaggle_df['financial_goal'] = kaggle_df.apply(simulate_financial_goal, axis=1)
kaggle_df['time_horizon'] = kaggle_df.apply(simulate_time_horizon, axis=1)

# Inject noise in features
for col, frac in {
    'investment_frequency': 0.5,
    'has_insurance': 0.4,
    'saving_frequency': 0.5,
    'time_horizon': 0.5,
    'financial_goal': 0.4
}.items():
    kaggle_df = inject_feature_noise(kaggle_df, col, fraction=frac)


In [7]:
import numpy as np
import pandas as pd

np.random.seed(42)

# ---- 1. Assign Risk Preference with Score + Jitter ----
def assign_risk_preference(row):
    score = 0
    if row['age_bucket'] in ["18-24", "25-34"]: score += 1
    if row['income_bucket'] == "70,000+": score += 1
    if row['Credit Score'] in [">700", "681-700"]: score += 1
    if row['Years at Current Job'] > 5: score += 1
    if row['Number of Dependents'] < 2: score += 1
    if not row.get('has_loan', False): score += 1
    if row['Previous Defaults'] == 0: score += 1
    if row['Payment History'] == "Good": score += 1

    # Add jitter to break deterministic scoring
    score += np.random.randint(-1, 2)

    if row['Previous Defaults'] > 2 or row['Payment History'] == "Poor":
        return np.random.choice(["Low Risk", "Risk Taker"], p=[0.9, 0.1])

    if score >= 6:
        return np.random.choice(["Risk Taker", "Low Risk"], p=[0.85, 0.15])
    elif score >= 3:
        return np.random.choice(["Risk Taker", "Low Risk"], p=[0.6, 0.4])
    else:
        return "Low Risk"

# ---- 2. Label Noise Injector ----
def flip_label(label, flip_prob=0.15):
    return "Risk Taker" if label == "Low Risk" and np.random.rand() < flip_prob else \
           "Low Risk" if label == "Risk Taker" and np.random.rand() < flip_prob else label

# ---- 3. Simulate Other Columns ----
def assign_reaction_to_loss(row):
    base_probs = {
        "Low Risk": [0.4, 0.4, 0.2],
        "Risk Taker": [0.1, 0.3, 0.6]
    }
    prob = base_probs.get(row['risk_preference'], [0.4, 0.3, 0.3])
    return np.random.choice(
        ["Sell everything to avoid further losses", "Hold and wait it out", "Invest more while prices are low"],
        p=prob
    )

def simulate_saving_frequency(row):
    if row['income_bucket'] == "<30,000" or row.get('employment_status') == "Unemployed":
        return np.random.choice(["Monthly", "Rarely"], p=[0.5, 0.5])
    elif row['age_bucket'] in ["18-25", "26-35"]:
        return np.random.choice(["Monthly", "Weekly", "Rarely"], p=[0.7, 0.25, 0.05])
    else:
        return np.random.choice(["Monthly", "Weekly"], p=[0.85, 0.15])

def simulate_investment_frequency(row):
    if row['income_bucket'] == "70,000+" or row['risk_preference'] == "Risk Taker":
        return np.random.choice(["Regularly", "Occasionally"], p=[0.85, 0.15])
    elif row['risk_preference'] == "Low Risk":
        return np.random.choice(["Occasionally", "Rarely"], p=[0.7, 0.3])
    else:
        return np.random.choice(["Regularly", "Occasionally"], p=[0.65, 0.35])

def simulate_has_insurance(row):
    if row['income_bucket'] in ["50,000-69,999", "70,000+"] or row['age_bucket'] in ["46-55", "56-65", "65+"] or row.get('has_dependents'):
        return np.random.choice(["Yes", "No"], p=[0.97, 0.03])
    elif row['age_bucket'] in ["18-25", "26-35"]:
        return np.random.choice(["Yes", "No"], p=[0.8, 0.2])
    else:
        return np.random.choice(["Yes", "No"], p=[0.9, 0.1])

def simulate_financial_goal(row):
    if row['age_bucket'] in ["55-64", "65+"]:
        return "Retirement"
    elif row['age_bucket'] in ["18-24", "25-34"]:
        return np.random.choice(["General wealth building", "Travel"], p=[0.85, 0.15])
    elif row['income_bucket'] == "70,000+":
        return np.random.choice(["Retirement", "General wealth building"], p=[0.6, 0.4])
    else:
        return np.random.choice(["Retirement", "General wealth building"], p=[0.45, 0.55])

def simulate_time_horizon(row):
    if row['age_bucket'] in ["55-64", "65+"]:
        return np.random.choice(["3 - 10 years", "More than 10 years"], p=[0.8, 0.2])
    elif row['age_bucket'] in ["18-24", "25-34"]:
        return "More than 10 years"
    else:
        return np.random.choice(["3 - 10 years", "More than 10 years"], p=[0.5, 0.5])

# ---- 4. Inject Feature Noise ----
def inject_feature_noise(df, column, fraction=0.4):
    noisy_indices = df.sample(frac=fraction, random_state=42).index
    df.loc[noisy_indices, column] = np.random.permutation(df.loc[noisy_indices, column].values)
    return df

# ---- 5. Apply to DataFrame ----
def simulate_behavioral_features(df):
    df['risk_preference'] = df.apply(assign_risk_preference, axis=1)
    df['risk_preference'] = df['risk_preference'].apply(lambda x: flip_label(x, flip_prob=0.15))

    df['reaction_to_loss'] = df.apply(assign_reaction_to_loss, axis=1)
    df['saving_frequency'] = df.apply(simulate_saving_frequency, axis=1)
    df['investment_frequency'] = df.apply(simulate_investment_frequency, axis=1)
    df['has_insurance'] = df.apply(simulate_has_insurance, axis=1)
    df['financial_goal'] = df.apply(simulate_financial_goal, axis=1)
    df['time_horizon'] = df.apply(simulate_time_horizon, axis=1)

    # Inject feature noise
    noise_map = {
        'investment_frequency': 0.5,
        'has_insurance': 0.4,
        'saving_frequency': 0.5,
        'time_horizon': 0.5,
        'financial_goal': 0.4
    }
    for col, frac in noise_map.items():
        df = inject_feature_noise(df, col, fraction=frac)

    return df

# ---- 6. Run Simulation ----
kaggle_df = simulate_behavioral_features(kaggle_df)


In [10]:
print(kaggle_df['Risk Rating'].value_counts())


Risk Rating
Low       9000
Medium    4500
High      1500
Name: count, dtype: int64


In [11]:
kaggle_df.head()

Unnamed: 0,Age,Gender,Education Level,Marital Status,Income,Credit Score,Loan Amount,Loan Purpose,Employment Status,Years at Current Job,...,age_bucket,income_bucket,has_loan,risk_preference,reaction_to_loss,saving_frequency,investment_frequency,has_insurance,financial_goal,time_horizon
0,49,Male,PhD,Divorced,72799.0,688.0,45713.0,Business,Unemployed,19,...,45-54,">70,000",1,Low Risk,Hold and wait it out,Monthly,Rarely,Yes,Retirement,More than 10 years
1,57,Female,Bachelor's,Widowed,,690.0,33835.0,Auto,Employed,6,...,55-64,,1,Low Risk,Invest more while prices are low,Monthly,Occasionally,No,Retirement,More than 10 years
2,21,Non-binary,Master's,Single,55687.0,600.0,36623.0,Home,Employed,8,...,18-24,"50,000–70,000",1,Low Risk,Sell everything to avoid further losses,Monthly,Occasionally,Yes,General wealth building,More than 10 years
3,59,Male,Bachelor's,Single,26508.0,622.0,26541.0,Personal,Unemployed,2,...,55-64,"<30,000",1,Low Risk,Invest more while prices are low,Weekly,Rarely,Yes,Retirement,More than 10 years
4,25,Non-binary,Bachelor's,Widowed,49427.0,766.0,36528.0,Personal,Unemployed,10,...,25-34,"30,000–50,000",1,Low Risk,Sell everything to avoid further losses,Monthly,Occasionally,Yes,General wealth building,More than 10 years


In [12]:
print(kaggle_df.isnull().sum())

Age                         0
Gender                      0
Education Level             0
Marital Status              0
Income                   2250
Credit Score             2250
Loan Amount              2250
Loan Purpose                0
Employment Status           0
Years at Current Job        0
Payment History             0
Debt-to-Income Ratio        0
Assets Value             2250
Number of Dependents     2250
City                        0
State                       0
Country                     0
Previous Defaults        2250
Marital Status Change       0
Risk Rating                 0
age_bucket                  0
income_bucket            2250
has_loan                    0
risk_preference             0
reaction_to_loss            0
saving_frequency            0
investment_frequency        0
has_insurance               0
financial_goal              0
time_horizon                0
dtype: int64


In [13]:
def assign_reaction_to_loss(row):
    base_probs = {
        "Low Risk": [0.4,0.4,0.2],
        "Risk Taker": [0.0,0.3,0.7]
    }
    prob = base_probs.get(row['risk_preference'], [0.4,0.3, 0.3])
    return np.random.choice(
        ["Sell everything to avoid further losses","Hold and wait it out", "Invest more while prices are low"]
    )

In [14]:
kaggle_df['Credit Score'] = kaggle_df['Credit Score'].fillna(kaggle_df['Credit Score'].mean())

In [15]:
kaggle_df['Number of Dependents'] = kaggle_df['Number of Dependents'].fillna(0)

In [16]:
# Define possible buckets (adjust to your actual bucket names)
income_buckets = ['<30,000', '30,000–50,000', '50,000–70,000', '>70,000']

def fill_income(row):
    if pd.isnull(row['income_bucket']):
        # You can check other conditions here:
        if 'Employment Status' in row and row['Employment Status'] == "Unemployed" and row.get('years_at_job', 1) == 0:
            return "<30,000"
        else:
            # You can use weighted or uniform random. Here is uniform:
            return np.random.choice(income_buckets)
    else:
        return row['income_bucket']

kaggle_df['income_bucket'] = kaggle_df.apply(fill_income, axis=1)

In [17]:
def assign_financial_knowledge(risk_rating):
    if 'Risk Rating' == 'Low':
        # 80% Very poor, 20% Beginner
        return np.random.choice(['Very poor', 'Beginner'], p=[0.8, 0.2])
    elif 'Risk Rating' == 'Medium':
        # 70% Beginner, 30% Good
        return np.random.choice(['Beginner', 'Intermediate'], p=[0.7, 0.3])
    elif 'Risk Rating' == 'High':
        # 30% Good, 70% Advanced
        return np.random.choice(['Intermediate', 'Advanced'], p=[0.3, 0.7])
    else:
        # Default/fallback
        return np.random.choice(['Very poor', 'Beginner', 'Intermediate', 'Advanced'])

# If your risk rating column is 'risk_rating_num', use that:
kaggle_df['financial_knowledge_level'] = kaggle_df['Risk Rating'].apply(assign_financial_knowledge)


In [18]:
kaggle_df.head()

Unnamed: 0,Age,Gender,Education Level,Marital Status,Income,Credit Score,Loan Amount,Loan Purpose,Employment Status,Years at Current Job,...,income_bucket,has_loan,risk_preference,reaction_to_loss,saving_frequency,investment_frequency,has_insurance,financial_goal,time_horizon,financial_knowledge_level
0,49,Male,PhD,Divorced,72799.0,688.0,45713.0,Business,Unemployed,19,...,">70,000",1,Low Risk,Hold and wait it out,Monthly,Rarely,Yes,Retirement,More than 10 years,Intermediate
1,57,Female,Bachelor's,Widowed,,690.0,33835.0,Auto,Employed,6,...,">70,000",1,Low Risk,Invest more while prices are low,Monthly,Occasionally,No,Retirement,More than 10 years,Intermediate
2,21,Non-binary,Master's,Single,55687.0,600.0,36623.0,Home,Employed,8,...,"50,000–70,000",1,Low Risk,Sell everything to avoid further losses,Monthly,Occasionally,Yes,General wealth building,More than 10 years,Beginner
3,59,Male,Bachelor's,Single,26508.0,622.0,26541.0,Personal,Unemployed,2,...,"<30,000",1,Low Risk,Invest more while prices are low,Weekly,Rarely,Yes,Retirement,More than 10 years,Advanced
4,25,Non-binary,Bachelor's,Widowed,49427.0,766.0,36528.0,Personal,Unemployed,10,...,"30,000–50,000",1,Low Risk,Sell everything to avoid further losses,Monthly,Occasionally,Yes,General wealth building,More than 10 years,Intermediate


converting categorical variable to numeric for training the model

In [19]:
risk_map={
    "Low Risk": 0,
    "Risk Taker": 1}
reaction_map={
    "Sell everything to avoid further losses": 0,
    "Hold and wait it out": 1,
    "Invest more while prices are low": 2
}

financial_knowledge_map = {
    "Very poor": 0,
    "Beginner": 1,
    "Intermediate": 2,
    "Advanced": 3
}

risk_rating_map = {
    'Low': 0,
    'Medium':1,
    'High': 2
}

kaggle_df['risk_preference_num'] = kaggle_df['risk_preference'].map(risk_map)
kaggle_df['reaction_to_loss_num'] = kaggle_df['reaction_to_loss'].map(reaction_map)
kaggle_df['financial_knowledge_num'] = kaggle_df['financial_knowledge_level'].map(financial_knowledge_map)
kaggle_df['risk_rating_num'] = kaggle_df['Risk Rating'].map(risk_rating_map)

In [20]:
kaggle_df.head()

Unnamed: 0,Age,Gender,Education Level,Marital Status,Income,Credit Score,Loan Amount,Loan Purpose,Employment Status,Years at Current Job,...,saving_frequency,investment_frequency,has_insurance,financial_goal,time_horizon,financial_knowledge_level,risk_preference_num,reaction_to_loss_num,financial_knowledge_num,risk_rating_num
0,49,Male,PhD,Divorced,72799.0,688.0,45713.0,Business,Unemployed,19,...,Monthly,Rarely,Yes,Retirement,More than 10 years,Intermediate,0,1,2,0
1,57,Female,Bachelor's,Widowed,,690.0,33835.0,Auto,Employed,6,...,Monthly,Occasionally,No,Retirement,More than 10 years,Intermediate,0,2,2,1
2,21,Non-binary,Master's,Single,55687.0,600.0,36623.0,Home,Employed,8,...,Monthly,Occasionally,Yes,General wealth building,More than 10 years,Beginner,0,0,1,1
3,59,Male,Bachelor's,Single,26508.0,622.0,26541.0,Personal,Unemployed,2,...,Weekly,Rarely,Yes,Retirement,More than 10 years,Advanced,0,2,3,1
4,25,Non-binary,Bachelor's,Widowed,49427.0,766.0,36528.0,Personal,Unemployed,10,...,Monthly,Occasionally,Yes,General wealth building,More than 10 years,Intermediate,0,0,2,0


In [21]:
# Replace 'financial_knowledge_level' with your column name
null_rows = kaggle_df[kaggle_df['reaction_to_loss_num']
                      .isnull()]
print(null_rows)

Empty DataFrame
Columns: [Age, Gender, Education Level, Marital Status, Income, Credit Score, Loan Amount, Loan Purpose, Employment Status, Years at Current Job, Payment History, Debt-to-Income Ratio, Assets Value, Number of Dependents, City, State, Country, Previous Defaults, Marital Status Change, Risk Rating, age_bucket, income_bucket, has_loan, risk_preference, reaction_to_loss, saving_frequency, investment_frequency, has_insurance, financial_goal, time_horizon, financial_knowledge_level, risk_preference_num, reaction_to_loss_num, financial_knowledge_num, risk_rating_num]
Index: []

[0 rows x 35 columns]


In [22]:
kaggle_df.head()

Unnamed: 0,Age,Gender,Education Level,Marital Status,Income,Credit Score,Loan Amount,Loan Purpose,Employment Status,Years at Current Job,...,saving_frequency,investment_frequency,has_insurance,financial_goal,time_horizon,financial_knowledge_level,risk_preference_num,reaction_to_loss_num,financial_knowledge_num,risk_rating_num
0,49,Male,PhD,Divorced,72799.0,688.0,45713.0,Business,Unemployed,19,...,Monthly,Rarely,Yes,Retirement,More than 10 years,Intermediate,0,1,2,0
1,57,Female,Bachelor's,Widowed,,690.0,33835.0,Auto,Employed,6,...,Monthly,Occasionally,No,Retirement,More than 10 years,Intermediate,0,2,2,1
2,21,Non-binary,Master's,Single,55687.0,600.0,36623.0,Home,Employed,8,...,Monthly,Occasionally,Yes,General wealth building,More than 10 years,Beginner,0,0,1,1
3,59,Male,Bachelor's,Single,26508.0,622.0,26541.0,Personal,Unemployed,2,...,Weekly,Rarely,Yes,Retirement,More than 10 years,Advanced,0,2,3,1
4,25,Non-binary,Bachelor's,Widowed,49427.0,766.0,36528.0,Personal,Unemployed,10,...,Monthly,Occasionally,Yes,General wealth building,More than 10 years,Intermediate,0,0,2,0


In [23]:
# Mapping Kaggle columns to new names, including risk_rating
kaggle_to_survey_mapping = {
    'Gender': 'gender',
    'Employment Status': 'occupation',
    'Education Level': 'education_level',
    'Marital Status': 'marital_status',
    'has_loan': 'has_loan',
    'Years at Current Job': 'job_tenure',
    'Number of Dependents': 'number_of_dependents',
    'risk_preference': 'risk_preference',
    'reaction_to_loss': 'reaction_to_loss',
    'financial_knowledge_level': 'financial_knowledge_level',
    'Credit Score': 'credit_score',
    'Loan Purpose': 'loan_purpose',
    'Payment History': 'payment_history',
    'Debt-to-Income Ratio': 'debt_to_income_ratio',
    'Risk Rating': 'risk_rating'
}

# List of columns to include (in desired order)
final_cols = [
    'age_bucket', 'income_bucket', 'gender', 'occupation', 'education_level', 'marital_status','has_loan', 'risk_preference', 'reaction_to_loss', 'financial_knowledge_level', 'credit_score','job_tenure','number_of_dependents',
    'loan_purpose', 'payment_history', 'debt_to_income_ratio', 'risk_preference_num','reaction_to_loss_num', 'financial_knowledge_num','saving_frequency','investment_frequency','has_insurance','financial_goal','time_horizon', 'risk_rating', 'risk_rating_num'
]

# Rename and select columns
kaggle_df_renamed = kaggle_df.rename(columns=kaggle_to_survey_mapping)[final_cols]

# Save to CSV
kaggle_df_renamed.to_csv(r"C:\Users\manod\Desktop\Capstone\Data\Financial_risk_assesement_final.csv", index=False)

In [47]:
kaggle_df_renamed.head()

Unnamed: 0,age_bucket,income_bucket,gender,occupation,education_level,marital_status,has_loan,risk_preference,reaction_to_loss,financial_knowledge_level,...,risk_preference_num,reaction_to_loss_num,financial_knowledge_num,saving_frequency,investment_frequency,has_insurance,financial_goal,time_horizon,risk_rating,risk_rating_num
0,45-54,">70,000",Male,Unemployed,PhD,Divorced,1,Low Risk,Sell everything,Very poor,...,0,,0,Monthly,Regularly,Yes,Retirement,More than 10 years,Low,0
1,55-64,"30,000–50,000",Female,Employed,Bachelor's,Widowed,1,Low Risk,Sell everything,Very poor,...,0,,0,Weekly,Occasionally,Yes,Retirement,3 - 10 years,Medium,1
2,18-24,"50,000–70,000",Non-binary,Employed,Master's,Single,1,Low Risk,Hold,Beginner,...,0,,1,Monthly,Regularly,No,General wealth building,More than 10 years,Medium,1
3,55-64,"<30,000",Male,Unemployed,Bachelor's,Single,1,Low Risk,Invest more,Advanced,...,0,,3,Rarely,Regularly,Yes,Retirement,More than 10 years,Medium,1
4,25-34,"30,000–50,000",Non-binary,Unemployed,Bachelor's,Widowed,1,Low Risk,Sell everything,Intermediate,...,0,,2,Monthly,Regularly,Yes,Travel,More than 10 years,Low,0


In [215]:
# Concatenate
final_df = pd.concat([kaggle_df_renamed, survey_df_renamed], ignore_index=True)

NameError: name 'survey_df_renamed' is not defined

In [None]:
final_df.head()

In [None]:
final_df.to_csv(r"C:\Users\manod\Desktop\Capstone\Data\final_dataset ffs.csv", index=False)