In [8]:
import pandas as pd
import numpy as np

# ---------------- LOAD DATA -----------------
df = pd.read_csv("/unclean_financial_stress_data.csv")

print("Before Cleaning:")
print(df.head())
print(df.info())

# --------------- STEP 1: REPLACE INVALID ENTRIES -----------------

# Replace '?' with NaN
df.replace("?", np.nan, inplace=True)

# Convert FX_Rate to numeric
df["FX_Rate"] = pd.to_numeric(df["FX_Rate"], errors="coerce")

# Convert Stress_Level to numeric
df["Stress_Level"] = pd.to_numeric(df["Stress_Level"], errors="coerce")

# --------------- STEP 2: HANDLE NEGATIVE VALUES -----------------

# Negative unemployment is invalid
df.loc[df["Unemployment_Rate"] < 0, "Unemployment_Rate"] = np.nan

# Negative portfolio exposure is invalid
df.loc[df["Portfolio_Exposure"] < 0, "Portfolio_Exposure"] = np.nan

# --------------- STEP 3: HANDLE OUTLIERS -----------------

# Define outlier removal using IQR
def remove_outliers(col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = np.where((df[col] < lower) | (df[col] > upper), np.nan, df[col])

numeric_cols = [
    "GDP_Growth","Inflation","Interest_Rate","Unemployment_Rate",
    "Market_Return","FX_Rate","Oil_Price","Portfolio_Exposure","Market_Volatility"
]

for col in numeric_cols:
    remove_outliers(col)

# --------------- STEP 4: HANDLE MISSING VALUES -----------------

# Fill numeric missing values with column mean
for col in numeric_cols:
    df[col].fillna(df[col].mean(), inplace=True)

# Fill missing stress levels with mode
df["Stress_Level"].fillna(df["Stress_Level"].mode()[0], inplace=True)

# --------------- STEP 5: FINAL CLEANUP -----------------

# Ensure correct data types
df = df.astype({
    "GDP_Growth": float,
    "Inflation": float,
    "Interest_Rate": float,
    "Unemployment_Rate": float,
    "Market_Return": float,
    "FX_Rate": float,
    "Oil_Price": float,
    "Portfolio_Exposure": float,
    "Market_Volatility": float,
    "Stress_Level": int
})

print("\nAfter Cleaning:")
print(df.head())
print(df.info())

# --------------- STEP 6: SAVE CLEANED DATASET -----------------
df.to_csv("cleaned_financial_stress_data.csv", index=False)
print("\nCleaned dataset saved as cleaned_financial_stress_data.csv")


Before Cleaning:
   GDP_Growth  Inflation  Interest_Rate  Unemployment_Rate  Market_Return  \
0    4.331587   4.133137       6.693578           8.141387            NaN   
1    3.715279   5.202744       4.170217           5.974250       0.938229   
2    1.454600   2.975247       6.288859           7.551565       6.225925   
3    2.991616   4.160399       6.653176           6.221437       0.273995   
4    3.621336   2.869525       3.547584           8.297059       6.644017   

             FX_Rate  Oil_Price  Portfolio_Exposure  Market_Volatility  \
0  88.05300890437982  87.059600              101244          21.234437   
1  81.43066564704242  87.229868               95413                NaN   
2  81.44232015450133  97.584729               43803          19.173247   
3  86.70203017130547  74.468907               -5000          18.703803   
4  86.73515605712606  73.939709               67906          21.412377   

  Stress_Level  
0          NaN  
1            1  
2            ?  
3      

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Stress_Level"].fillna(df["Stress_Level"].mode()[0], inplace=True)


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

df = pd.read_csv("/cleaned_financial_stress_data.csv")

X = df.drop("Stress_Level", axis=1)
y = df["Stress_Level"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("Logistic Regression Accuracy:", acc)


Logistic Regression Accuracy: 0.775


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

df = pd.read_csv("/cleaned_financial_stress_data.csv")

X = df.drop("Stress_Level", axis=1)
y = df["Stress_Level"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("Random Forest Accuracy:", acc)


Random Forest Accuracy: 0.7


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

df = pd.read_csv("/cleaned_financial_stress_data.csv")

X = df.drop("Stress_Level", axis=1)
y = df["Stress_Level"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = XGBClassifier(eval_metric='mlogloss')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("XGBoost Accuracy:", acc)


XGBoost Accuracy: 0.65


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

df = pd.read_csv("/content/cleaned_financial_stress_data.csv")

X = df.drop("Stress_Level", axis=1)
y = df["Stress_Level"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = SVC()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("SVM Accuracy:", acc)


SVM Accuracy: 0.775
