In [3]:
import pandas as pd

# 1. Load
data = pd.read_csv(r"C:\Users\lenovo\Desktop\introtalent\Python\Data Files used in Projects\Investment.csv")

# 2. Identify numeric-but-categorical columns
suspect_cols = [
    col for col 
    in data.select_dtypes(include=["int64", "float64"]).columns
    if data[col].nunique() < 20
]
print("Suspect categorical columns:", suspect_cols)

# 3. Convert them to category
for col in suspect_cols:
    data[col] = data[col].astype("category")

# 4. Drop missing rows (or apply imputation as needed)
data = data.dropna()

# 5. Encode features
X = pd.get_dummies(data.drop("Invested", axis=1), drop_first=True)

# 6. Encode target
data["Investment"] = data["Invested"].map({"Yes": 1, "No": 0})
y = data["Investment"]


Suspect categorical columns: ['previous', 'emp_var_rate', 'nr_employed']


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)


In [5]:
from sklearn.metrics import accuracy_score, classification_report

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2%}")

# Full metrics
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Model Accuracy: 91.21%

Classification Report:

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7295
           1       0.66      0.47      0.55       943

    accuracy                           0.91      8238
   macro avg       0.80      0.72      0.75      8238
weighted avg       0.90      0.91      0.91      8238



# ChatGPT Code

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [8]:
# Load data
ds = pd.read_csv(r'C:\Users\lenovo\Desktop\introtalent\Python\Data Files used in Projects\Investment.csv')

In [9]:
# Target encoding
ds['Invested'] = ds['Invested'].map({'Yes': 1, 'No': 0})

In [10]:
# Remove 'duration' to avoid data leakage
ds.drop(columns=['duration'], inplace=True)

In [11]:
# Transform 'pdays': 999 -> 'never', others -> 'contacted'
ds['pdays'] = ds['pdays'].apply(lambda x: 'never' if x == 999 else 'contacted')

In [12]:
# Bin 'previous' into categories: '0', '1', '2+'
ds['previous'] = ds['previous'].apply(lambda x: '0' if x == 0 else ('1' if x == 1 else '2+'))

In [13]:
# Bin 'campaign' into: '1', '2', '3+'
ds['campaign'] = ds['campaign'].apply(lambda x: '1' if x == 1 else ('2' if x == 2 else '3+'))

In [14]:
# Define features and target
X = ds.drop('Invested', axis=1)
y = ds['Invested']

In [15]:
# Identify categorical and numerical columns
cat_cols = X.select_dtypes(include='object').columns
num_cols = X.select_dtypes(exclude='object').columns

In [16]:
# Label encode categorical features
le = LabelEncoder()
for col in cat_cols:
    X[col] = le.fit_transform(X[col])

In [17]:
# Standard scale numerical features
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

In [18]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [19]:
# Train Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [20]:
# Predict and evaluate
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.2%}")
print("\nClassification Report:")
print(report)

Model Accuracy: 89.17%

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      7310
           1       0.54      0.28      0.37       928

    accuracy                           0.89      8238
   macro avg       0.73      0.62      0.65      8238
weighted avg       0.87      0.89      0.88      8238

