In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset from the specified path
url = r"/content/50_Startups.csv"  # Fixing the path issue using raw string
df = pd.read_csv(url)

# Create target variable: Invest if profit > 120000
df['Invest'] = (df['Profit'] > 120000).astype(int)

# Encode categorical features ('State' column)
le = LabelEncoder()
df['State'] = le.fit_transform(df['State'])

# Feature matrix and target
X = df[['R&D Spend', 'Administration', 'Marketing Spend', 'State']]
y = df['Invest']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling for Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)
y_pred_log = log_reg.predict(X_test_scaled)
acc_log = accuracy_score(y_test, y_pred_log)

# Random Forest (no scaling needed)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)

# Output results
print("Logistic Regression Accuracy: {:.2f}%".format(acc_log * 100))
print("Random Forest Accuracy: {:.2f}%".format(acc_rf * 100))

# Compare and print best
if acc_log > acc_rf:
    print("Logistic Regression is better for this prediction task.")
elif acc_rf > acc_log:
    print("Random Forest is better for this prediction task.")
else:
    print("Both models perform equally well.")


Logistic Regression Accuracy: 80.00%
Random Forest Accuracy: 100.00%
Random Forest is better for this prediction task.


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset from the specified path
url = r"/content/50_Startups.csv"  # Ensure correct file path
df = pd.read_csv(url)

# Create target variable: Invest if profit > 120000
df['Invest'] = (df['Profit'] > 120000).astype(int)

# Check the class distribution
print(df['Invest'].value_counts())

# Encode categorical features ('State' column)
le = LabelEncoder()
df['State'] = le.fit_transform(df['State'])

# Feature matrix and target
X = df[['R&D Spend', 'Administration', 'Marketing Spend', 'State']]
y = df['Invest']

# Train/test split with shuffle=True to ensure randomness
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

# Feature scaling for Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)
y_pred_log = log_reg.predict(X_test_scaled)
acc_log = accuracy_score(y_test, y_pred_log)

# Random Forest (no scaling needed)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)

# Cross-validation to check model's stability
log_reg_cv_scores = cross_val_score(log_reg, X_train_scaled, y_train, cv=5)
rf_cv_scores = cross_val_score(rf, X_train, y_train, cv=5)

# Output results
print("Logistic Regression Accuracy: {:.2f}%".format(acc_log * 100))
print("Random Forest Accuracy: {:.2f}%".format(acc_rf * 100))

# Cross-validation results
print("Logistic Regression Cross-Validation Accuracy: {:.2f}%".format(log_reg_cv_scores.mean() * 100))
print("Random Forest Cross-Validation Accuracy: {:.2f}%".format(rf_cv_scores.mean() * 100))

# Compare and print best
if acc_log > acc_rf:
    print("Logistic Regression is better for this prediction task.")
elif acc_rf > acc_log:
    print("Random Forest is better for this prediction task.")
else:
    print("Both models perform equally well.")


Invest
0    30
1    20
Name: count, dtype: int64
Logistic Regression Accuracy: 80.00%
Random Forest Accuracy: 100.00%
Logistic Regression Cross-Validation Accuracy: 90.00%
Random Forest Cross-Validation Accuracy: 97.50%
Random Forest is better for this prediction task.
