 **Task 8: Feature Engineering & Model Tuning + Fraud Detection**

In [1]:
!pip install scikit-learn

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    mean_squared_error, r2_score,
    precision_score, recall_score, f1_score
)

# ==========================
# PART 1: Feature Engineering & Model Tuning (Student Scores Dataset)
# ==========================
#
# Workflow of this section:
#   1. Upload a CSV through the Colab file widget and load it with pandas.
#   2. Fit a linear regression of 'Scores' on 'Hours' as a baseline.
#   3. Grid-search a random forest regressor and report its test-set R².

print("===== Part 1: Feature Engineering & Model Tuning =====")

# The Colab upload helper is imported here, right before its first use.
from google.colab import files
print(" Please upload student_scores.csv")
uploaded = files.upload()

# The upload result is keyed by file name; load the first uploaded file.
file_name = list(uploaded)[0]
df = pd.read_csv(file_name)

print("\nDataset Sample:\n", df.head())

# Per-column count of missing entries.
print("\nMissing Values:\n", df.isna().sum())

# Single predictor ('Hours', kept two-dimensional) and the target ('Scores').
X = df.loc[:, ['Hours']]
y = df.loc[:, 'Scores']

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"\nTraining samples: {len(X_train)}, Testing samples: {len(X_test)}")

# Baseline: ordinary least squares fitted on the training split.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Test-set error (RMSE) and goodness of fit (R²) for the baseline.
mse = mean_squared_error(y_test, y_pred_lr)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_lr)
print("\nLinear Regression Performance:")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.2f}")

# Random forest tuned with 5-fold cross-validation, scored by R², over a
# grid of tree counts and maximum depths.
rf_model = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
}
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
).fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out rows.
best_model = grid_search.best_estimator_
y_pred_rf = best_model.predict(X_test)
rf_r2 = r2_score(y_test, y_pred_rf)

print("\nBest Parameters from GridSearchCV:", grid_search.best_params_)
print(f"Random Forest - Optimized R²: {rf_r2:.2f}")

# ==========================
#  PART 2: Fraud Detection with Decision Trees
# ==========================
#
# Workflow of this section:
#   1. Upload the fraud CSV through the Colab file widget and load it.
#   2. Handle missing values, encode 'TransactionType' as integers.
#   3. Train a decision tree on 'Amount' and 'TransactionType' to predict
#      'Fraud', then report precision, recall and F1 on a held-out split.

print("\n\n===== Part 2: Fraud Detection with Decision Trees =====")

print(" Please upload fraud_detection.csv")
uploaded = files.upload()

# Load fraud dataset (the upload result is keyed by file name)
file_name = list(uploaded.keys())[0]
df_fraud = pd.read_csv(file_name)

print("\nFraud Dataset Sample:\n", df_fraud.head())

# Check missing values
print("\nMissing Values:\n", df_fraud.isnull().sum())

# Rows without a label cannot be used for supervised training or scoring.
# .copy() makes the column assignment below operate on an independent frame.
df_fraud = df_fraud.dropna(subset=['Fraud']).copy()

# Encode categorical variable (TransactionType). Missing categories are first
# given an explicit 'Unknown' label so the encoder only ever sees strings.
df_fraud['TransactionType'] = df_fraud['TransactionType'].fillna('Unknown')
encoder = LabelEncoder()
df_fraud['TransactionType'] = encoder.fit_transform(df_fraud['TransactionType'])

# Features and target
X = df_fraud[['Amount', 'TransactionType']]
y = df_fraud['Fraud']

# Train/test split. Stratify on the label so both splits keep the same class
# ratio; this is skipped when some class has fewer than two rows, because
# stratified splitting would then raise.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)
print(f"\nTraining samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")

# 'Amount' contains missing values (visible in the dataset sample). Impute
# them with the median of the training split only, so no information from
# the test rows leaks into training.
amount_median = X_train['Amount'].median()
X_train = X_train.fillna({'Amount': amount_median})
X_test = X_test.fillna({'Amount': amount_median})

# Train Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

# Evaluate Decision Tree. zero_division=0 reports 0 instead of emitting a
# warning when a score's denominator is empty (e.g. no positive predictions).
precision = precision_score(y_test, y_pred_dt, zero_division=0)
recall = recall_score(y_test, y_pred_dt, zero_division=0)
f1 = f1_score(y_test, y_pred_dt, zero_division=0)

print("\nDecision Tree Performance:")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

===== Part 1: Feature Engineering & Model Tuning =====
 Please upload student_scores.csv


Saving student_scores.csv to student_scores.csv

Dataset Sample:
    Hours  Scores
0    2.5      21
1    5.1      47
2    3.2      27
3    8.5      75
4    3.5      30

Missing Values:
 Hours     0
Scores    0
dtype: int64

Training samples: 20, Testing samples: 5

Linear Regression Performance:
RMSE: 4.35
R²: 0.97

Best Parameters from GridSearchCV: {'max_depth': None, 'n_estimators': 50}
Random Forest - Optimized R²: 0.98


===== Part 2: Fraud Detection with Decision Trees =====
 Please upload fraud_detection.csv


Saving fraud_detection.csv to fraud_detection.csv

Fraud Dataset Sample:
    TransactionID  CustomerID       Amount TransactionType     Location  \
0              1        4174  3623.044171      Withdrawal      Houston   
1              2        4507          NaN         Deposit        Miami   
2              3        1860  8760.570892        Purchase      Houston   
3              4        2294  7833.768690        Purchase        Miami   
4              5        2130  1689.499795        Purchase  Los Angeles   

  DeviceType  TimeOfDay  PreviousFraud  TransactionSpeed  Fraud  
0     Laptop    Morning              0         18.002612      0  
1     Mobile    Morning              0         20.276396      0  
2        ATM    Morning              0         39.389780      0  
3     Laptop    Morning              0         58.910737      0  
4     Mobile  Afternoon              0         13.060114      0  

Missing Values:
 TransactionID         0
CustomerID            0
Amount             