In [None]:
import numpy as np   
import pandas as pd  
import matplotlib.pyplot as plt  
import seaborn as sns

In [None]:
file_path=(r"C:\Users\mdirf\Downloads\Random Forest (1)\Random Forest\glass.xlsx")
df=pd.read_excel(file_path,sheet_name=1)
print(df)

In [None]:
print(df.info())
print(df.isnull().sum())
print(df.describe())
print("Number of duplicate rows:", df.duplicated().sum())
df=df.drop_duplicates()
print(df.info())

In [None]:
# Correlation matrix
corr = df.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

In [None]:
#  Boxplots to detect outliers
plt.figure(figsize=(15, 8))
df.boxplot()
plt.title("Boxplot for Outlier Detection")
plt.show()

In [None]:
from scipy.stats import zscore
import numpy as np

z_scores = np.abs(zscore(df.select_dtypes(include=['int64', 'float64'])))
outliers = (z_scores > 3).any(axis=1)
print("Number of outliers detected:", outliers.sum())


In [None]:
# 8. Pairplot for feature relationships
sns.pairplot(df, hue='Type', diag_kind='hist', corner=True)
plt.show()

In [None]:
# Histograms for all numerical features
df.hist(figsize=(12, 10), bins=20, color='skyblue')
plt.suptitle('Histograms of Numerical Features', fontsize=16)
plt.show()



In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [79]:
# Identify numerical and categorical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Define imputers
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', num_imputer, num_cols),
    ('cat', Pipeline(steps=[
        ('imputer', cat_imputer),
        ('onehot', OneHotEncoder(drop='first', dtype=float))
    ]), cat_cols)
])

df_processed_array = preprocessor.fit_transform(df)

all_features = preprocessor.get_feature_names_out()

df_processed = pd.DataFrame(df_processed_array, columns=all_features)

# Feature scaling
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_processed), columns=all_features)

# Handle class imbalance if 'target' exists
# if 'target' in df.columns:
#     X = df_scaled
#     y = df['target']
#     smote = SMOTE(random_state=42)
#     X_res, y_res = smote.fit_resample(X, y)
    
#     print("Original class distribution:\n", y.value_counts())
#     print("Resampled class distribution:\n", y_res.value_counts())
# else:
#     X_res = df_scaled
#     y_res = None


In [80]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE


In [81]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Assuming X_res and y_res are the processed features and target from previous steps
# If target is not resampled, use X_scaled and y

if y_res is not None:
    X_final, y_final = X_res, y_res
else:
    X_final, y_final = X_res, df['Type']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.2, random_state=42, stratify=y_final
)

# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}\n")
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000

Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        15
           3       1.00      1.00      1.00         3
           5       1.00      1.00      1.00         3
           6       1.00      1.00      1.00         2
           7       1.00      1.00      1.00         6

    accuracy                           1.00        43
   macro avg       1.00      1.00      1.00        43
weighted avg       1.00      1.00      1.00        43



In [82]:
# Define target column
target_col = 'Type'

# Features and target
X = df.drop(columns=[target_col])
y = df[target_col]

# if you want to balance classes using SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Now do train-test split safely
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42, stratify=y_res
)


In [83]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


In [84]:
# Boosting

boosting_model = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1, random_state=42
)
boosting_model.fit(X_train, y_train)
y_pred_boost = boosting_model.predict(X_test)

print("Boosting Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_boost))
print("Precision:", precision_score(y_test, y_pred_boost, average='weighted'))
print("Recall:", recall_score(y_test, y_pred_boost, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred_boost, average='weighted'))
print(classification_report(y_test, y_pred_boost))


Boosting Results:
Accuracy: 0.8913043478260869
Precision: 0.9012376689806357
Recall: 0.8913043478260869
F1 Score: 0.8926734778383048
              precision    recall  f1-score   support

           1       0.67      0.80      0.73        15
           2       0.93      0.87      0.90        15
           3       0.92      0.73      0.81        15
           5       1.00      0.94      0.97        16
           6       1.00      1.00      1.00        16
           7       0.88      1.00      0.94        15

    accuracy                           0.89        92
   macro avg       0.90      0.89      0.89        92
weighted avg       0.90      0.89      0.89        92



In [None]:
# bagging

bagging_model = BaggingClassifier(
    estimator=RandomForestClassifier(n_estimators=100, random_state=42), 
    n_estimators=10,
    random_state=42
)
bagging_model.fit(X_train, y_train)
y_pred_bag = bagging_model.predict(X_test)

print("Bagging Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_bag))
print("Precision:", precision_score(y_test, y_pred_bag, average='weighted'))
print("Recall:", recall_score(y_test, y_pred_bag, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred_bag, average='weighted'))
print(classification_report(y_test, y_pred_bag))

Bagging Results:
Accuracy: 0.8913043478260869
Precision: 0.8959627329192547
Recall: 0.8913043478260869
F1 Score: 0.890432227678927
              precision    recall  f1-score   support

           1       0.75      0.80      0.77        15
           2       0.92      0.73      0.81        15
           3       0.93      0.87      0.90        15
           5       0.94      0.94      0.94        16
           6       1.00      1.00      1.00        16
           7       0.83      1.00      0.91        15

    accuracy                           0.89        92
   macro avg       0.89      0.89      0.89        92
weighted avg       0.90      0.89      0.89        92



In [None]:
# 1. Bagging (Bootstrap Aggregating)

# Bagging is an ensemble method that reduces variance by training multiple models on different random subsets 
# of the data (with replacement) and then aggregating their predictions.

# How it works:
# Randomly sample subsets of the training data with replacement (bootstrap samples).
# Train a base model (like Decision Tree or Random Forest) on each subset.
# Aggregate the predictions (majority vote for classification, average for regression).

# Goal: Reduce overfitting and increase stability.
# Example: Random Forest is a bagging method where multiple decision trees are trained on different samples.

In [None]:
# Boosting

# Boosting is an ensemble method that reduces bias by sequentially training models. Each model focuses on the errors made by the previous models.

# How it works:
# Train a weak learner (like a shallow tree).
# Identify the data points the model predicted incorrectly.
# Train the next model giving more weight to these misclassified points.
# Combine all modelsâ€™ predictions with weighted voting (classification) or weighted sum (regression).

# Goal: Improve model accuracy by focusing on mistakes.
# Example: Gradient Boosting, AdaBoost, XGBoost.

In [None]:
# 2. Handling Imbalanced Data

# Imbalanced data occurs when one class significantly outnumbers others (e.g., 90% vs 10%). 
# This can make models biased toward the majority class.

# Methods to handle imbalance
# Resampling Techniques
# Class Weights
# Anomaly Detection / Specialized Metrics
# Ensemble methods