# Data Loading

In [None]:
import pandas as pd

# Location of the raw glass CSV (hosted on GitHub)
url = "https://raw.githubusercontent.com/dsrscientist/dataset3/main/glass.csv"

# Read the CSV straight from the URL into a DataFrame.
# NOTE(review): read_csv treats the first row as the header by default; later
# cells expect a column named "Type of glass" -- confirm the file provides it.
glass_df = pd.read_csv(filepath_or_buffer=url)

# Preview the leading rows as a quick sanity check of the load
print(glass_df.head())


# Data Exploration

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset into a Pandas DataFrame
url = "https://raw.githubusercontent.com/dsrscientist/dataset3/main/glass.csv"
glass_df = pd.read_csv(url)

# Display basic information about the dataset.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
print("Dataset Info:")
glass_df.info()

# Display summary statistics (count, mean, std, quartiles) per column
print("\nSummary Statistics:")
print(glass_df.describe())

# Count missing values per column
print("\nMissing Values:")
print(glass_df.isnull().sum())

# Visualize how many samples fall into each class of the target variable
plt.figure(figsize=(8, 6))
sns.countplot(x="Type of glass", data=glass_df)
plt.title("Distribution of Glass Types")
plt.show()


# Data Preprocessing

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load the dataset into a Pandas DataFrame
url = "https://raw.githubusercontent.com/dsrscientist/dataset3/main/glass.csv"
glass_df = pd.read_csv(url)

# Check for missing values
print("Missing Values:")
print(glass_df.isnull().sum())

# Column layout used throughout this notebook: the first column is the row Id,
# the last column is the target ('Type of glass'); everything in between is a
# feature.
feature_columns = glass_df.columns[1:-1]

# Handle missing values (if any) by filling each FEATURE with its column mean.
# The Id and the target are deliberately left out of the imputer: averaging a
# class label would invent fractional, meaningless classes.
imputer = SimpleImputer(strategy='mean')
features_imputed = pd.DataFrame(
    imputer.fit_transform(glass_df[feature_columns]),
    columns=feature_columns,
    index=glass_df.index,
)

# Same shape/columns as the raw frame, with only the feature values imputed
glass_df_imputed = glass_df.copy()
glass_df_imputed[feature_columns] = features_imputed

# Check for outliers. Only features are plotted: the Id and the class label are
# not measurements and would only distort the shared axis.
plt.figure(figsize=(12, 8))
sns.boxplot(data=glass_df_imputed[feature_columns], orient="h")
plt.title("Boxplot of Features (After Imputation)")
plt.show()

# Feature scaling using StandardScaler (zero mean, unit variance per feature)
scaler = StandardScaler()
glass_df_scaled = pd.DataFrame(
    scaler.fit_transform(glass_df_imputed[feature_columns]),
    columns=feature_columns,
)

# Display the first few rows of the scaled dataset
print("\nScaled Dataset:")
print(glass_df_scaled.head())


# Data Visualization

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the glass data for plotting
url = "https://raw.githubusercontent.com/dsrscientist/dataset3/main/glass.csv"
glass_df = pd.read_csv(url)

# One histogram panel per feature column (first and last columns skipped),
# coloured by glass type and laid out on a 3x3 grid
plt.figure(figsize=(16, 12))
for panel, feature in enumerate(glass_df.columns[1:-1], start=1):
    plt.subplot(3, 3, panel)
    sns.histplot(
        data=glass_df,
        x=feature,
        hue="Type of glass",
        bins=20,
        kde=True,
        palette="viridis",
    )
    plt.title(f"Distribution of {feature}")
    plt.xlabel(feature)
    plt.ylabel("Count")
plt.tight_layout()
plt.show()

# Pairwise correlations between all columns of the frame
correlation_matrix = glass_df.corr()

# Render the correlations as an annotated heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
)
plt.title("Correlation Matrix")
plt.show()

# Scatter-matrix of every column pair, coloured by glass type
sns.pairplot(glass_df, hue="Type of glass", palette="viridis")
plt.suptitle("Pair Plot of Features", y=1.02)
plt.show()


# Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Load the dataset into a Pandas DataFrame
url = "https://raw.githubusercontent.com/dsrscientist/dataset3/main/glass.csv"
glass_df = pd.read_csv(url)

# Separate features (X) and target variable (y)
X = glass_df.iloc[:, 1:-1]  # Features (excluding the 'Id' and 'Type of glass' columns)
y = glass_df['Type of glass']  # Target variable

# Split the dataset into training and testing sets (80% training, 20% testing).
# stratify=y keeps the class proportions of the multi-class target the same in
# both splits, so small classes are not dropped from the test set by chance.
# NOTE(review): stratification requires at least 2 samples per class -- confirm
# this holds for the dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Display the shapes of the resulting sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)


# Model Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Baseline model: a random forest with a fixed seed, fitted on the training split
# (fit() returns the estimator itself, so construction and training are chained)
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out test split
y_pred = rf_classifier.predict(X_test)

# Score the predictions against the true test labels
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report: overall accuracy, then the confusion matrix, then per-class metrics
print(f"Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", class_report, sep="\n")


# Model Training

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fresh random forest with a fixed seed so the training run is repeatable
rf_classifier = RandomForestClassifier(random_state=42)

# Fit on the training split; left as the final expression so the notebook
# still displays the fitted estimator as the cell output
rf_classifier.fit(X=X_train, y=y_train)


# Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

# Sorted class labels that actually occur in the true or predicted test labels.
# This is the row/column order of the confusion matrix, so the same list is
# used for the matrix and for the heatmap ticks. (Taking .unique() over the
# whole dataset instead yields appearance order and may include classes absent
# from the test split, which mislabels or mismatches the axes.)
class_labels = sorted(set(y_test) | set(y_pred))

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
class_report = classification_report(y_test, y_pred)

# Display the results
print(f"Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

# Visualize the confusion matrix (rows = actual class, columns = predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values; the grid search tries every combination
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Untrained random forest used as the template estimator for the search
rf_classifier = RandomForestClassifier(random_state=42)

# Exhaustive search scored by accuracy with 5-fold cross-validation on the
# training split, using all available CPU cores
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning configuration and the estimator fitted with it
best_rf_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Score the tuned model on the held-out test split
y_pred_best = best_rf_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)

# Report the chosen hyperparameters and the resulting test accuracy
print("Best Parameters:", best_params)
print(f"Accuracy with Best Model: {accuracy_best:.2f}")
