In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import streamlit as st
import pickle

# Loading  dataset
file_path = 'Industry_accident_dataset.csv'  # Replace with your file path
df = pd.read_csv(file_path)




# Clean the column names
df.columns = df.columns.str.strip()

# Extract relevant columns
df = df[['Company', 'Month', 'Accident type']]

# Add a column for the number of accidents
df['Accident Count'] = 1

# Group by Company and Month to get the number of accidents
df_grouped = df.groupby(['Company', 'Month']).size().reset_index(name='Accident Count')

# Encode categorical variables
label_encoder_company = LabelEncoder()
label_encoder_month = LabelEncoder()

df_grouped['Company'] = label_encoder_company.fit_transform(df_grouped['Company'])
df_grouped['Month'] = label_encoder_month.fit_transform(df_grouped['Month'])

# Standardize the features
scaler = StandardScaler()
df_grouped[['Company', 'Month']] = scaler.fit_transform(df_grouped[['Company', 'Month']])

# Features and target
X = df_grouped[['Company', 'Month']]
y = df_grouped['Accident Count']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model selection and hyperparameter tuning
# Use GridSearchCV to find the best model and hyperparameters
models = {
    'RandomForest': RandomForestRegressor(),
    'GradientBoosting': GradientBoostingRegressor()
}

params = {
    'RandomForest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    'GradientBoosting': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }
}

# Finding the best model
best_model = None
best_mse = float('inf')
for model_name in models:
    model = GridSearchCV(models[model_name], params[model_name], scoring='neg_mean_squared_error', cv=5)
    model.fit(X_train, y_train)
    if -model.best_score_ < best_mse:
        best_mse = -model.best_score_
        best_model = model.best_estimator_

# Save the trained model to a pickle file
with open('best_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Save the label encoders and scaler
with open('label_encoder_company.pkl', 'wb') as le_company_file:
    pickle.dump(label_encoder_company, le_company_file)

with open('label_encoder_month.pkl', 'wb') as le_month_file:
    pickle.dump(label_encoder_month, le_month_file)

with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Load the model, encoders, and scaler (just for verification)
# with open('best_model.pkl', 'rb') as model_file:
#     loaded_model = pickle.load(model_file)

# with open('label_encoder_company.pkl', 'rb') as le_company_file:
#     loaded_label_encoder_company = pickle.load(le_company_file)

# with open('label_encoder_month.pkl', 'rb') as le_month_file:
#     loaded_label_encoder_month = pickle.load(le_month_file)

# with open('scaler.pkl', 'rb') as scaler_file:
#     loaded_scaler = pickle.load(scaler_file)

# Predict on test set
y_pred = best_model.predict(X_test)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
st.write(f'Mean Squared Error: {mse}')

# Streamlit interface
st.title('Accident Prediction App')

# Get list of unique companies and months
unique_companies = df['Company'].unique()
unique_months = df['Month'].unique()

# Company selection
selected_company = st.selectbox('Select Company:', unique_companies)
encoded_company = label_encoder_company.transform([selected_company])[0]
encoded_company = scaler.transform([[encoded_company, 0]])[0][0]

# Month selection
selected_month = st.selectbox('Select Month:', unique_months)
encoded_month = label_encoder_month.transform([selected_month])[0]
encoded_month = scaler.transform([[0, encoded_month]])[0][1]

# Predict the number of accidents
input_data = [[encoded_company, encoded_month]]
predicted_accidents = best_model.predict(input_data)[0]

# Display the prediction
st.write(f'Predicted number of accidents for {selected_company} in {selected_month}: {predicted_accidents:.2f}')

# Visualize actual vs predicted values
y_pred = best_model.predict(X_test)
st.write('Actual vs Predicted Number of Accidents')
st.line_chart({'Actual': y_test.values, 'Predicted': y_pred})


KeyError: "None of [Index(['Company', 'Month', 'Accident type'], dtype='object')] are in the [columns]"