# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
import joblib

# Train a RandomForest classification pipeline on a CSV dataset, persist it
# with joblib, reload it, and report accuracy on a held-out test split.


def _stringify_non_missing(frame, columns):
    """Return a copy of ``frame`` with non-missing values in ``columns`` cast to str.

    Missing entries are left untouched so that a downstream imputer can still
    detect them; a plain ``astype(str)`` would turn them into the literal
    string ``'nan'`` and hide them from imputation.

    Args:
        frame: DataFrame holding the columns to convert.
        columns: Iterable of column labels present in ``frame``.

    Returns:
        A new DataFrame; ``frame`` itself is not modified.
    """
    converted = frame.copy()
    for col in columns:
        as_object = converted[col].astype(object)
        # Keep the original value where it is missing, use its str form elsewhere.
        converted[col] = as_object.where(as_object.isna(), as_object.astype(str))
    return converted


# Load the dataset
file_path = '/content/Copy of CMH_OGS(1).csv'
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(df.head())

# The target is the column at positional index 28 (0-based), i.e. the 29th
# column of the file.
# NOTE(review): the previous comment called this the "28th column" -- confirm
# that index 28 is really the intended target.
target_col = df.columns[28]

# Drop rows where the target variable is missing
df.dropna(subset=[target_col], inplace=True)

# Identify features and target
X = df.drop(columns=[target_col])
y = df[target_col]

# Encode target if it is categorical; ``le`` stays None for other dtypes
le = None
if y.dtype == 'object':
    le = LabelEncoder()
    y = le.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Give categorical columns a uniform string type while keeping missing values
# missing, so the 'most_frequent' imputer below actually fills them.
# Rebinding to fresh copies also avoids assigning into the frames returned by
# train_test_split.
# NOTE: this conversion happens outside the saved pipeline, so anything that
# later loads 'model_pipeline.pkl' must apply the same conversion to its input.
X_train = _stringify_non_missing(X_train, categorical_cols)
X_test = _stringify_non_missing(X_test, categorical_cols)

# Preprocessing for numerical data with PCA
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95))  # Keep 95% of variance
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data.
# Columns matching neither dtype selection above are not listed in any
# transformer here.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Chain preprocessing and the classifier into a single estimator
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model)])

# Train the model
clf.fit(X_train, y_train)

# Save the pipeline and label encoder (None when the target was not encoded)
# to a .pkl file
joblib.dump({'pipeline': clf, 'label_encoder': le}, 'model_pipeline.pkl')

# Load the pipeline from the .pkl file to ensure it's saved correctly
loaded_objects = joblib.load('model_pipeline.pkl')
loaded_clf = loaded_objects['pipeline']
loaded_le = loaded_objects['label_encoder']

# Make predictions with the loaded model
y_pred = loaded_clf.predict(X_test)

# Evaluate the loaded model
accuracy = accuracy_score(y_test, y_pred)
print(f'Loaded Model Accuracy: {accuracy:.2f}')
