In [13]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load the data
data = pd.read_csv('ai_event_prediction.csv')

# Strip extra spaces from column names
data.columns = data.columns.str.strip()

# Convert 'Date' column to datetime format
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')

# Handle missing values (if any)
data.fillna(method='ffill', inplace=True)  # Forward fill for simplicity

# Convert categorical columns to numerical format using Label Encoding
label_encoder_location = LabelEncoder()
data['Location'] = label_encoder_location.fit_transform(data['Location'])
label_encoder_weather = LabelEncoder()
data['Weather Conditions'] = label_encoder_weather.fit_transform(data['Weather Conditions'])

# Normalize/scale the numerical columns
scaler = StandardScaler()
numerical_columns = data.columns.drop(['Date', 'Event-Specific Factors'])
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Categorize the target variable into three classes
# Example rules (these should be adjusted based on the actual data)
data['Event-Specific Factors'] = pd.cut(
    data['Event-Specific Factors'],
    bins=[-float('inf'), 0.33, 0.66, float('inf')],
    labels=['huge population attended', 'small population attended', 'moderate population attended']
)

# Split the data into features and target
X = data.drop(['Event-Specific Factors', 'Date'], axis=1)
y = data['Event-Specific Factors']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define class weights
class_weights = {'huge population attended': 0.7, 'moderate population attended': 0.7, 'small population attended': 0.7}

# Train a RandomForestClassifier with class weights
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight=class_weights)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model using cross-validation
scores = cross_val_score(model, X, y, cv=5)  # 5-fold cross-validation
accuracy = scores.mean()

print(f'Cross-validated Accuracy: {accuracy:.2f}')

# Save the model and preprocessing objects
joblib.dump(model, 'event_prediction_model.pkl')
joblib.dump(label_encoder_location, 'label_encoder_location.pkl')
joblib.dump(label_encoder_weather, 'label_encoder_weather.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(X.columns.tolist(), 'feature_columns.pkl')


  data.fillna(method='ffill', inplace=True)  # Forward fill for simplicity


Cross-validated Accuracy: 0.33


['feature_columns.pkl']