In [1]:
import pandas as pd

# Load the dataset from the working directory; `df` is the frame the
# later training cell reads its features and targets from.
df = pd.read_csv('cimsDB.csv')

# Distinct values found in the 'formateur' column.
unique_formateurs = df['formateur'].unique()
print(unique_formateurs)

['Noor Al-Harbi' 'Hassan Al-Subhi' 'Amin Al-Juaied' 'Aisha Al-Shammari'
 'Fatima Al-Jamal' 'Omar Al-Ghanim' 'Mohammed Al-Badrani'
 'Youssef Al-Abdullah' 'Sara Al-Otaibi' 'Layla Al-Khalid']


In [2]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# ---------------------------------------------------------------------------
# Feature definitions
# ---------------------------------------------------------------------------
# Columns of `df` (loaded in the first cell) that are fed to both models.
all_features = ['theme', 'formateur', 'knowledge', 'professionalism', 'communication',
                'relevence', 'structure', 'duration', 'clarity', 'environment', 'typef']

# Columns that get one-hot encoded; every other feature is passed through as-is.
categorical_features = ['theme', 'formateur']
numerical_features = [feature for feature in all_features if feature not in categorical_features]

# Unfitted preprocessing *template*. It is never fitted directly: each pipeline
# receives its own clone (see build_model) so that fitting one model cannot
# overwrite the encoder state used by the other.
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': categories unseen during fit do not raise at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'  # Keep the rest of the features as they are
)

# Fail early, with a readable message, if a target column is missing.
for target_column in ('stars_rating', 'overall_score'):
    assert target_column in df.columns, f"Column '{target_column}' not found in df"

# Targets and design matrix.
y_stars_rating = df['stars_rating']
y_overall_score = df['overall_score']
X = df[all_features]


def build_model():
    """Return a fresh, unfitted pipeline: one-hot preprocessing + random forest.

    The preprocessor is cloned from the module-level template so every
    pipeline owns an independent transformer instance.
    """
    return Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))  # Example parameters for regression
    ])


def fit_and_report(model, y, title):
    """Split X/y 80/20, fit `model` on the training part and print test metrics.

    Parameters
    ----------
    model : unfitted estimator exposing fit/predict (fitted in place).
    y : target values aligned with the module-level `X`.
    title : heading printed before the metrics.

    Returns
    -------
    (X_train, X_test, y_train, y_test, y_pred) for the split that was used.
    """
    # Fixed random_state keeps the split reproducible (and identical for both targets).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(title)
    print("R2 Score:", r2_score(y_test, y_pred))
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    return X_train, X_test, y_train, y_test, y_pred


# ---------------------------------------------------------------------------
# Train and evaluate one model per target
# ---------------------------------------------------------------------------
model_stars_rating = build_model()
model_overall_score = build_model()

X_train_sr, X_test_sr, y_train_sr, y_test_sr, y_pred_sr = fit_and_report(
    model_stars_rating, y_stars_rating, "Stars Rating Model:")

X_train_os, X_test_os, y_train_os, y_test_os, y_pred_os = fit_and_report(
    model_overall_score, y_overall_score, "\nOverall Score Model:")

# ---------------------------------------------------------------------------
# Predict on new data
# ---------------------------------------------------------------------------
# Features to be used during prediction
# (kept for reference; nothing in this cell reads this list).
prediction_features = ['formateur', 'duration', 'knowledge', 'typef']

# Example new data for prediction
# NOTE(review): if 'A' / 'B' are not themes present in the training data, the
# encoder's handle_unknown='ignore' encodes them as all-zero columns — confirm
# these are the intended example values.
new_data = pd.DataFrame({
    'formateur': ['Hassan Al-Subhi', 'Noor Al-Harbi'],
    'duration': [2, 3],
    'knowledge': [4, 3],
    'theme': ['A', 'B'],
    'typef': [0, 1]  # Assuming typef is binary (0 or 1)
})

# Fallback values for feature columns the caller did not supply.
# NOTE(review): these constants are carried over unchanged; they are presumably
# the column means of the dataset — consider computing them from the training
# split instead of hard-coding them.
default_feature_values = {
    'professionalism': 2.2000666888962987,
    'communication': 2.2147382460820273,
    'relevence': 1.9896632210736913,
    'structure': 1.991997332444148,
    'clarity': 2.0106702234078027,
    'environment': 2.2517505835278424,
}
for column, fallback in default_feature_values.items():
    if column not in new_data.columns:
        new_data[column] = fallback

# Select the training features by name, in the training column order.
new_data_all_features = new_data[all_features]

# Predict on new data for stars_rating
new_predictions_sr = model_stars_rating.predict(new_data_all_features)
print("\nPredictions for new data (Stars Rating):\n", new_predictions_sr)
# Predict on new data for overall_score
new_predictions_os = model_overall_score.predict(new_data_all_features)
print("Predictions for new data (Overall Score):\n", new_predictions_os)


Stars Rating Model:
R2 Score: 0.6022327382240835
Mean Squared Error: 1.05272973010708

Overall Score Model:
R2 Score: 0.7157770813360589
Mean Squared Error: 0.18425145456443687

Predictions for new data (Stars Rating):
 [1.53 3.61]
Predictions for new data (Overall Score):
 [2.69 2.35]


In [3]:
# Persist both pipelines (fitted in the training cell) to the working directory with joblib.
from joblib import dump
dump(model_stars_rating, 'model_stars_rating.joblib')
# Last expression of the cell: its return value is what the notebook echoes as the cell output.
dump(model_overall_score, 'model_overall_score.joblib')

['model_overall_score.joblib']

In [4]:
# NOTE(review): this repeats the stars-rating dump from the previous cell (same object,
# same filename), so it only overwrites that file — looks redundant unless the model was
# refitted in between; confirm and consider removing this cell.
from joblib import dump
dump(model_stars_rating, 'model_stars_rating.joblib')

['model_stars_rating.joblib']

In [5]:
# Print the scikit-learn version of the running kernel.
# NOTE(review): presumably recorded because the joblib files saved above should be
# loaded with a matching scikit-learn version — confirm.
import sklearn
print(sklearn.__version__)



1.4.2
