# In [3]:
import os
import pickle

import pandas as pd
import snowflake.connector
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Open the Snowflake session used to load the training data.
#
# The user name and password are read from the environment so that no secret
# is stored in the source; a missing variable raises KeyError naming it.
# The remaining settings fall back to the values this script has always used
# and can be overridden through the matching SNOWFLAKE_* variable.
# NOTE(review): ACCOUNTADMIN looks broader than a read-only SELECT needs --
# confirm whether a narrower role can be used.
try:
    conn = snowflake.connector.connect(
        user=os.environ["SNOWFLAKE_USER"],
        password=os.environ["SNOWFLAKE_PASSWORD"],
        account=os.environ.get("SNOWFLAKE_ACCOUNT", "rh57156.central-india.azure"),
        warehouse=os.environ.get("SNOWFLAKE_WAREHOUSE", "COMPUTE_WH"),
        database=os.environ.get("SNOWFLAKE_DATABASE", "Main_Project"),
        schema=os.environ.get("SNOWFLAKE_SCHEMA", "Main_Project_Schema_Mart"),
        role=os.environ.get("SNOWFLAKE_ROLE", "ACCOUNTADMIN"),
    )
except snowflake.connector.errors.DatabaseError as e:
    # Print connection failure message
    print(f"Failed to connect to Snowflake: {e}")
    # Re-raise: every later step needs `conn`, so continuing would only turn
    # this error into a confusing NameError further down.
    raise
else:
    # Print connection success message
    print("Connected to Snowflake successfully!")

# Load the whole TOTALINFO table into a pandas DataFrame.
#
# The cursor and the connection are released in `finally` blocks so that a
# failing query does not leave the Snowflake session open.
sql_query = "SELECT * FROM TOTALINFO"

try:
    cur = conn.cursor()
    try:
        # Execute SQL query to fetch data
        cur.execute(sql_query)
        data = cur.fetchall()
        # The first item of each cursor.description entry is the column name.
        columns = [col[0] for col in cur.description]
    finally:
        cur.close()
finally:
    conn.close()

# Build the DataFrame only after the connection has been released.
df = pd.DataFrame(data, columns=columns)

# Preprocess the DataFrame: fill gaps in the numeric columns, clean the
# specialization text, and select the columns used for modelling.

# Missing values are replaced by the column mean, then the column is cast to int.
for numeric_col in ('SCOREACHIEVEDINQUIZ', 'ASSESSMENT_COMPLETION_TIME_IN_HOURS'):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].mean()).astype(int)

# Remove '[', ']' and ',' characters from the specialization strings.
df['SPECIALIZATION'] = df['SPECIALIZATION'].str.replace(r"[\[\],]", '', regex=True)

# .copy() makes df1 an independent frame, so later column assignments on df1
# neither raise pandas' SettingWithCopyWarning nor depend on view/copy rules.
df1 = df[[
    'GENDER',
    'SPECIALIZATION',
    'EXPERIENCE',
    'DIFFICULTYLEVEL',
    'TRAININGNAME',
    'OPTIMIZEDDURATION',
    'SCOREACHIEVEDINQUIZ',
    'ASSESSMENT_COMPLETION_TIME_IN_HOURS',
]].copy()

# Encode categorical variables BEFORE extracting X.
# Selecting columns with a list returns a copy, so an X taken before this loop
# would still hold the raw strings and the mean imputer further down would
# fail on non-numeric data.
label_encoders = {}
for col in ['GENDER', 'SPECIALIZATION', 'DIFFICULTYLEVEL', 'TRAININGNAME']:
    le = LabelEncoder()
    df1[col] = le.fit_transform(df1[col])
    # Keep each fitted encoder so its label mapping stays available by column name.
    label_encoders[col] = le

# Separate features and target variable (now taken from the encoded frame)
X = df1[['GENDER', 'SPECIALIZATION', 'EXPERIENCE', 'DIFFICULTYLEVEL', 'TRAININGNAME', 'OPTIMIZEDDURATION']]
y = df1['ASSESSMENT_COMPLETION_TIME_IN_HOURS']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the per-column means from the training rows only, then use those same
# means to fill missing values in both the training and the test features.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Fit a 100-tree random forest (seeded for repeatability) on the imputed
# training features; fit() returns the estimator itself.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train_imputed, y_train)

# Predict the target for the held-out rows.
y_pred = rf_model.predict(X_test_imputed)

# Score the held-out predictions with both metrics in one pass.
mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, r2_score)
)

# Report the results.
for line in (
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R2) Score: {r2}",
):
    print(line)

