In [25]:
import os
import pickle

import pandas as pd
import snowflake.connector
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the TOTALINFO table from Snowflake into the DataFrame `df`.
#
# Credentials are read from the environment (SNOWFLAKE_USER and
# SNOWFLAKE_PASSWORD are required) so that no secret is stored in this file.
# The remaining settings may be overridden through the environment and
# otherwise fall back to the values this script has always used.
# NOTE(review): a password was previously hard-coded here; it should be
# rotated. ACCOUNTADMIN also looks broader than a read-only SELECT needs;
# confirm whether a narrower role can be used.
try:
    connection_settings = {
        "user": os.environ["SNOWFLAKE_USER"],
        "password": os.environ["SNOWFLAKE_PASSWORD"],
        "account": os.environ.get("SNOWFLAKE_ACCOUNT", "rh57156.central-india.azure"),
        "warehouse": os.environ.get("SNOWFLAKE_WAREHOUSE", "COMPUTE_WH"),
        "database": os.environ.get("SNOWFLAKE_DATABASE", "Main_Project"),
        "schema": os.environ.get("SNOWFLAKE_SCHEMA", "Main_Project_Schema_Mart"),
        "role": os.environ.get("SNOWFLAKE_ROLE", "ACCOUNTADMIN"),
    }
except KeyError as missing_variable:
    raise RuntimeError(
        f"Missing required environment variable: {missing_variable}"
    ) from None

try:
    conn = snowflake.connector.connect(**connection_settings)
except snowflake.connector.errors.DatabaseError as e:
    # Report the failure, then re-raise: continuing without a connection
    # would only fail later with an unrelated NameError on `conn`.
    print(f"Failed to connect to Snowflake: {e}")
    raise
else:
    print("Connected to Snowflake successfully!")

# The nested try/finally guarantees the cursor and the connection are closed
# even when the query or the fetch raises.
try:
    cur = conn.cursor()
    try:
        # Execute SQL query to fetch data
        sql_query = "SELECT * FROM TOTALINFO"
        cur.execute(sql_query)

        # The first element of each cursor.description entry is the column name.
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]
    finally:
        cur.close()
finally:
    conn.close()

# Build the pandas DataFrame from the fetched rows
df = pd.DataFrame(data, columns=columns)

# Clean the raw columns: missing numeric entries take the column mean, and the
# filled column is then cast to int.
for numeric_column in ('SCOREACHIEVEDINQUIZ', 'ASSESSMENT_COMPLETION_TIME_IN_HOURS'):
    column_mean = df[numeric_column].mean()
    df[numeric_column] = df[numeric_column].fillna(column_mean).astype(int)

# Strip square brackets and commas out of the specialization text.
df['SPECIALIZATION'] = df['SPECIALIZATION'].str.replace(r"[\[\],]", '', regex=True)

# Columns used as model inputs, and the column the model predicts.
feature_names = [
    'GENDER',
    'SPECIALIZATION',
    'EXPERIENCE',
    'DIFFICULTYLEVEL',
    'TRAININGNAME',
    'OPTIMIZEDDURATION',
]
target_name = 'ASSESSMENT_COMPLETION_TIME_IN_HOURS'

# Working subset: the features, the quiz score, and the target.
df1 = df[feature_names + ['SCOREACHIEVEDINQUIZ', target_name]]

# Split into the feature matrix and the target vector.
X = df1[feature_names]
y = df1[target_name]

# One-hot encode the categorical feature columns.
X_encoded = pd.get_dummies(X)

# Replace any remaining NaN in the encoded features with the column mean,
# then restore the column labels that fit_transform discards.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(X_encoded)
X_imputed = pd.DataFrame(imputed_values, columns=X_encoded.columns)

# Fit a random forest with scikit-learn's default hyperparameters.
random_forest_regressor = RandomForestRegressor()
random_forest_regressor.fit(X_imputed, y)

# Persist the fitted estimator to disk as a pickle file.
filename = 'random_forest_regressor_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(random_forest_regressor, model_file)

print(f"Trained model saved as {filename}.")


Connected to Snowflake successfully!
Trained model saved as random_forest_regressor_model.pkl.
