In [48]:
import pandas as pd
import snowflake.connector
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
import pickle

try:
    # Connect to Snowflake using environment variables
    conn = snowflake.connector.connect(
        user="kaustubh",
        password="@@Kaustubh123",
        account="rh57156.central-india.azure",
        warehouse="COMPUTE_WH",
        database="Main_Project",
        schema="Main_Project_Schema_Mart",
        role="ACCOUNTADMIN"
    )

    # Print connection success message
    print("Connected to Snowflake successfully!")

except snowflake.connector.errors.DatabaseError as e:
    # Print connection failure message
    print(f"Failed to connect to Snowflake: {e}")

# Fetch data into a pandas DataFrame
cur = conn.cursor()
sql_query = f"SELECT * FROM TOTALINFO"
cur.execute(sql_query)
data = cur.fetchall()
columns = [col[0] for col in cur.description]
df = pd.DataFrame(data, columns=columns)
cur.close()
conn.close()

# Preprocess the DataFrame
df['SCOREACHIEVEDINQUIZ'] = df['SCOREACHIEVEDINQUIZ'].fillna(df['SCOREACHIEVEDINQUIZ'].mean()).astype(int)
df['ASSESSMENT_COMPLETION_TIME_IN_HOURS'] = df['ASSESSMENT_COMPLETION_TIME_IN_HOURS'].fillna(df['ASSESSMENT_COMPLETION_TIME_IN_HOURS'].mean()).astype(int)
df['SPECIALIZATION'] = df['SPECIALIZATION'].str.replace(r"[\[\],]", '', regex=True)
df1 = df[['EXPERIENCE', 'OPTIMIZEDDURATION', 'SCOREACHIEVEDINQUIZ', 'ASSESSMENT_COMPLETION_TIME_IN_HOURS']]

# Separate features and target variable
X = df1[['EXPERIENCE', 'OPTIMIZEDDURATION']]
y = df1['SCOREACHIEVEDINQUIZ']

# Impute missing values in X using mean imputation
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
X_imputed = pd.DataFrame(X_imputed, columns=X.columns)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize KNN regressor
knn = KNeighborsRegressor(n_neighbors=5)

# Fit the model
knn.fit(X_train_scaled, y_train)

# Make predictions
y_pred = knn.predict(X_test_scaled)

# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate R-squared (accuracy for regression)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared (Accuracy for regression): {r2}")

# Save the trained model using pickle
with open('knn_regressor_model1.pkl', 'wb') as file:
    pickle.dump(knn, file)

print("KNN regressor model saved as 'knn_regressor_model.pkl'")


Connected to Snowflake successfully!
Mean Squared Error: 220.31429262394195
R-squared (Accuracy for regression): 0.16512314881391077
KNN regressor model saved as 'knn_regressor_model.pkl'


In [43]:
import pandas as pd
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

# Dummy data
dummy_data = {
    'EXPERIENCE': [5],
    'OPTIMIZEDDURATION': [8]
}

# Create a DataFrame from the dummy data
dummy_df = pd.DataFrame(dummy_data)

# Preprocess the dummy DataFrame
# Fill missing values in numeric columns with the mean
numeric_cols = ['EXPERIENCE', 'OPTIMIZEDDURATION']
imputer = SimpleImputer(strategy='mean')
dummy_df[numeric_cols] = imputer.fit_transform(dummy_df[numeric_cols])


# Reorder columns to match the trained model's input
dummy_df = dummy_df.reindex(columns=X.columns, fill_value=0)

# Make predictions using the trained XGBoost regressor
y_pred_dummy = knn.predict(dummy_df)

print("Predicted ASSESSMENT_COMPLETION_TIME_IN_HOURS:", y_pred_dummy)



Predicted ASSESSMENT_COMPLETION_TIME_IN_HOURS: [29.]


