In [None]:
# Import python packages
import streamlit as st
import pandas as pd

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
# Bind the active Snowpark session; later cells use it to run SQL, write a
# table from pandas and upload a file to a stage.
session = get_active_session()


In [None]:
# Pull the whole AIRLINE_ENRICHED table into a pandas DataFrame
query = "SELECT * FROM AIRLINE_ENRICHED"
flightsinfo2 = session.sql(query).to_pandas()

# Descriptive statistics for the numeric columns
summary = flightsinfo2.describe()

# End the cell with the frame itself so the notebook renders it as a table;
# print() falls back to a plain-text dump that elides columns on wide frames.
summary

In [None]:
# Keep only the columns used further down, in this order
data_vizual = flightsinfo2.loc[:, [
    # carrier and airport descriptors
    'AIRLINE', 'AIRLINE_NAME', 'ORG_AIRPORT_NAME', 'ORGIN_CITY',
    'DEST_AIRPORT_NAME', 'DESTINATION_CITY', 'ORIGIN_AIRPORT',
    'DESTINATION_AIRPORT', 'DISTANCE',
    # departure / arrival fields
    'Actual_Departure', 'DATE', 'WEEK',
    'Scheduled_Departure', 'DEPARTURE_DELAY', 'Actual_Arrival',
    'Scheduled_Arrival', 'ARRIVAL_DELAY',
    # durations and ground movement
    'SCHEDULED_TIME', 'ELAPSED_TIME',
    'AIR_TIME', 'TAXI_IN', 'TAXI_OUT', 'WHEELS_OFF', 'SCHEDULED_DEPARTURE',
    'DEPARTURE_TIME', 'TIME_OF_DAY',
]]

In [None]:
# Work on an independent copy so later changes do not touch data_vizual
Flights = data_vizual.copy()

In [None]:
# Discard the columns that are not used as model inputs
Flights1 = Flights.drop(
    columns=[
        'AIRLINE_NAME', 'ORG_AIRPORT_NAME', 'ORGIN_CITY',
        'DEST_AIRPORT_NAME', 'DESTINATION_CITY',
        'DISTANCE',
        'Actual_Departure', 'Scheduled_Departure',
        'Actual_Arrival', 'Scheduled_Arrival',
        'SCHEDULED_TIME', 'ELAPSED_TIME',
        'AIR_TIME', 'TIME_OF_DAY',
    ]
)

In [None]:
import numpy as np

# Binary target: 0 when ARRIVAL_DELAY is zero or negative, 1 otherwise.
# NOTE(review): a missing ARRIVAL_DELAY fails the "<= 0" comparison and is
# therefore labelled 1 — confirm that is the intended treatment.
Flights1['IS_DELAYED'] = np.where(Flights1['ARRIVAL_DELAY'].le(0), 0, 1)

In [None]:
# Inspect the columns left after the drop, plus the added IS_DELAYED label
Flights1.columns

In [None]:
import joblib
from sklearn.preprocessing import LabelEncoder

# Categorical columns that get replaced by integer codes
columns_to_encode = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'WEEK']

# One LabelEncoder per column, each fitted on that column's current values
label_encoders = {
    name: LabelEncoder().fit(Flights1[name]) for name in columns_to_encode
}

# Overwrite every column with its codes and remember the class -> code lookup
mappings = {}
for name, fitted in label_encoders.items():
    Flights1[name] = fitted.transform(Flights1[name])
    mappings[name] = dict(zip(fitted.classes_, fitted.transform(fitted.classes_)))

# Show the lookup learned for each encoded column
print("\nMappings:")
for column, mapping in mappings.items():
    print(f"{column}_mapping: {mapping}\n")
print("-------------------------------------")

In [None]:
import joblib
# Write the dict of fitted per-column LabelEncoders to a local joblib file
joblib.dump(label_encoders, 'label_encoder_complete.joblib')

In [None]:
# Target: the binary delay flag
y = Flights1['IS_DELAYED']
# Features: every column except the raw delay, the flag derived from it, and DATE
X = Flights1.drop(columns=['ARRIVAL_DELAY', 'IS_DELAYED', 'DATE'])
# Evaluated but not rendered: only the cell's last expression is displayed
X.shape
y.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Hold out 30% of the rows as the test split; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

In [None]:
# Count how many training rows fall in each class
y_train.value_counts()

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched.
# random_state pins the synthetic samples so the resampled data, and every
# metric computed from it further down, is repeatable between runs.
smt = SMOTE(random_state=2)
X_train, y_train = smt.fit_resample(X_train, y_train)

In [None]:
import pandas as pd
# Class counts of the training labels as they stand now (after the SMOTE cell when run in order)
pd.Series(y_train).value_counts()

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits
sc1 = StandardScaler().fit(X_train)
X_train_sc = sc1.transform(X_train)
X_test_sc = sc1.transform(X_test)

In [None]:
# Persist the encoded modelling frame as a Snowflake table.
# overwrite=True combined with auto_create_table=True drops and recreates the
# table on every run. Without it a re-run appends to the table left by the
# previous run, whose DATE column has already been replaced by the SQL cells
# below, so the notebook could not be run end to end a second time.
session.write_pandas(
    Flights1,
    "FLIGHTDELAY_MODEL_DATA",
    auto_create_table=True,
    overwrite=True,
)

In [None]:
-- Preview the rows of the table written from the Flights1 DataFrame
SELECT * FROM FLIGHTDELAY_MODEL_DATA

In [None]:
-- Add a TIMESTAMP_NTZ column that will receive the converted DATE values
ALTER TABLE FLIGHTDELAY_MODEL_DATA ADD COLUMN converted_date TIMESTAMP_NTZ;


In [None]:
-- Fill converted_date from the existing DATE column.
-- NOTE(review): dividing by 1e9 presumes DATE holds epoch nanoseconds — confirm.
UPDATE FLIGHTDELAY_MODEL_DATA
SET converted_date = TO_TIMESTAMP_NTZ(date / 1e9);

In [None]:
-- Replace the original DATE column with the converted one, keeping the name DATE
ALTER TABLE FLIGHTDELAY_MODEL_DATA DROP COLUMN DATE;
ALTER TABLE FLIGHTDELAY_MODEL_DATA RENAME COLUMN converted_date TO DATE;


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree split on entropy, with a fixed seed
classifierDT = DecisionTreeClassifier(
    criterion='entropy',
    random_state=5,
)
# Train on the scaled training features
classifierDT.fit(X_train_sc, y_train)

In [None]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
)

# Decision-tree predictions for the scaled test split
y_pred_DT = classifierDT.predict(X_test_sc)

# Confusion matrix and accuracy on the test split
cm = confusion_matrix(y_test, y_pred_DT)
score_DT = classifierDT.score(X_test_sc, y_test)

print(f"Confusion Matrix :\n {cm}")
print(f"Accuracy : {score_DT}")

In [None]:
# Macro-averaged F1, precision and recall for the decision tree
F1_score_DT, Precision_DT, Recall_DT = [
    metric(y_test, y_pred_DT, average="macro")
    for metric in (f1_score, precision_score, recall_score)
]

print("F1 score :",F1_score_DT)
print("Precision Score :" , Precision_DT)
print("Recall Score :" , Recall_DT)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter


def format_func(value, tick_number):
    """Render a tick value as an integer with thousands separators.

    `tick_number` is accepted but not used.
    """
    return f'{int(value):,}'


# Axis names: index 0 -> 'Not Delayed', index 1 -> 'Delayed'
labels = ['Not Delayed', 'Delayed']

# Draw whatever confusion matrix `cm` currently holds
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap=plt.cm.Greens, values_format='d')
plt.title('Confusion Matrix')

# NOTE(review): the explicit tick labels set right after appear to supersede
# these formatters — confirm before relying on format_func for the axes.
ax = plt.gca()
ax.xaxis.set_major_formatter(FuncFormatter(format_func))
ax.yaxis.set_major_formatter(FuncFormatter(format_func))

# Pin both axes to the two class positions and name them
ax.set_xticks([0, 1])
ax.set_xticklabels(labels)
ax.set_yticks([0, 1])
ax.set_yticklabels(labels)

plt.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbour classifier using Euclidean distance
classifier_Knn = KNeighborsClassifier(
    n_neighbors=5,
    algorithm='auto',
    leaf_size=40,        # larger than the library default; may speed things up
    metric='euclidean',
    n_jobs=-1,           # use every available core
)
# Train on the scaled training features
classifier_Knn.fit(X_train_sc,y_train)

In [None]:
from sklearn.metrics import confusion_matrix

# KNN predictions for the scaled test split
y_pred_knn = classifier_Knn.predict(X_test_sc)

# Confusion matrix and accuracy on the test split
cm = confusion_matrix(y_test, y_pred_knn)
score_knn = classifier_Knn.score(X_test_sc, y_test)

print(f"Confusion Matrix :\n {cm}")
print(f"Accuracy : {score_knn}")

In [None]:
# Macro-averaged F1, precision and recall for KNN
F1_score_knn, Precision_knn, Recall_knn = [
    metric(y_test, y_pred_knn, average="macro")
    for metric in (f1_score, precision_score, recall_score)
]

print("F1 score :",F1_score_knn)
print("Precision Score :",Precision_knn)
print("Recall Score :",Recall_knn)

In [None]:
def format_func(value, tick_number):
    """Render a tick value as an integer with thousands separators.

    `tick_number` is accepted but not used.
    """
    return f'{int(value):,}'


# Axis names: index 0 -> 'Not Delayed', index 1 -> 'Delayed'
labels = ['Not Delayed', 'Delayed']

# Draw whatever confusion matrix `cm` currently holds
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap=plt.cm.Greens, values_format='d')
plt.title('Confusion Matrix')

# NOTE(review): the explicit tick labels set right after appear to supersede
# these formatters — confirm before relying on format_func for the axes.
ax = plt.gca()
ax.xaxis.set_major_formatter(FuncFormatter(format_func))
ax.yaxis.set_major_formatter(FuncFormatter(format_func))

# Pin both axes to the two class positions and name them
ax.set_xticks([0, 1])
ax.set_xticklabels(labels)
ax.set_yticks([0, 1])
ax.set_yticklabels(labels)

plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with a fixed random_state; no other arguments are passed
classifier_log = LogisticRegression(random_state = 0)
# Train on the scaled training features
classifier_log.fit(X_train_sc, y_train)

In [None]:
# Logistic-regression predictions for the scaled test split
y_pred_log = classifier_log.predict(X_test_sc)

# Confusion matrix and accuracy on the test split
cm = confusion_matrix(y_test, y_pred_log)
score_log = classifier_log.score(X_test_sc, y_test)

print(f"Confusion Matrix :\n {cm}")
print(f"Accuracy : {score_log}")

In [None]:
# Macro-averaged F1, precision and recall for logistic regression
F1_score_log, Precision_log, Recall_log = [
    metric(y_test, y_pred_log, average="macro")
    for metric in (f1_score, precision_score, recall_score)
]

print("F1 score :",F1_score_log)
print("Precision Score :",Precision_log)
print("Recall Score :",Recall_log)

In [None]:
def format_func(value, tick_number):
    """Render a tick value as an integer with thousands separators.

    `tick_number` is accepted but not used.
    """
    return f'{int(value):,}'


# Axis names: index 0 -> 'Not Delayed', index 1 -> 'Delayed'
labels = ['Not Delayed', 'Delayed']

# Draw whatever confusion matrix `cm` currently holds
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap=plt.cm.Greens, values_format='d')
plt.title('Confusion Matrix')

# NOTE(review): the explicit tick labels set right after appear to supersede
# these formatters — confirm before relying on format_func for the axes.
ax = plt.gca()
ax.xaxis.set_major_formatter(FuncFormatter(format_func))
ax.yaxis.set_major_formatter(FuncFormatter(format_func))

# Pin both axes to the two class positions and name them
ax.set_xticks([0, 1])
ax.set_xticklabels(labels)
ax.set_yticks([0, 1])
ax.set_yticklabels(labels)

plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 100 trees, each capped at depth 10, with a fixed seed
classifier_RF = RandomForestClassifier(
    random_state=0,      # fixed seed for repeatable results
    n_jobs=-1,           # use every available core
    n_estimators=100,    # number of trees in the forest
    max_depth=10,        # depth cap per tree
)
# Train on the scaled training features
classifier_RF.fit(X_train_sc, y_train)

In [None]:
# Random-forest predictions for the scaled test split
y_pred_RF = classifier_RF.predict(X_test_sc)

# Confusion matrix for the random forest
cm = confusion_matrix(y_test, y_pred_RF)
print(f"Confusion Matrix :\n {cm}")

# Accuracy must come from the random forest itself. Scoring classifierDT here
# would copy the decision tree's accuracy into the Random Forest row of the
# comparison table.
score_RF = classifier_RF.score(X_test_sc, y_test)
print(f"Accuracy : {score_RF}")

In [None]:
# Macro-averaged F1, precision and recall for the random forest
F1_score_RF, Precision_RF, Recall_RF = [
    metric(y_test, y_pred_RF, average="macro")
    for metric in (f1_score, precision_score, recall_score)
]

print("F1 score :",F1_score_RF)
print("Precision Score :",Precision_RF)
print("Recall Score :",Recall_RF)

In [None]:
def format_func(value, tick_number):
    """Render a tick value as an integer with thousands separators.

    `tick_number` is accepted but not used.
    """
    return f'{int(value):,}'


# Axis names: index 0 -> 'Not Delayed', index 1 -> 'Delayed'
labels = ['Not Delayed', 'Delayed']

# Draw whatever confusion matrix `cm` currently holds
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap=plt.cm.Greens, values_format='d')
plt.title('Confusion Matrix')

# NOTE(review): the explicit tick labels set right after appear to supersede
# these formatters — confirm before relying on format_func for the axes.
ax = plt.gca()
ax.xaxis.set_major_formatter(FuncFormatter(format_func))
ax.yaxis.set_major_formatter(FuncFormatter(format_func))

# Pin both axes to the two class positions and name them
ax.set_xticks([0, 1])
ax.set_xticklabels(labels)
ax.set_yticks([0, 1])
ax.set_yticklabels(labels)

plt.show()

In [None]:
# Collect each classifier's test-set metrics for side-by-side comparison.
# Every list is ordered: decision tree, KNN, logistic regression, random forest.
cl_metrics = {
    'Accuracy': [score_DT, score_knn, score_log, score_RF],
    'F1-Score': [F1_score_DT, F1_score_knn, F1_score_log, F1_score_RF],
    'Precision': [Precision_DT, Precision_knn, Precision_log, Precision_RF],
    'Recall': [Recall_DT, Recall_knn, Recall_log, Recall_RF],
}

# One row per model; the index order must match the list order above.
# The last label previously read 'RandomForcl_metrics_df', a garbled paste.
cl_metrics_df = pd.DataFrame(
    cl_metrics,
    index=['DecisionTree', 'K-Nearest Neighbor', 'LogisticRegression', 'RandomForest'],
)

In [None]:
# Save the fitted logistic-regression model to a local joblib file.
# NOTE(review): classifier_log was trained on X_train_sc, but the StandardScaler
# (sc1) that produced it is not saved here — confirm it is persisted wherever
# this model is loaded for scoring.
joblib.dump(classifier_log, 'logistic_regression_model.joblib')

In [None]:
-- Create the stage that receives the model file. IF NOT EXISTS keeps the cell
-- re-runnable: a plain CREATE STAGE fails once the stage already exists.
CREATE STAGE IF NOT EXISTS modelstage

In [None]:
# Upload the locally saved model to the stage.
# auto_compress=True (the default, spelled out here) gzips the upload, so the
# staged file is 'logistic_regression_model.joblib.gz'.
# overwrite=True replaces a copy left by an earlier run; with the default
# (False) a re-run would keep the stale file.
session.file.put(
    'logistic_regression_model.joblib',
    "@modelstage",
    auto_compress=True,
    overwrite=True,
)

In [None]:
-- Show the files currently held in the modelstage stage
LIST @modelstage

In [None]:
# Display the model comparison table
cl_metrics_df

In [None]:
import os
import joblib

# Create the target folder first if it is missing (a no-op when it exists)
os.makedirs(name='models', exist_ok=True)

# Write the fitted logistic-regression model into that folder
joblib.dump(classifier_log, filename='models/logistic_regression_model.joblib')
print("Model saved successfully!")


In [None]:
# A stage path ('@modelstage/...') lives inside Snowflake, not on the local
# filesystem, so os.path.exists() would always report False for it. Ask
# Snowflake instead: LIST returns one row per staged file matching the path.
file_path = '@modelstage/logistic_regression_model.joblib.gz'
file_exists = len(session.sql(f"LIST {file_path}").collect()) > 0

if file_exists:
    print("File exists.")
else:
    print("File does not exist.")


In [None]:
# Build a query against the staged model file.
# stage_path already carries the leading '@', so it is interpolated as-is;
# prefixing another '@' produced the invalid reference '@@modelstage/...'.
# This cell only builds the string — nothing executes it here.
# NOTE(review): SELECT over a gzipped joblib file may not parse as tabular
# data; a LIST on the same path is probably the more reliable existence check.
stage_path = '@modelstage/logistic_regression_model.joblib.gz'
query = f"SELECT COUNT(*) FROM {stage_path}"


In [None]:
import os

# Report the kernel's working directory, which is where relative paths such as 'models/' resolve
current_directory = os.getcwd()
print("Current directory:", current_directory)
