<a href="https://colab.research.google.com/github/JJader/api-frontend/blob/main/notebook/modelagem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -U pandas==1.5.3 &> /dev/null
!pip install -q findspark pyspark "mlflow==2.15.1" &> /dev/null

In [None]:
import os
import findspark

In [None]:

# instalar as dependências do spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# !wget  https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz
!tar xf /content/drive/MyDrive/Colab\ Notebooks/spark-3.1.2-bin-hadoop2.7.tgz

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"
findspark.init()

# Sessão

In [None]:
# iniciar uma sessão local
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, max, count, date_format, stddev, expr, first, regexp_replace

#sc = SparkSession.builder.master('local[*]').getOrCreate()
spark = SparkSession.builder.appName("Introducao").getOrCreate()
spark

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, TimestampType, DateType
from pyspark.ml.feature import StringIndexer


df_spark = spark.read.csv("/content/drive/MyDrive/picpay/df_with_date_coords.csv", header=True)
df_spark = df_spark.na.drop()
df_select_columns = df_spark.select([
    "dep_delay",
    "origin",
    "dest",
    "distance",
    "arr_delay",
    "month",
])

In [None]:
df = df_select_columns.toPandas()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import numpy as np

# Separar as variáveis independentes (X) e a variável dependente (y)
X = df[['dep_delay', 'origin', 'dest', 'distance', 'month']].reset_index(drop=True)
y = df['arr_delay'].reset_index(drop=True)

# Dividir os dados em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Definir transformações para cada tipo de variável
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['dep_delay', 'distance', 'month']),
        ('cat', OrdinalEncoder(), ['origin', 'dest'])
    ])

# Criar o pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Treinar o modelo
pipeline.fit(X_train, y_train)

# Fazer previsões
y_pred = pipeline.predict(X_test)

# Avaliar o modelo
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R^2 Score: {r2}')

# Salvar o modelo em um arquivo .pkl
joblib.dump(pipeline, '/content/drive/MyDrive/picpay/modelo_linear_regression.pkl')


Mean Squared Error (MSE): 323.7615378265965
Root Mean Squared Error (RMSE): 17.993374831492744
Mean Absolute Error (MAE): 13.084226484251568
R^2 Score: 0.8422864107566785


['/content/drive/MyDrive/picpay/modelo_linear_regression.pkl']

In [None]:
data = {
    'dep_delay': [15.0],
    'origin': ['JFK'],
    'dest': ['LAX'],
    'distance': [2475.0],
    'month': [8]
}

# Criar um DataFrame a partir dos dados
input_df = pd.DataFrame(data)
pipeline.predict(input_df)

array([6.0394353])